diff --git a/sitedata/papers.json b/sitedata/papers.json index eee8ad64e..e94759a23 100644 --- a/sitedata/papers.json +++ b/sitedata/papers.json @@ -1 +1 @@ -{"rkgt0REKwS": {"content": {"appendix": "", "TL;DR": "A novel loss bridges curriculum learning and robust learning", "keywords": ["curriculum learning", "expressive power", "generalization", "robust learning", "robustness"], "paperhash": "lyu|curriculum_loss_robust_learning_and_generalization_against_label_corruption", "spotlight_video": "", "poster": "", "slides": "", "abstract": "Deep neural networks (DNNs) have great expressive power, which can even memorize samples with wrong labels. It is vitally important to reiterate robustness and generalization in DNNs against label corruption. To this end, this paper studies the 0-1 loss, which has a monotonic relationship between empirical adversary (reweighted) risk (Hu et al. 2018). Although the 0-1 loss is robust to outliers, it is also difficult to optimize. To efficiently optimize the 0-1 loss while keeping its robust properties, we propose a very simple and efficient loss, i.e. curriculum loss (CL). Our CL is a tighter upper bound of the 0-1 loss compared with conventional summation based surrogate losses. Moreover, CL can adaptively select samples for stagewise training. As a result, our loss can be deemed as a novel perspective of curriculum sample selection strategy, which bridges a connection between curriculum learning and robust learning. Experimental results on noisy MNIST, CIFAR10 and CIFAR100 dataset validate the robustness of the proposed loss.", "_bibtex": "@inproceedings{\nLyu2020Curriculum,\ntitle={Curriculum Loss: Robust Learning and Generalization against Label Corruption},\nauthor={Yueming Lyu and Ivor W. Tsang},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=rkgt0REKwS}\n}", "authorids": ["lv_yueming@outlook.com", "ivor.tsang@uts.edu.au"], "title": "Curriculum Loss: Robust Learning and Generalization against Label Corruption", "authors": ["Yueming Lyu", "Ivor W. Tsang"], "original_pdf": "/attachment/8f92c68c81714ce3ea0423465464268d22752db5.pdf", "pdf": "/pdf/800966a39eedb870f2172779e97ac57edbff69e5.pdf", "full_presentation_video": ""}, "forum": "rkgt0REKwS", "id": "rkgt0REKwS"}, "rJeB36NKvB": {"content": {"appendix": "", "TL;DR": "Our work shows positional information has been implicitly encoded in a network. This information is important for detecting position-dependent features, e.g. semantic and saliency.", "keywords": ["cnn"], "paperhash": "islam|how_much_position_information_do_convolutional_neural_networks_encode", "spotlight_video": "", "poster": "", "slides": "", "abstract": "In contrast to fully connected networks, Convolutional Neural Networks (CNNs) achieve efficiency by learning weights associated with local filters with a finite spatial extent. An implication of this is that a filter may know what it is looking at, but not where it is positioned in the image. Information concerning absolute position is inherently useful, and it is reasonable to assume that deep CNNs may implicitly learn to encode this information if there is a means to do so. In this paper, we test this hypothesis revealing the surprising degree of absolute position information that is encoded in commonly used neural networks. 
A comprehensive set of experiments show the validity of this hypothesis and shed light on how and where this information is represented while offering clues to where positional information is derived from in deep CNNs.", "_bibtex": "@inproceedings{\nIslam*2020How,\ntitle={How much Position Information Do Convolutional Neural Networks Encode?},\nauthor={Md Amirul Islam* and Sen Jia* and Neil D. B. Bruce},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=rJeB36NKvB}\n}", "authorids": ["amirul@scs.ryerson.ca", "sen.jia@ryerson.ca", "bruce@ryerson.ca"], "title": "How much Position Information Do Convolutional Neural Networks Encode?", "authors": ["Md Amirul Islam*", "Sen Jia*", "Neil D. B. Bruce"], "original_pdf": "/attachment/2584bac4c5038bef361ca3b8a96152ac8e05c7db.pdf", "pdf": "/pdf/2267055f8221e283014aba7ef46092ba93ff450f.pdf", "full_presentation_video": ""}, "forum": "rJeB36NKvB", "id": "rJeB36NKvB"}, "rkl8dlHYvB": {"content": {"appendix": "", "TL;DR": "A zero-shot segmentation framework for 3D object part segmentation. Model the segmentation as a decision-making process and solve as a contextual bandit problem.", "keywords": ["zero shot learning"], "paperhash": "luo|learning_to_group_a_bottomup_framework_for_3d_part_discovery_in_unseen_categories", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Tiange Luo", "Kaichun Mo", "Zhiao Huang", "Jiarui Xu", "Siyu Hu", "Liwei Wang", "Hao Su"], "_bibtex": "@inproceedings{\nLuo2020Learning,\ntitle={Learning to Group: A Bottom-Up Framework for 3D Part Discovery in Unseen Categories},\nauthor={Tiange Luo and Kaichun Mo and Zhiao Huang and Jiarui Xu and Siyu Hu and Liwei Wang and Hao Su},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=rkl8dlHYvB}\n}", "authorids": ["luotg@pku.edu.cn", "kaichun@cs.stanford.edu", "z2huang@eng.ucsd.edu", "jxuat@connect.ust.hk", "sy89128@mail.ustc.edu.cn", "wanglw@cis.pku.edu.cn", "haosu@eng.ucsd.edu"], "title": "Learning to Group: A Bottom-Up Framework for 3D Part Discovery in Unseen Categories", "original_pdf": "/attachment/436a0d110d9df4c3acad7f7fb8193a0ad5f16e94.pdf", "pdf": "/pdf/671a399fc30f738f3307611ff2ad88acbb64b12a.pdf", "abstract": "We address the problem of learning to discover 3D parts for objects in unseen categories. Being able to learn the geometry prior of parts and transfer this prior to unseen categories pose fundamental challenges on data-driven shape segmentation approaches. Formulated as a contextual bandit problem, we propose a learning-based iterative grouping framework which learns a grouping policy to progressively merge small part proposals into bigger ones in a bottom-up fashion. At the core of our approach is to restrict the local context for extracting part-level features, which encourages the generalizability to novel categories. On a recently proposed large-scale fine-grained 3D part dataset, PartNet, we demonstrate that our method can transfer knowledge of parts learned from 3 training categories to 21 unseen testing categories without seeing any annotated samples. 
Quantitative comparisons against four strong shape segmentation baselines show that we achieve the state-of-the-art performance.", "full_presentation_video": ""}, "forum": "rkl8dlHYvB", "id": "rkl8dlHYvB"}, "HyxY6JHKwr": {"content": {"appendix": "", "TL;DR": "A method to train a single model simultaneously minimizing a family of loss functions instead of training a set of per-loss models.", "keywords": ["compression", "generation", "image generation"], "paperhash": "dosovitskiy|you_only_train_once_lossconditional_training_of_deep_networks", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Alexey Dosovitskiy", "Josip Djolonga"], "_bibtex": "@inproceedings{\nDosovitskiy2020You,\ntitle={You Only Train Once: Loss-Conditional Training of Deep Networks},\nauthor={Alexey Dosovitskiy and Josip Djolonga},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=HyxY6JHKwr}\n}", "authorids": ["adosovitskiy@gmail.com", "josip@djolonga.com"], "title": "You Only Train Once: Loss-Conditional Training of Deep Networks", "original_pdf": "/attachment/b2e8b65d0c00c3e75ffed4aa5e015f15701fe89c.pdf", "pdf": "/pdf/4b71d0b19e942323e80951bf63fb2ed630f565bb.pdf", "abstract": "In many machine learning problems, loss functions are weighted sums of several terms. A typical approach to dealing with these is to train multiple separate models with different selections of weights and then either choose the best one according to some criterion or keep multiple models if it is desirable to maintain a diverse set of solutions. This is inefficient both at training and at inference time. We propose a method that allows replacing multiple models trained on one loss function each by a single model trained on a distribution of losses. At test time a model trained this way can be conditioned to generate outputs corresponding to any loss from the training distribution of losses. 
We demonstrate this approach on three tasks with parametrized losses: beta-VAE, learned image compression, and fast style transfer.", "full_presentation_video": ""}, "forum": "HyxY6JHKwr", "id": "HyxY6JHKwr"}, "rJxycxHKDS": {"content": {"appendix": "", "TL;DR": "A Multiflow Network is a dynamic architecture for domain adaptation that learns potentially different computational graphs per domain, so as to map them to a common representation where inference can be performed in a domain-agnostic fashion.", "keywords": ["computer vision", "domain adaptation", "unsupervised"], "paperhash": "berm\u00fadezchac\u00f3n|domain_adaptive_multibranch_networks", "spotlight_video": "", "poster": "", "slides": "", "authors": ["R\u00f3ger Berm\u00fadez-Chac\u00f3n", "Mathieu Salzmann", "Pascal Fua"], "_bibtex": "@inproceedings{\nBerm\u00fadez-Chac\u00f3n2020Domain,\ntitle={Domain Adaptive Multibranch Networks},\nauthor={R\u00f3ger Berm\u00fadez-Chac\u00f3n and Mathieu Salzmann and Pascal Fua},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=rJxycxHKDS}\n}", "authorids": ["roger.bermudez@epfl.ch", "mathieu.salzmann@epfl.ch", "pascal.fua@epfl.ch"], "title": "Domain Adaptive Multibranch Networks", "original_pdf": "/attachment/7a5cb9eb4fa9d872a9eb34be33bc3b4a08e85d12.pdf", "pdf": "/pdf/2590950d8db05c10ce655a6b0f2a6e404611b61e.pdf", "abstract": "We tackle unsupervised domain adaptation by accounting for the fact that different domains may need to be processed differently to arrive to a common feature representation effective for recognition. To this end, we introduce a deep learning framework where each domain undergoes a different sequence of operations, allowing some, possibly more complex, domains to go through more computations than others.\nThis contrasts with state-of-the-art domain adaptation techniques that force all domains to be processed with the same series of operations, even when using multi-stream architectures whose parameters are not shared.\nAs evidenced by our experiments, the greater flexibility of our method translates to higher accuracy. 
Furthermore, it allows us to handle any number of domains simultaneously.", "full_presentation_video": ""}, "forum": "rJxycxHKDS", "id": "rJxycxHKDS"}, "HJgLLyrYwB": {"content": {"appendix": "", "TL;DR": "Algorithm for imitation with state-only expert demonstrations; builds on adversarial-IRL; experiments with transition dynamics mismatch b/w expert and imitator", "keywords": ["adversarial", "imitation learning", "inverse reinforcement learning", "optimization", "reinforcement learning"], "paperhash": "gangwani|stateonly_imitation_with_transition_dynamics_mismatch", "code": "https://github.com/tgangwani/RL-Indirect-imitation", "spotlight_video": "", "authorids": ["gangwan2@illinois.edu", "jianpeng@illinois.edu"], "poster": "", "slides": "", "authors": ["Tanmay Gangwani", "Jian Peng"], "_bibtex": "@inproceedings{\nGangwani2020State-only,\ntitle={State-only Imitation with Transition Dynamics Mismatch},\nauthor={Tanmay Gangwani and Jian Peng},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=HJgLLyrYwB}\n}", "original_pdf": "/attachment/0b912ccf40f0114c22564b796056a983ef7e44a2.pdf", "title": "State-only Imitation with Transition Dynamics Mismatch", "pdf": "/pdf/d0b1a39c252097c699c922036af72b5546249ea5.pdf", "abstract": "Imitation Learning (IL) is a popular paradigm for training agents to achieve complicated goals by leveraging expert behavior, rather than dealing with the hardships of designing a correct reward function. With the environment modeled as a Markov Decision Process (MDP), most of the existing IL algorithms are contingent on the availability of expert demonstrations in the same MDP as the one in which a new imitator policy is to be learned. This is uncharacteristic of many real-life scenarios where discrepancies between the expert and the imitator MDPs are common, especially in the transition dynamics function. Furthermore, obtaining expert actions may be costly or infeasible, making the recent trend towards state-only IL (where expert demonstrations constitute only states or observations) ever so promising. Building on recent adversarial imitation approaches that are motivated by the idea of divergence minimization, we present a new state-only IL algorithm in this paper. It divides the overall optimization objective into two subproblems by introducing an indirection step and solves the subproblems iteratively. We show that our algorithm is particularly effective when there is a transition dynamics mismatch between the expert and imitator MDPs, while the baseline IL methods suffer from performance degradation. 
To analyze this, we construct several interesting MDPs by modifying the configuration parameters for the MuJoCo locomotion tasks from OpenAI Gym.", "full_presentation_video": ""}, "forum": "HJgLLyrYwB", "id": "HJgLLyrYwB"}, "BkxSmlBFvr": {"content": {"appendix": "", "TL;DR": "We study the impact of training strategies on the performance of knowledge graph embeddings.", "keywords": ["graph embedding", "knowledge graph embeddings", "optimization"], "paperhash": "ruffinelli|you_can_teach_an_old_dog_new_tricks_on_training_knowledge_graph_embeddings", "code": "https://github.com/uma-pi1/kge", "spotlight_video": "", "authorids": ["daniel@informatik.uni-mannheim.de", "broscheit@informatik.uni-mannheim.de", "rgemulla@uni-mannheim.de"], "poster": "", "slides": "", "authors": ["Daniel Ruffinelli", "Samuel Broscheit", "Rainer Gemulla"], "_bibtex": "@inproceedings{\nRuffinelli2020You,\ntitle={You CAN Teach an Old Dog New Tricks! On Training Knowledge Graph Embeddings},\nauthor={Daniel Ruffinelli and Samuel Broscheit and Rainer Gemulla},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=BkxSmlBFvr}\n}", "original_pdf": "/attachment/e06abd255af19418802e92d7cdab9d6e8f05be89.pdf", "title": "You CAN Teach an Old Dog New Tricks! On Training Knowledge Graph Embeddings", "pdf": "/pdf/d8532341877a4ce6e4fee643e629af2957579771.pdf", "abstract": "Knowledge graph embedding (KGE) models learn algebraic representations of the entities and relations in a knowledge graph. A vast number of KGE techniques for multi-relational link prediction have been proposed in the recent literature, often with state-of-the-art performance. These approaches differ along a number of dimensions, including different model architectures, different training strategies, and different approaches to hyperparameter optimization. In this paper, we take a step back and aim to summarize and quantify empirically the impact of each of these dimensions on model performance. We report on the results of an extensive experimental study with popular model architectures and training strategies across a wide range of hyperparameter settings. We found that when trained appropriately, the relative performance differences between various model architectures often shrinks and sometimes even reverses when compared to prior results. For example, RESCAL~\\citep{nickel2011three}, one of the first KGE models, showed strong performance when trained with state-of-the-art techniques; it was competitive to or outperformed more recent architectures. We also found that good (and often superior to prior studies) model configurations can be found by exploring relatively few random samples from a large hyperparameter space. Our results suggest that many of the more advanced architectures and techniques proposed in the literature should be revisited to reassess their individual benefits. 
To foster further reproducible research, we provide all our implementations and experimental results as part of the open source LibKGE framework.", "full_presentation_video": ""}, "forum": "BkxSmlBFvr", "id": "BkxSmlBFvr"}, "Syx79eBKwr": {"content": {"appendix": "", "keywords": ["computer vision", "mutual information", "nlp", "representation learning", "word embedding"], "paperhash": "kong|a_mutual_information_maximization_perspective_of_language_representation_learning", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Lingpeng Kong", "Cyprien de Masson d'Autume", "Lei Yu", "Wang Ling", "Zihang Dai", "Dani Yogatama"], "_bibtex": "@inproceedings{\nKong2020A,\ntitle={A Mutual Information Maximization Perspective of Language Representation Learning},\nauthor={Lingpeng Kong and Cyprien de Masson d'Autume and Lei Yu and Wang Ling and Zihang Dai and Dani Yogatama},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=Syx79eBKwr}\n}", "authorids": ["lingpenk@google.com", "cyprien@google.com", "leiyu@google.com", "lingwang@google.com", "zihangd@google.com", "dyogatama@google.com"], "title": "A Mutual Information Maximization Perspective of Language Representation Learning", "original_pdf": "/attachment/b76d744edf0d66b50fe0756cefd4c645652084c2.pdf", "pdf": "/pdf/eb7c97ac3d4465bf40ae5911244e1fba78fd3314.pdf", "abstract": "We show state-of-the-art word representation learning methods maximize an objective function that is a lower bound on the mutual information between different parts of a word sequence (i.e., a sentence). Our formulation provides an alternative perspective that unifies classical word embedding models (e.g., Skip-gram) and modern contextual embeddings (e.g., BERT, XLNet). In addition to enhancing our theoretical understanding of these methods, our derivation leads to a principled framework that can be used to construct new self-supervised tasks. We provide an example by drawing inspirations from related methods based on mutual information maximization that have been successful in computer vision, and introduce a simple self-supervised objective that maximizes the mutual information between a global sentence representation and n-grams in the sentence. Our analysis offers a holistic view of representation learning methods to transfer knowledge and translate progress across multiple domains (e.g., natural language processing, computer vision, audio processing).", "full_presentation_video": ""}, "forum": "Syx79eBKwr", "id": "Syx79eBKwr"}, "SyxrxR4KPS": {"content": {"appendix": "", "TL;DR": "We built a physical simulation of a rodent, trained it to solve a set of tasks, and analyzed the resulting networks.", "keywords": ["computational neuroscience", "reinforcement learning"], "paperhash": "merel|deep_neuroethology_of_a_virtual_rodent", "spotlight_video": "", "poster": "", "slides": "", "abstract": "Parallel developments in neuroscience and deep learning have led to mutually productive exchanges, pushing our understanding of real and artificial neural networks in sensory and cognitive systems. However, this interaction between fields is less developed in the study of motor control. In this work, we develop a virtual rodent as a platform for the grounded study of motor activity in artificial models of embodied control. We then use this platform to study motor activity across contexts by training a model to solve four complex tasks. 
Using methods familiar to neuroscientists, we describe the behavioral representations and algorithms employed by different layers of the network using a neuroethological approach to characterize motor activity relative to the rodent's behavior and goals. We find that the model uses two classes of representations which respectively encode the task-specific behavioral strategies and task-invariant behavioral kinematics. These representations are reflected in the sequential activity and population dynamics of neural subpopulations. Overall, the virtual rodent facilitates grounded collaborations between deep reinforcement learning and motor neuroscience.", "_bibtex": "@inproceedings{\nMerel2020Deep,\ntitle={Deep neuroethology of a virtual rodent},\nauthor={Josh Merel and Diego Aldarondo and Jesse Marshall and Yuval Tassa and Greg Wayne and Bence Olveczky},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=SyxrxR4KPS}\n}", "authorids": ["jsmerel@google.com", "diegoaldarondo@g.harvard.edu", "jesse_d_marshall@fas.harvard.edu", "tassa@google.com", "gregwayne@google.com", "olveczky@fas.harvard.edu"], "title": "Deep neuroethology of a virtual rodent", "authors": ["Josh Merel", "Diego Aldarondo", "Jesse Marshall", "Yuval Tassa", "Greg Wayne", "Bence Olveczky"], "original_pdf": "/attachment/cdac87d51f32786d0a3af853d5e0bfa125d8eef0.pdf", "pdf": "/pdf/31957327a576d1964b20f9e1881af36e76d59d6e.pdf", "full_presentation_video": ""}, "forum": "SyxrxR4KPS", "id": "SyxrxR4KPS"}, "HylsTT4FvB": {"content": {"appendix": "", "TL;DR": "Interpolations in the latent space demonstrate generalization capacity of GANs and the effect of dataset biases.", "keywords": ["adversarial", "capacity", "gan", "generalization", "generative models"], "paperhash": "jahanian|on_the_steerability_of_generative_adversarial_networks", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Ali Jahanian*", "Lucy Chai*", "Phillip Isola"], "_bibtex": "@inproceedings{\nJahanian*2020On,\ntitle={On the \"steerability\" of generative adversarial networks},\nauthor={Ali Jahanian* and Lucy Chai* and Phillip Isola},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=HylsTT4FvB}\n}", "authorids": ["jahanian@mit.edu", "lrchai@mit.edu", "phillipi@mit.edu"], "title": "On the \"steerability\" of generative adversarial networks", "original_pdf": "/attachment/85b25c85f7c2b0782dd6604116211a14fb7a12ed.pdf", "pdf": "/pdf/bc1679a28ae669c26f89b4d9f7d420faad2e1712.pdf", "abstract": "An open secret in contemporary machine learning is that many models work beautifully on standard benchmarks but fail to generalize outside the lab. This has been attributed to biased training data, which provide poor coverage over real world events. Generative models are no exception, but recent advances in generative adversarial networks (GANs) suggest otherwise -- these models can now synthesize strikingly realistic and diverse images. Is generative modeling of photos a solved problem? We show that although current GANs can fit standard datasets very well, they still fall short of being comprehensive models of the visual manifold. In particular, we study their ability to fit simple transformations such as camera movements and color changes. 
We find that the models reflect the biases of the datasets on which they are trained (e.g., centered objects), but that they also exhibit some capacity for generalization: by \"steering\" in latent space, we can shift the distribution while still creating realistic images. We hypothesize that the degree of distributional shift is related to the breadth of the training data distribution. Thus, we conduct experiments to quantify the limits of GAN transformations and introduce techniques to mitigate the problem. Code is released on our project page: https://ali-design.github.io/gan_steerability/", "full_presentation_video": ""}, "forum": "HylsTT4FvB", "id": "HylsTT4FvB"}, "BkglSTNFDB": {"content": {"appendix": "", "TL;DR": "We adapt Q-learning with UCB-exploration bonus to infinite-horizon MDP with discounted rewards without accessing a generative model, and improves the previously best known result.", "keywords": ["generative models", "reinforcement learning"], "paperhash": "wang|qlearning_with_ucb_exploration_is_sample_efficient_for_infinitehorizon_mdp", "spotlight_video": "", "poster": "", "slides": "", "abstract": "A fundamental question in reinforcement learning is whether model-free algorithms are sample efficient. Recently, Jin et al. (2018) proposed a Q-learning algorithm with UCB exploration policy, and proved it has nearly optimal regret bound for finite-horizon episodic MDP. In this paper, we adapt Q-learning with UCB-exploration bonus to infinite-horizon MDP with discounted rewards \\emph{without} accessing a generative model. We show that the \\textit{sample complexity of exploration} of our algorithm is bounded by $\\tilde{O}({\\frac{SA}{\\epsilon^2(1-\\gamma)^7}})$. This improves the previously best known result of $\\tilde{O}({\\frac{SA}{\\epsilon^4(1-\\gamma)^8}})$ in this setting achieved by delayed Q-learning (Strehlet al., 2006),, and matches the lower bound in terms of $\\epsilon$ as well as $S$ and $A$ up to logarithmic factors.", "_bibtex": "@inproceedings{\nWang2020Q-learning,\ntitle={Q-learning with UCB Exploration is Sample Efficient for Infinite-Horizon MDP},\nauthor={Yuanhao Wang and Kefan Dong and Xiaoyu Chen and Liwei Wang},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=BkglSTNFDB}\n}", "authorids": ["yuanhao-16@mails.tsinghua.edu.cn", "dkf16@mails.tsinghua.edu.cn", "cxy30@pku.edu.cn", "wanglw@cis.pku.edu.cn"], "title": "Q-learning with UCB Exploration is Sample Efficient for Infinite-Horizon MDP", "authors": ["Yuanhao Wang", "Kefan Dong", "Xiaoyu Chen", "Liwei Wang"], "original_pdf": "/attachment/f18e675c2d09159773e2a027f9c388e9156c4d70.pdf", "pdf": "/pdf/3a261a8730f71c072aeea18bbd7d9a757ebc725d.pdf", "full_presentation_video": ""}, "forum": "BkglSTNFDB", "id": "BkglSTNFDB"}, "rJeW1yHYwH": {"content": {"appendix": "", "keywords": ["attention", "graph embedding", "representation learning", "self attention"], "paperhash": "xu|inductive_representation_learning_on_temporal_graphs", "code": "https://drive.google.com/drive/folders/1GaH8vusCXJj4ucayfO-PyHpnNsJRkB78?usp=sharing", "spotlight_video": "", "poster": "", "slides": "", "abstract": "Inductive representation learning on temporal graphs is an important step toward salable machine learning on real-world dynamic networks. The evolving nature of temporal dynamic graphs requires handling new nodes as well as capturing temporal patterns. 
The node embeddings, which are now functions of time, should represent both the static node features and the evolving topological structures. Moreover, node and topological features can be temporal as well, whose patterns the node embeddings should also capture. We propose the temporal graph attention (TGAT) layer to efficiently aggregate temporal-topological neighborhood features to learn the time-feature interactions. For TGAT, we use the self-attention mechanism as building block and develop a novel functional time encoding technique based on the classical Bochner's theorem from harmonic analysis. By stacking TGAT layers, the network recognizes the node embeddings as functions of time and is able to inductively infer embeddings for both new and observed nodes as the graph evolves. The proposed approach handles both node classification and link prediction task, and can be naturally extended to include the temporal edge features. We evaluate our method with transductive and inductive tasks under temporal settings with two benchmark and one industrial dataset. Our TGAT model compares favorably to state-of-the-art baselines as well as the previous temporal graph embedding approaches.", "_bibtex": "@inproceedings{\nXu2020Inductive,\ntitle={Inductive representation learning on temporal graphs},\nauthor={da Xu and chuanwei ruan and evren korpeoglu and sushant kumar and kannan achan},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=rJeW1yHYwH}\n}", "authorids": ["da.xu@walmartlabs.com", "ruanchuanwei@gmail.com", "ekorpeoglu@walmart.com", "skumar4@walmartlabs.com", "kachan@walmartlabs.com"], "title": "Inductive representation learning on temporal graphs", "authors": ["da Xu", "chuanwei ruan", "evren korpeoglu", "sushant kumar", "kannan achan"], "original_pdf": "/attachment/ffe516c2a82926ac3057fb5673bdea3dc553205d.pdf", "pdf": "/pdf/35f0e0e0b42200e2c21ced03637e8b30f6e6b6fc.pdf", "full_presentation_video": ""}, "forum": "rJeW1yHYwH", "id": "rJeW1yHYwH"}, "ByeUBANtvB": {"content": {"appendix": "", "TL;DR": "Perturbations can be used to train feedback weights to learn in fully connected and convolutional neural networks", "keywords": ["cnn", "feedback alignment", "perturbation", "reinforcement learning"], "paperhash": "lansdell|learning_to_solve_the_credit_assignment_problem", "code": "https://github.com/benlansdell/synthfeedback", "spotlight_video": "", "authorids": ["ben.lansdell@gmail.com", "prprak@seas.upenn.edu", "koerding@gmail.com"], "poster": "", "slides": "", "authors": ["Benjamin James Lansdell", "Prashanth Ravi Prakash", "Konrad Paul Kording"], "_bibtex": "@inproceedings{\nLansdell2020Learning,\ntitle={Learning to solve the credit assignment problem},\nauthor={Benjamin James Lansdell and Prashanth Ravi Prakash and Konrad Paul Kording},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=ByeUBANtvB}\n}", "original_pdf": "/attachment/f471af050add2bbd4e67d136fcdb65accf30e974.pdf", "title": "Learning to solve the credit assignment problem", "pdf": "/pdf/3b76872cdc5c2763ccdfeb90e7f14298c71aea05.pdf", "abstract": "Backpropagation is driving today's artificial neural networks (ANNs). However, despite extensive research, it remains unclear if the brain implements this algorithm. 
Among neuroscientists, reinforcement learning (RL) algorithms are often seen as a realistic alternative: neurons can randomly introduce change, and use unspecific feedback signals to observe their effect on the cost and thus approximate their gradient. However, the convergence rate of such learning scales poorly with the number of involved neurons. Here we propose a hybrid learning approach. Each neuron uses an RL-type strategy to learn how to approximate the gradients that backpropagation would provide. We provide proof that our approach converges to the true gradient for certain classes of networks. In both feedforward and convolutional networks, we empirically show that our approach learns to approximate the gradient, and can match the performance of gradient-based learning. Learning feedback weights provides a biologically plausible mechanism of achieving good performance, without the need for precise, pre-specified learning rules.", "full_presentation_video": ""}, "forum": "ByeUBANtvB", "id": "ByeUBANtvB"}, "r1lGO0EKDH": {"content": {"appendix": "", "TL;DR": "A multi-level spectral approach to improving the quality and scalability of unsupervised graph embedding.", "keywords": ["graph embedding", "memory", "optimization", "scalability", "unsupervised"], "paperhash": "deng|graphzoom_a_multilevel_spectral_approach_for_accurate_and_scalable_graph_embedding", "code": "https://github.com/cornell-zhang/GraphZoom", "spotlight_video": "", "authorids": ["cd574@cornell.edu", "qzzhao@mtu.edu", "yongyuw@mtu.edu", "zhiruz@cornell.edu", "zfeng12@stevens.edu"], "poster": "", "slides": "", "authors": ["Chenhui Deng", "Zhiqiang Zhao", "Yongyu Wang", "Zhiru Zhang", "Zhuo Feng"], "_bibtex": "@inproceedings{\nDeng2020GraphZoom:,\ntitle={GraphZoom: A Multi-level Spectral Approach for Accurate and Scalable Graph Embedding},\nauthor={Chenhui Deng and Zhiqiang Zhao and Yongyu Wang and Zhiru Zhang and Zhuo Feng},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=r1lGO0EKDH}\n}", "original_pdf": "/attachment/1bfd85d0bdeadb891b6b86c8ad5b5c5edb06f5a5.pdf", "title": "GraphZoom: A Multi-level Spectral Approach for Accurate and Scalable Graph Embedding", "pdf": "/pdf/ba876dacf8c1a71726795072471113740912f739.pdf", "abstract": "Graph embedding techniques have been increasingly deployed in a multitude of different applications that involve learning on non-Euclidean data. However, existing graph embedding models either fail to incorporate node attribute information during training or suffer from node attribute noise, which compromises the accuracy. Moreover, very few of them scale to large graphs due to their high computational complexity and memory usage. In this paper we propose GraphZoom, a multi-level framework for improving both accuracy and scalability of unsupervised graph embedding algorithms. GraphZoom first performs graph fusion to generate a new graph that effectively encodes the topology of the original graph and the node attribute information. This fused graph is then repeatedly coarsened into much smaller graphs by merging nodes with high spectral similarities. GraphZoom allows any existing embedding methods to be applied to the coarsened graph, before it progressively refine the embeddings obtained at the coarsest level to increasingly finer graphs. We have evaluated our approach on a number of popular graph datasets for both transductive and inductive tasks. 
Our experiments show that GraphZoom can substantially increase the classification accuracy and significantly accelerate the entire graph embedding process by up to $40.8 \\times$, when compared to the state-of-the-art unsupervised embedding methods. ", "full_presentation_video": ""}, "forum": "r1lGO0EKDH", "id": "r1lGO0EKDH"}, "ryxWIgBFPS": {"content": {"appendix": "", "TL;DR": "This paper proposes a meta-learning objective based on speed of adaptation to transfer distributions to discover a modular decomposition and causal variables.", "keywords": ["causality", "meta learning", "structure learning", "transfer learning"], "paperhash": "bengio|a_metatransfer_objective_for_learning_to_disentangle_causal_mechanisms", "code": "https://github.com/ec6dde01667145e58de60f864e05a4/CausalOptimizationAnon", "spotlight_video": "", "authorids": ["yoshua.bengio@mila.quebec", "tristan.deleu@gmail.com", "nasim.rahaman@tuebingen.mpg.de", "rosemary.nan.ke@gmail.com", "sebastien.lachapelle@umontreal.ca", "obilaniu@gmail.com", "anirudhgoyal9119@gmail.com", "chris.j.pal@gmail.com"], "poster": "", "slides": "", "authors": ["Yoshua Bengio", "Tristan Deleu", "Nasim Rahaman", "Nan Rosemary Ke", "Sebastien Lachapelle", "Olexa Bilaniuk", "Anirudh Goyal", "Christopher Pal"], "_bibtex": "@inproceedings{\nBengio2020A,\ntitle={A Meta-Transfer Objective for Learning to Disentangle Causal Mechanisms},\nauthor={Yoshua Bengio and Tristan Deleu and Nasim Rahaman and Nan Rosemary Ke and Sebastien Lachapelle and Olexa Bilaniuk and Anirudh Goyal and Christopher Pal},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=ryxWIgBFPS}\n}", "original_pdf": "/attachment/9c43debd88b8a2ed82b1428ae0d9920f28fd5e08.pdf", "title": "A Meta-Transfer Objective for Learning to Disentangle Causal Mechanisms", "pdf": "/pdf/51b7d9328681c393a1b9a0f6762e206582064fb8.pdf", "abstract": "We propose to use a meta-learning objective that maximizes the speed of transfer on a modified distribution to learn how to modularize acquired knowledge. In particular, we focus on how to factor a joint distribution into appropriate conditionals, consistent with the causal directions. We explain when this can work, using the assumption that the changes in distributions are localized (e.g. to one of the marginals, for example due to an intervention on one of the variables). We prove that under this assumption of localized changes in causal mechanisms, the correct causal graph will tend to have only a few of its parameters with non-zero gradient, i.e. that need to be adapted (those of the modified variables). We argue and observe experimentally that this leads to faster adaptation, and use this property to define a meta-learning surrogate score which, in addition to a continuous parametrization of graphs, would favour correct causal graphs. Finally, motivated by the AI agent point of view (e.g. of a robot discovering its environment autonomously), we consider how the same objective can discover the causal variables themselves, as a transformation of observed low-level variables with no causal meaning. 
Experiments in the two-variable case validate the proposed ideas and theoretical results.", "full_presentation_video": ""}, "forum": "ryxWIgBFPS", "id": "ryxWIgBFPS"}, "HklxbgBKvr": {"content": {"appendix": "", "TL;DR": "We augment model-free policy learning with a sequence-level surrogate reward functions and count-based visitation bonus and demonstrate effectiveness in the large batch, low-round regime seen in designing DNA and protein sequences.", "keywords": ["capacity", "model based reinforcement learning", "optimization", "reinforcement learning", "sample efficiency"], "paperhash": "angermueller|modelbased_reinforcement_learning_for_biological_sequence_design", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Christof Angermueller", "David Dohan", "David Belanger", "Ramya Deshpande", "Kevin Murphy", "Lucy Colwell"], "_bibtex": "@inproceedings{\nAngermueller2020Model-based,\ntitle={Model-based reinforcement learning for biological sequence design},\nauthor={Christof Angermueller and David Dohan and David Belanger and Ramya Deshpande and Kevin Murphy and Lucy Colwell},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=HklxbgBKvr}\n}", "authorids": ["christofa@google.com", "ddohan@google.com", "dbelanger@google.com", "ramyadeshpande@google.com", "lcolwell@google.com", "kpmurphy@google.com"], "title": "Model-based reinforcement learning for biological sequence design", "original_pdf": "/attachment/79f3637b9c4b6fb947084c22862243b85769a26c.pdf", "pdf": "/pdf/d20fd415b269aeddb4b8f0183242fecb381f7437.pdf", "abstract": "The ability to design biological structures such as DNA or proteins would have considerable medical and industrial impact. Doing so presents a challenging black-box optimization problem characterized by the large-batch, low round setting due to the need for labor-intensive wet lab evaluations. In response, we propose using reinforcement learning (RL) based on proximal-policy optimization (PPO) for biological sequence design. RL provides a flexible framework for optimization generative sequence models to achieve specific criteria, such as diversity among the high-quality sequences discovered. We propose a model-based variant of PPO, DyNA-PPO, to improve sample efficiency, where the policy for a new round is trained offline using a simulator fit on functional measurements from prior rounds. To accommodate the growing number of observations across rounds, the simulator model is automatically selected at each round from a pool of diverse models of varying capacity. On the tasks of designing DNA transcription factor binding sites, designing antimicrobial proteins, and optimizing the energy of Ising models based on protein structure, we find that DyNA-PPO performs significantly better than existing methods in settings in which modeling is feasible, while still not performing worse in situations in which a reliable model cannot be learned.", "full_presentation_video": ""}, "forum": "HklxbgBKvr", "id": "HklxbgBKvr"}, "Hke0K1HKwr": {"content": {"appendix": "", "TL;DR": "Our approach is the first attempt to leverage a sequential latent variable model for knowledge selection in the multi-turn knowledge-grounded dialogue. 
It achieves the new state-of-the-art performance on Wizard of Wikipedia benchmark.", "keywords": ["generation", "nlp", "transformer"], "paperhash": "kim|sequential_latent_knowledge_selection_for_knowledgegrounded_dialogue", "code": "https://github.com/bckim92/sequential-knowledge-transformer", "spotlight_video": "", "authorids": ["byeongchang.kim@vision.snu.ac.kr", "jaewoo.ahn@vision.snu.ac.kr", "gunhee@snu.ac.kr"], "poster": "", "slides": "", "authors": ["Byeongchang Kim", "Jaewoo Ahn", "Gunhee Kim"], "_bibtex": "@inproceedings{\nKim2020Sequential,\ntitle={Sequential Latent Knowledge Selection for Knowledge-Grounded Dialogue},\nauthor={Byeongchang Kim and Jaewoo Ahn and Gunhee Kim},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=Hke0K1HKwr}\n}", "original_pdf": "/attachment/63006ade3338aa6dcc914324e4a9f120baf0ff75.pdf", "title": "Sequential Latent Knowledge Selection for Knowledge-Grounded Dialogue", "pdf": "/pdf/1a7e5e43b339d1db95cd03b0dc33382c6af1e444.pdf", "abstract": "Knowledge-grounded dialogue is a task of generating an informative response based on both discourse context and external knowledge. As we focus on better modeling the knowledge selection in the multi-turn knowledge-grounded dialogue, we propose a sequential latent variable model as the first approach to this matter. The model named sequential knowledge transformer (SKT) can keep track of the prior and posterior distribution over knowledge; as a result, it can not only reduce the ambiguity caused from the diversity in knowledge selection of conversation but also better leverage the response information for proper choice of knowledge. Our experimental results show that the proposed model improves the knowledge selection accuracy and subsequently the performance of utterance generation. We achieve the new state-of-the-art performance on Wizard of Wikipedia (Dinan et al., 2019) as one of the most large-scale and challenging benchmarks. We further validate the effectiveness of our model over existing conversation methods in another knowledge-based dialogue Holl-E dataset (Moghe et al., 2018).", "full_presentation_video": ""}, "forum": "Hke0K1HKwr", "id": "Hke0K1HKwr"}, "Hkekl0NFPr": {"content": {"appendix": "", "TL;DR": "We propose a novel algorithm for learning fair representations that can simultaneously mitigate two notions of disparity among different demographic subgroups.", "keywords": ["fairness", "representation learning"], "paperhash": "zhao|conditional_learning_of_fair_representations", "spotlight_video": "", "poster": "", "slides": "", "abstract": "We propose a novel algorithm for learning fair representations that can simultaneously mitigate two notions of disparity among different demographic subgroups in the classification setting. Two key components underpinning the design of our algorithm are balanced error rate and conditional alignment of representations. We show how these two components contribute to ensuring accuracy parity and equalized false-positive and false-negative rates across groups without impacting demographic parity. Furthermore, we also demonstrate both in theory and on two real-world experiments that the proposed algorithm leads to a better utility-fairness trade-off on balanced datasets compared with existing algorithms on learning fair representations for classification. 
\n", "_bibtex": "@inproceedings{\nZhao2020Conditional,\ntitle={Conditional Learning of Fair Representations},\nauthor={Han Zhao and Amanda Coston and Tameem Adel and Geoffrey J. Gordon},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=Hkekl0NFPr}\n}", "authorids": ["han.zhao@cs.cmu.edu", "acoston@cs.cmu.edu", "tah47@cam.ac.uk", "ggordon@cs.cmu.edu"], "title": "Conditional Learning of Fair Representations", "authors": ["Han Zhao", "Amanda Coston", "Tameem Adel", "Geoffrey J. Gordon"], "original_pdf": "/attachment/8d7bc4f6ba8dde3b4c243e2544b16b32c6e0c6c4.pdf", "pdf": "/pdf/be79751a1a7fdbbffaf86bc2065f2ade9f465e3b.pdf", "full_presentation_video": ""}, "forum": "Hkekl0NFPr", "id": "Hkekl0NFPr"}, "BJg4NgBKvH": {"content": {"appendix": "", "keywords": ["attention", "imagenet", "optimization"], "paperhash": "martinez|training_binary_neural_networks_with_realtobinary_convolutions", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Brais Martinez", "Jing Yang", "Adrian Bulat", "Georgios Tzimiropoulos"], "_bibtex": "@inproceedings{\nMartinez2020Training,\ntitle={Training binary neural networks with real-to-binary convolutions},\nauthor={Brais Martinez and Jing Yang and Adrian Bulat and Georgios Tzimiropoulos},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=BJg4NgBKvH}\n}", "authorids": ["brais.mart@gmail.com", "psxjy3@nottingham.ac.uk", "adrian@adrianbulat.com", "yorgos.tzimiropoulos@nottingham.ac.uk"], "title": "Training binary neural networks with real-to-binary convolutions", "original_pdf": "/attachment/6b0115eac68b04eb68da4a6dacd4ace23cf4e615.pdf", "pdf": "/pdf/5407a146b291f949769445cf84c4e2b4f152e5fc.pdf", "abstract": "This paper shows how to train binary networks to within a few percent points (~3-5%) of the full precision counterpart. We first show how to build a strong baseline, which already achieves state-of-the-art accuracy, by combining recently proposed advances and carefully adjusting the optimization procedure. Secondly, we show that by attempting to minimize the discrepancy between the output of the binary and the corresponding real-valued convolution, additional significant accuracy gains can be obtained. We materialize this idea in two complementary ways: (1) with a loss function, during training, by matching the spatial attention maps computed at the output of the binary and real-valued convolutions, and (2) in a data-driven manner, by using the real-valued activations, available during inference prior to the binarization process, for re-scaling the activations right after the binary convolution. Finally, we show that, when putting all of our improvements together, the proposed model beats the current state of the art by more than 5% top-1 accuracy on ImageNet and reduces the gap to its real-valued counterpart to less than 3% and 5% top-1 accuracy on CIFAR-100 and ImageNet respectively when using a ResNet-18 architecture. 
Code available at https://github.com/brais-martinez/real2binary", "full_presentation_video": ""}, "forum": "BJg4NgBKvH", "id": "BJg4NgBKvH"}, "ryxgJTEYDr": {"content": {"appendix": "", "TL;DR": "Learning an implicit master policy, as a master policy in HRL can fail to generalize.", "keywords": ["ensembles", "generalization", "hierarchical reinforcement learning", "information bottleneck", "reinforcement learning", "variational information bottleneck"], "paperhash": "goyal|reinforcement_learning_with_competitive_ensembles_of_informationconstrained_primitives", "spotlight_video": "", "poster": "", "slides": "", "abstract": "Reinforcement learning agents that operate in diverse and complex environments can benefit from the structured decomposition of their behavior. Often, this is addressed in the context of hierarchical reinforcement learning, where the aim is to decompose a policy into lower-level primitives or options, and a higher-level meta-policy that triggers the appropriate behaviors for a given situation. However, the meta-policy must still produce appropriate decisions in all states.\nIn this work, we propose a policy design that decomposes into primitives, similarly to hierarchical reinforcement learning, but without a high-level meta-policy. Instead, each primitive can decide for themselves whether they wish to act in the current state.\nWe use an information-theoretic mechanism for enabling this decentralized decision: each primitive chooses how much information it needs about the current state to make a decision and the primitive that requests the most information about the current state acts in the world. The primitives are regularized to use as little information as possible, which leads to natural competition and specialization. We experimentally demonstrate that this policy architecture improves over both flat and hierarchical policies in terms of generalization. 
", "_bibtex": "@inproceedings{\nGoyal2020Reinforcement,\ntitle={Reinforcement Learning with Competitive Ensembles of Information-Constrained Primitives},\nauthor={Anirudh Goyal and Shagun Sodhani and Jonathan Binas and Xue Bin Peng and Sergey Levine and Yoshua Bengio},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=ryxgJTEYDr}\n}", "authorids": ["anirudhgoyal9119@gmail.com", "sshagunsodhani@gmail.com", "jbinas@gmail.com", "xbpeng@berkeley.edu", "svlevine@eecs.berkeley.edu", "yoshua.bengio@mila.quebec"], "title": "Reinforcement Learning with Competitive Ensembles of Information-Constrained Primitives", "authors": ["Anirudh Goyal", "Shagun Sodhani", "Jonathan Binas", "Xue Bin Peng", "Sergey Levine", "Yoshua Bengio"], "original_pdf": "/attachment/6f09efb119d0c6ca9615d906ea4c837e770b7efb.pdf", "pdf": "/pdf/8ce5bb18a2df4be9d75fa4294af41e2cf8a4ab37.pdf", "full_presentation_video": ""}, "forum": "ryxgJTEYDr", "id": "ryxgJTEYDr"}, "rJg8TeSFDH": {"content": {"appendix": "", "TL;DR": "We propose an exponentially growing learning rate schedule for networks with BatchNorm, which surprisingly performs well in practice and is provably equivalent to popular LR schedules like Step Decay.", "keywords": ["batch normalization", "deep learning theory", "generalization", "learning rate", "momentum", "optimization"], "paperhash": "li|an_exponential_learning_rate_schedule_for_deep_learning", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Zhiyuan Li", "Sanjeev Arora"], "_bibtex": "@inproceedings{\nLi2020An,\ntitle={An Exponential Learning Rate Schedule for Deep Learning},\nauthor={Zhiyuan Li and Sanjeev Arora},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=rJg8TeSFDH}\n}", "authorids": ["zhiyuanli@cs.princeton.edu", "arora@cs.princeton.edu"], "title": "An Exponential Learning Rate Schedule for Deep Learning", "original_pdf": "/attachment/6f1a16fdd8f783b8f9e37e23095efdcd973081d0.pdf", "pdf": "/pdf/c0a5feba02a5a8332a346bf69eb4ec194a8c9a24.pdf", "abstract": "Intriguing empirical evidence exists that deep learning can work well with exotic schedules for varying the learning rate. This paper suggests that the phenomenon may be due to Batch Normalization or BN(Ioffe & Szegedy, 2015), which is ubiq- uitous and provides benefits in optimization and generalization across all standard architectures. The following new results are shown about BN with weight decay and momentum (in other words, the typical use case which was not considered in earlier theoretical analyses of stand-alone BN (Ioffe & Szegedy, 2015; Santurkar et al., 2018; Arora et al., 2018)\n\u2022 Training can be done using SGD with momentum and an exponentially in- creasing learning rate schedule, i.e., learning rate increases by some (1 + \u03b1) factor in every epoch for some \u03b1 > 0. (Precise statement in the paper.) To the best of our knowledge this is the first time such a rate schedule has been successfully used, let alone for highly successful architectures. As ex- pected, such training rapidly blows up network weights, but the net stays well-behaved due to normalization.\n\u2022 Mathematical explanation of the success of the above rate schedule: a rigor- ous proof that it is equivalent to the standard setting of BN + SGD + Standard Rate Tuning + Weight Decay + Momentum. 
This equivalence holds for other normalization layers as well, Group Normalization(Wu & He, 2018), Layer Normalization(Ba et al., 2016), Instance Norm(Ulyanov et al., 2016), etc.\n\u2022 A worked-out toy example illustrating the above linkage of hyper- parameters. Using either weight decay or BN alone reaches global minimum, but convergence fails when both are used.", "full_presentation_video": ""}, "forum": "rJg8TeSFDH", "id": "rJg8TeSFDH"}, "BJe-91BtvH": {"content": {"appendix": "", "keywords": ["attention", "generation", "semantic segmentation", "unsupervised"], "paperhash": "mokady|masked_based_unsupervised_content_transfer", "code": "https://github.com/rmokady/mbu-content-tansfer", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Ron Mokady", "Sagie Benaim", "Lior Wolf", "Amit Bermano"], "_bibtex": "@inproceedings{\nMokady2020Masked,\ntitle={Masked Based Unsupervised Content Transfer},\nauthor={Ron Mokady and Sagie Benaim and Lior Wolf and Amit Bermano},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=BJe-91BtvH}\n}", "authorids": ["sagiebenaim@gmail.com", "ron.mokady@gmail.com", "wolf@fb.com", "amit.bermano@gmail.com"], "title": "Masked Based Unsupervised Content Transfer", "original_pdf": "/attachment/6d44c1e33b011f87d29d07a9c908457e5074e6a2.pdf", "pdf": "/pdf/b13c0059577e9b0336cdb2648d98320b62954582.pdf", "abstract": "We consider the problem of translating, in an unsupervised manner, between two domains where one contains some additional information compared to the other. The proposed method disentangles the common and separate parts of these domains and, through the generation of a mask, focuses the attention of the underlying network to the desired augmentation alone, without wastefully reconstructing the entire target. This enables state-of-the-art quality and variety of content translation, as demonstrated through extensive quantitative and qualitative evaluation. Our method is also capable of adding the separate content of different guide images and domains as well as remove existing separate content. Furthermore, our method enables weakly-supervised semantic segmentation of the separate part of each domain, where only class labels are provided. 
Our code is available at https://github.com/rmokady/mbu-content-tansfer.\n", "full_presentation_video": ""}, "forum": "BJe-91BtvH", "id": "BJe-91BtvH"}, "BJeAHkrYDS": {"content": {"appendix": "", "TL;DR": "We introduce Variational Intrinsic Successor FeatuRes (VISR), a novel algorithm which learns controllable features that can be leveraged to provide fast task inference through the successor features framework.", "keywords": ["generalization", "reinforcement learning", "unsupervised"], "paperhash": "hansen|fast_task_inference_with_variational_intrinsic_successor_features", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Steven Hansen", "Will Dabney", "Andre Barreto", "David Warde-Farley", "Tom Van de Wiele", "Volodymyr Mnih"], "_bibtex": "@inproceedings{\nHansen2020Fast,\ntitle={Fast Task Inference with Variational Intrinsic Successor Features},\nauthor={Steven Hansen and Will Dabney and Andre Barreto and David Warde-Farley and Tom Van de Wiele and Volodymyr Mnih},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=BJeAHkrYDS}\n}", "authorids": ["stevenhansen@google.com", "wdabney@google.com", "andrebarreto@google.com", "dwf@google.com", "tvdwiele@gmail.com", "vmnih@google.com"], "title": "Fast Task Inference with Variational Intrinsic Successor Features", "original_pdf": "/attachment/e3295f4268387bd664c3adc067c77b1932246411.pdf", "pdf": "/pdf/16e17334730743e99b581dffa985e2cc2b2e8603.pdf", "abstract": "It has been established that diverse behaviors spanning the controllable subspace of a Markov decision process can be trained by rewarding a policy for being distinguishable from other policies. However, one limitation of this formulation is the difficulty to generalize beyond the finite set of behaviors being explicitly learned, as may be needed in subsequent tasks. Successor features provide an appealing solution to this generalization problem, but require defining the reward function as linear in some grounded feature space. In this paper, we show that these two techniques can be combined, and that each method solves the other's primary limitation. To do so we introduce Variational Intrinsic Successor FeatuRes (VISR), a novel algorithm which learns controllable features that can be leveraged to provide enhanced generalization and fast task inference through the successor features framework. We empirically validate VISR on the full Atari suite, in a novel setup wherein the rewards are only exposed briefly after a long unsupervised phase. 
Achieving human-level performance on 12 games and beating all baselines, we believe VISR represents a step towards agents that rapidly learn from limited feedback.", "full_presentation_video": ""}, "forum": "BJeAHkrYDS", "id": "BJeAHkrYDS"}, "rkecJ6VFvr": {"content": {"appendix": "", "TL;DR": "We introduce the 2-simplicial Transformer and show that this architecture is a useful inductive bias for logical reasoning in the context of deep reinforcement learning.", "keywords": ["attention", "inductive bias", "logic", "logical reasoning", "reasoning", "reinforcement learning", "transformer"], "paperhash": "clift|logic_and_the_2simplicial_transformer", "code": "https://github.com/dmurfet/2simplicialtransformer", "spotlight_video": "", "authorids": ["jamesedwardclift@gmail.com", "dmitry.doryn@gmail.com", "d.murfet@unimelb.edu.au", "james.wallbridge@gmail.com"], "poster": "", "slides": "", "authors": ["James Clift", "Dmitry Doryn", "Daniel Murfet", "James Wallbridge"], "_bibtex": "@inproceedings{\nClift2020Logic,\ntitle={Logic and the 2-Simplicial Transformer},\nauthor={James Clift and Dmitry Doryn and Daniel Murfet and James Wallbridge},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=rkecJ6VFvr}\n}", "original_pdf": "/attachment/c0ae404bcf2a15a99c78c49f9331bbdfdfd4cd1d.pdf", "title": "Logic and the 2-Simplicial Transformer", "pdf": "/pdf/3c77c97bf2a64439c08bd8853682b3a03a8b74ed.pdf", "abstract": "We introduce the 2-simplicial Transformer, an extension of the Transformer which includes a form of higher-dimensional attention generalising the dot-product attention, and uses this attention to update entity representations with tensor products of value vectors. We show that this architecture is a useful inductive bias for logical reasoning in the context of deep reinforcement learning.\n", "full_presentation_video": ""}, "forum": "rkecJ6VFvr", "id": "rkecJ6VFvr"}, "rylmoxrFDH": {"content": {"appendix": "", "TL;DR": "signal propagation theory applied to continuous surrogates of binary nets; counter intuitive initialisation; reparameterisation trick not helpful", "keywords": [], "paperhash": "stamatescu|critical_initialisation_in_continuous_approximations_of_binary_neural_networks", "spotlight_video": "", "poster": "", "slides": "", "authors": ["George Stamatescu", "Federica Gerace", "Carlo Lucibello", "Ian Fuss", "Langford White"], "_bibtex": "@inproceedings{\nStamatescu2020Critical,\ntitle={Critical initialisation in continuous approximations of binary neural networks},\nauthor={George Stamatescu and Federica Gerace and Carlo Lucibello and Ian Fuss and Langford White},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=rylmoxrFDH}\n}", "authorids": ["george.stamatescu@gmail.com", "federicagerace91@gmail.com", "carlo.lucibello@gmail.com", "ian.fuss@adelaide.edu.au", "lang.white@adelaide.edu.au"], "title": "Critical initialisation in continuous approximations of binary neural networks", "original_pdf": "/attachment/b649da9286f2afd69b331f5bcd514ad70753595d.pdf", "pdf": "/pdf/71a41b43a6dc91d6e4322cb83f251fd555711e7e.pdf", "abstract": "The training of stochastic neural network models with binary ($\\pm1$) weights and activations via continuous surrogate networks is investigated. We derive new surrogates using a novel derivation based on writing the stochastic neural network as a Markov chain. 
This derivation also encompasses existing variants of the surrogates presented in the literature. Following this, we theoretically study the surrogates at initialisation. We derive, using mean field theory, a set of scalar equations describing how input signals propagate through the randomly initialised networks. The equations reveal whether so-called critical initialisations exist for each surrogate network, where the network can be trained to arbitrary depth. Moreover, we predict theoretically and confirm numerically, that common weight initialisation schemes used in standard continuous networks, when applied to the mean values of the stochastic binary weights, yield poor training performance. This study shows that, contrary to common intuition, the means of the stochastic binary weights should be initialised close to $\\pm 1$, for deeper networks to be trainable.", "full_presentation_video": ""}, "forum": "rylmoxrFDH", "id": "rylmoxrFDH"}, "BJl6bANtwH": {"content": {"appendix": "", "TL;DR": "We present local ensembles, a method for detecting extrapolation in trained models, which approximates the variance of an ensemble using local-second order information.", "keywords": ["active learning", "ensembles", "reliability"], "paperhash": "madras|detecting_extrapolation_with_local_ensembles", "code": "https://github.com/dmadras/local-ensembles", "spotlight_video": "", "authorids": ["david.madras@mail.utoronto.ca", "atwoodj@google.com", "alexdamour@google.com"], "poster": "", "slides": "", "authors": ["David Madras", "James Atwood", "Alexander D'Amour"], "_bibtex": "@inproceedings{\nMadras2020Detecting,\ntitle={Detecting Extrapolation with Local Ensembles},\nauthor={David Madras and James Atwood and Alexander D'Amour},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=BJl6bANtwH}\n}", "original_pdf": "/attachment/cf90c6458647ed082aa6f4ac21990b37977d5e15.pdf", "title": "Detecting Extrapolation with Local Ensembles", "pdf": "/pdf/2395829e9ecb40fc8f0538d51a34289f765d9c90.pdf", "abstract": "We present local ensembles, a method for detecting extrapolation at test time in a pre-trained model. We focus on underdetermination as a key component of extrapolation: we aim to detect when many possible predictions are consistent with the training data and model class. Our method uses local second-order information to approximate the variance of predictions across an ensemble of models from the same class. We compute this approximation by estimating the norm of the component of a test point's gradient that aligns with the low-curvature directions of the Hessian, and provide a tractable method for estimating this quantity. Experimentally, we show that our method is capable of detecting when a pre-trained model is extrapolating on test data, with applications to out-of-distribution detection, detecting spurious correlates, and active learning.", "full_presentation_video": ""}, "forum": "BJl6bANtwH", "id": "BJl6bANtwH"}, "ryeHuJBtPH": {"content": {"appendix": "", "TL;DR": "We develop a new self-attention based graph neural network called Hyper-SAGNN applicable to homogeneous and heterogeneous hypergraphs with variable hyperedge sizes that can fulfill tasks like node classification and hyperedge prediction. 
", "keywords": ["attention", "graph networks", "representation learning", "self attention"], "paperhash": "zhang|hypersagnn_a_selfattention_based_graph_neural_network_for_hypergraphs", "code": "https://drive.google.com/drive/folders/1kIOc4SlAJllUJsrr2OnZ4izIQIw2JexU?usp=sharing", "spotlight_video": "", "authorids": ["ruochiz@andrew.cmu.edu", "logic.zys@gmail.com", "jianma@cs.cmu.edu"], "poster": "", "slides": "", "authors": ["Ruochi Zhang", "Yuesong Zou", "Jian Ma"], "_bibtex": "@inproceedings{\nZhang2020Hyper-SAGNN:,\ntitle={Hyper-SAGNN: a self-attention based graph neural network for hypergraphs},\nauthor={Ruochi Zhang and Yuesong Zou and Jian Ma},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=ryeHuJBtPH}\n}", "original_pdf": "/attachment/62080177bc5348c92d7ad385820308f7de3311f5.pdf", "title": "Hyper-SAGNN: a self-attention based graph neural network for hypergraphs", "pdf": "/pdf/78bbb8eaf72d13a892c120e09b784166991d78f9.pdf", "abstract": "Graph representation learning for hypergraphs can be utilized to extract patterns among higher-order interactions that are critically important in many real world problems. Current approaches designed for hypergraphs, however, are unable to handle different types of hypergraphs and are typically not generic for various learning tasks. Indeed, models that can predict variable-sized heterogeneous hyperedges have not been available. Here we develop a new self-attention based graph neural network called Hyper-SAGNN applicable to homogeneous and heterogeneous hypergraphs with variable hyperedge sizes. We perform extensive evaluations on multiple datasets, including four benchmark network datasets and two single-cell Hi-C datasets in genomics. We demonstrate that Hyper-SAGNN significantly outperforms state-of-the-art methods on traditional tasks while also achieving great performance on a new task called outsider identification. We believe that Hyper-SAGNN will be useful for graph representation learning to uncover complex higher-order interactions in different applications. ", "full_presentation_video": ""}, "forum": "ryeHuJBtPH", "id": "ryeHuJBtPH"}, "HJxEhREKDH": {"content": {"appendix": "", "TL;DR": "Under certain condition on the input and output linear transformations, both GD and SGD can achieve global convergence for training deep linear ResNets.", "keywords": ["gradient descent"], "paperhash": "zou|on_the_global_convergence_of_training_deep_linear_resnets", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Difan Zou", "Philip M. Long", "Quanquan Gu"], "_bibtex": "@inproceedings{\nZou2020On,\ntitle={On the Global Convergence of Training Deep Linear ResNets},\nauthor={Difan Zou and Philip M. Long and Quanquan Gu},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=HJxEhREKDH}\n}", "authorids": ["knowzou@ucla.edu", "plong@google.com", "qgu@cs.ucla.edu"], "title": "On the Global Convergence of Training Deep Linear ResNets", "original_pdf": "/attachment/e9f06e9d1384a7cd6235301dbc5b970844fe863a.pdf", "pdf": "/pdf/40f991e7d57fcf3070e9cc9829c477216a40cc72.pdf", "abstract": "We study the convergence of gradient descent (GD) and stochastic gradient descent (SGD) for training $L$-hidden-layer linear residual networks (ResNets). 
We prove that for training deep residual networks with certain linear transformations at input and output layers, which are fixed throughout training, both GD and SGD with zero initialization on all hidden weights can converge to the global minimum of the training loss. Moreover, when specializing to appropriate Gaussian random linear transformations, GD and SGD provably optimize wide enough deep linear ResNets. Compared with the global convergence result of GD for training standard deep linear networks \\citep{du2019width}, our condition on the neural network width is sharper by a factor of $O(\\kappa L)$, where $\\kappa$ denotes the condition number of the covariance matrix of the training data. We further propose modified identity input and output transformations, and show that a $(d+k)$-wide neural network is sufficient to guarantee the global convergence of GD/SGD, where $d,k$ are the input and output dimensions respectively.", "full_presentation_video": ""}, "forum": "HJxEhREKDH", "id": "HJxEhREKDH"}, "SklkDkSFPB": {"content": {"appendix": "", "TL;DR": "A simple and effective method for reducing large neural networks to flexible parameter targets based on block substitution.", "keywords": ["capacity", "cnn", "compression", "imagenet", "model compression", "network compression", "neural architecture search"], "paperhash": "turner|blockswap_fisherguided_block_substitution_for_network_compression_on_a_budget", "code": "https://github.com/BayesWatch/pytorch-blockswap", "spotlight_video": "", "authorids": ["jack.turner@ed.ac.uk", "elliot.j.crowley@ed.ac.uk", "mob@inf.ed.ac.uk", "a.storkey@ed.ac.uk", "g.d.b.gray@ed.ac.uk"], "poster": "", "slides": "", "authors": ["Jack Turner", "Elliot J. Crowley", "Michael O'Boyle", "Amos Storkey", "Gavin Gray"], "_bibtex": "@inproceedings{\nTurner2020BlockSwap:,\ntitle={BlockSwap: Fisher-guided Block Substitution for Network Compression on a Budget},\nauthor={Jack Turner and Elliot J. Crowley and Michael O'Boyle and Amos Storkey and Gavin Gray},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=SklkDkSFPB}\n}", "original_pdf": "/attachment/51d9029cb758cf9f4062b5ab7041b21a1b5aec10.pdf", "title": "BlockSwap: Fisher-guided Block Substitution for Network Compression on a Budget", "pdf": "/pdf/b4bc11393d862c80d37a657a9317c7f4a14f7fa4.pdf", "abstract": "The desire to map neural networks to varying-capacity devices has led to the development of a wealth of compression techniques, many of which involve replacing standard convolutional blocks in a large network with cheap alternative blocks. However, not all blocks are created equally; for a required compute budget there may exist a potent combination of many different cheap blocks, though exhaustively searching for such a combination is prohibitively expensive. In this work, we develop BlockSwap: a fast algorithm for choosing networks with interleaved block types by passing a single minibatch of training data through randomly initialised networks and gauging their Fisher potential. These networks can then be used as students and distilled with the original large network as a teacher. We demonstrate the effectiveness of the chosen networks across CIFAR-10 and ImageNet for classification, and COCO for detection, and provide a comprehensive ablation study of our approach. 
BlockSwap quickly explores possible block configurations using a simple architecture ranking system, yielding highly competitive networks in orders of magnitude less time than most architecture search techniques (e.g. under 5 minutes on a single GPU for CIFAR-10).", "full_presentation_video": ""}, "forum": "SklkDkSFPB", "id": "SklkDkSFPB"}, "ByeMPlHKPH": {"content": {"appendix": "", "keywords": ["attention", "automl", "compression", "language modeling", "machine translation", "neural architecture search", "nlp", "question answering", "transformer"], "paperhash": "wu|lite_transformer_with_longshort_range_attention", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Zhanghao Wu*", "Zhijian Liu*", "Ji Lin", "Yujun Lin", "Song Han"], "_bibtex": "@inproceedings{\nWu*2020Lite,\ntitle={Lite Transformer with Long-Short Range Attention},\nauthor={Zhanghao Wu* and Zhijian Liu* and Ji Lin and Yujun Lin and Song Han},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=ByeMPlHKPH}\n}", "authorids": ["zhanghao.wu@outlook.com", "zhijian@mit.edu", "jilin@mit.edu", "yujunlin@mit.edu", "songhan@mit.edu"], "title": "Lite Transformer with Long-Short Range Attention", "original_pdf": "/attachment/25cd999b7b58f27536252c6bbed5a240037a58c3.pdf", "pdf": "/pdf/2ce670d80c32b983dbdf805e3314243b2505ce9b.pdf", "abstract": "Transformer has become ubiquitous in natural language processing (e.g., machine translation, question answering); however, it requires an enormous amount of computation to achieve high performance, which makes it unsuitable for mobile applications, since mobile phones are tightly constrained by hardware resources and battery life. In this paper, we investigate the mobile setting (under 500M Mult-Adds) for NLP tasks to facilitate deployment on edge devices. We present Long-Short Range Attention (LSRA), where one group of heads specializes in the local context modeling (by convolution) while another group captures the long-distance relationship (by attention). Based on this primitive, we design Lite Transformer, which is tailored for mobile NLP applications. Our Lite Transformer demonstrates consistent improvement over the transformer on three well-established language tasks: machine translation, abstractive summarization, and language modeling. It outperforms the transformer on WMT\u201914 English-French by 1.2 BLEU under 500M Mult-Adds and 1.7 BLEU under 100M Mult-Adds, and reduces the computation of the transformer base model by 2.5x. Further, with general techniques, our Lite Transformer achieves 18.2x model size compression. For language modeling, our Lite Transformer also achieves 3.8 lower perplexity than the transformer at around 500M Mult-Adds. Without the costly architecture search that requires more than 250 GPU years, our Lite Transformer outperforms the AutoML-based Evolved Transformer by 0.5 higher BLEU under the mobile setting. 
", "full_presentation_video": ""}, "forum": "ByeMPlHKPH", "id": "ByeMPlHKPH"}, "Bkeb7lHtvH": {"content": {"appendix": "", "TL;DR": "How to prevent stale gradients (in asynchronous SGD) from changing minima stability and degrade steady state generalization?", "keywords": ["generalization", "gradient descent", "implicit bias", "learning rate", "momentum", "stability"], "paperhash": "giladi|at_stabilitys_edge_how_to_adjust_hyperparameters_to_preserve_minima_selection_in_asynchronous_training_of_neural_networks", "code": "https://github.com/paper-submissions/delay_stability", "spotlight_video": "", "authorids": ["giladiniv@gmail.com", "mor.shpigel@gmail.com", "elad.hoffer@gmail.com", "daniel.soudry@gmail.com"], "poster": "", "slides": "", "authors": ["Niv Giladi", "Mor Shpigel Nacson", "Elad Hoffer", "Daniel Soudry"], "_bibtex": "@inproceedings{\nGiladi2020At,\ntitle={At Stability's Edge: How to Adjust Hyperparameters to Preserve Minima Selection in Asynchronous Training of Neural Networks?},\nauthor={Niv Giladi and Mor Shpigel Nacson and Elad Hoffer and Daniel Soudry},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=Bkeb7lHtvH}\n}", "original_pdf": "/attachment/8f220cc33ccda4faa32407ea0ae047c4745ad029.pdf", "title": "At Stability's Edge: How to Adjust Hyperparameters to Preserve Minima Selection in Asynchronous Training of Neural Networks?", "pdf": "/pdf/8908d2b62bb681ad54ffe8d0a00e8c5d5a64f67c.pdf", "abstract": "Background: Recent developments have made it possible to accelerate neural networks training significantly using large batch sizes and data parallelism. Training in an asynchronous fashion, where delay occurs, can make training even more scalable. However, asynchronous training has its pitfalls, mainly a degradation in generalization, even after convergence of the algorithm. This gap remains not well understood, as theoretical analysis so far mainly focused on the convergence rate of asynchronous methods.\nContributions: We examine asynchronous training from the perspective of dynamical stability. We find that the degree of delay interacts with the learning rate, to change the set of minima accessible by an asynchronous stochastic gradient descent algorithm. We derive closed-form rules on how the learning rate could be changed, while keeping the accessible set the same. Specifically, for high delay values, we find that the learning rate should be kept inversely proportional to the delay. We then extend this analysis to include momentum. We find momentum should be either turned off, or modified to improve training stability. We provide empirical experiments to validate our theoretical findings.", "full_presentation_video": ""}, "forum": "Bkeb7lHtvH", "id": "Bkeb7lHtvH"}, "SJgwNerKvB": {"content": {"appendix": "", "keywords": ["capacity", "catastrophic forgetting", "continual learning", "hypernetworks", "memory", "transfer learning"], "paperhash": "oswald|continual_learning_with_hypernetworks", "code": "https://github.com/chrhenning/hypercl", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Johannes von Oswald", "Christian Henning", "Jo\u00e3o Sacramento", "Benjamin F. Grewe"], "_bibtex": "@inproceedings{\nOswald2020Continual,\ntitle={Continual learning with hypernetworks},\nauthor={Johannes von Oswald and Christian Henning and Jo\u00e3o Sacramento and Benjamin F. 
Grewe},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=SJgwNerKvB}\n}", "authorids": ["voswaldj@ethz.ch", "henningc@ethz.ch", "sacramento@ini.ethz.ch", "bgrewe@ethz.ch"], "title": "Continual learning with hypernetworks", "original_pdf": "/attachment/4d899720737328e780cf03facbd7355540a21b41.pdf", "pdf": "/pdf/5206218e137ab12a45ab2c7cdde9c53fb4c73b94.pdf", "abstract": "Artificial neural networks suffer from catastrophic forgetting when they are sequentially trained on multiple tasks. To overcome this problem, we present a novel approach based on task-conditioned hypernetworks, i.e., networks that generate the weights of a target model based on task identity. Continual learning (CL) is less difficult for this class of models thanks to a simple key feature: instead of recalling the input-output relations of all previously seen data, task-conditioned hypernetworks only require rehearsing task-specific weight realizations, which can be maintained in memory using a simple regularizer. Besides achieving state-of-the-art performance on standard CL benchmarks, additional experiments on long task sequences reveal that task-conditioned hypernetworks display a very large capacity to retain previous memories. Notably, such long memory lifetimes are achieved in a compressive regime, when the number of trainable hypernetwork weights is comparable or smaller than target network size. We provide insight into the structure of low-dimensional task embedding spaces (the input space of the hypernetwork) and show that task-conditioned hypernetworks demonstrate transfer learning. Finally, forward information transfer is further supported by empirical results on a challenging CL benchmark based on the CIFAR-10/100 image datasets.", "full_presentation_video": ""}, "forum": "SJgwNerKvB", "id": "SJgwNerKvB"}, "Sye0XkBKvS": {"content": {"appendix": "", "TL;DR": "This paper proposes the use of spectral element methods for fast and accurate training of Neural Ordinary Differential Equations for system identification.", "keywords": ["generalization", "memory", "optimization", "rnn"], "paperhash": "quaglino|snode_spectral_discretization_of_neural_odes_for_system_identification", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Alessio Quaglino", "Marco Gallieri", "Jonathan Masci", "Jan Koutn\u00edk"], "_bibtex": "@inproceedings{\nQuaglino2020SNODE:,\ntitle={SNODE: Spectral Discretization of Neural ODEs for System Identification},\nauthor={Alessio Quaglino and Marco Gallieri and Jonathan Masci and Jan Koutn\u00edk},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=Sye0XkBKvS}\n}", "authorids": ["alessio@nnaisense.com", "marco@nnaisense.com", "jonathan@nnaisense.com", "jan@nnaisense.com"], "title": "SNODE: Spectral Discretization of Neural ODEs for System Identification", "original_pdf": "/attachment/78df4362d0818e83f4cf6602f1509061ba041866.pdf", "pdf": "/pdf/2f86e6d4b91a4bbc9621a7ebd67771ac0f1a9e02.pdf", "abstract": "This paper proposes the use of spectral element methods \\citep{canuto_spectral_1988} for fast and accurate training of Neural Ordinary Differential Equations (ODE-Nets; \\citealp{Chen2018NeuralOD}) for system identification. This is achieved by expressing their dynamics as a truncated series of Legendre polynomials. 
The series coefficients, as well as the network weights, are computed by minimizing the weighted sum of the loss function and the violation of the ODE-Net dynamics. The problem is solved by coordinate descent that alternately minimizes, with respect to the coefficients and the weights, two unconstrained sub-problems using standard backpropagation and gradient methods. The resulting optimization scheme is fully time-parallel and results in a low memory footprint. Experimental comparison to standard methods, such as backpropagation through explicit solvers and the adjoint technique \\citep{Chen2018NeuralOD}, on training surrogate models of small and medium-scale dynamical systems shows that it is at least one order of magnitude faster at reaching a comparable value of the loss function. The corresponding testing MSE is one order of magnitude smaller as well, suggesting generalization capabilities increase.", "full_presentation_video": ""}, "forum": "Sye0XkBKvS", "id": "Sye0XkBKvS"}, "SkeAaJrKDS": {"content": {"appendix": "", "TL;DR": "We propose a model-based method called \"Search with Amortized Value Estimates\" (SAVE) which leverages both real and planned experience by combining Q-learning with Monte-Carlo Tree Search, achieving strong performance with very small search budgets.", "keywords": ["model based rl", "planning", "reasoning", "search"], "paperhash": "hamrick|combining_qlearning_and_search_with_amortized_value_estimates", "spotlight_video": "", "poster": "", "slides": "", "abstract": "We introduce \"Search with Amortized Value Estimates\" (SAVE), an approach for combining model-free Q-learning with model-based Monte-Carlo Tree Search (MCTS). In SAVE, a learned prior over state-action values is used to guide MCTS, which estimates an improved set of state-action values. The new Q-estimates are then used in combination with real experience to update the prior. This effectively amortizes the value computation performed by MCTS, resulting in a cooperative relationship between model-free learning and model-based search. SAVE can be implemented on top of any Q-learning agent with access to a model, which we demonstrate by incorporating it into agents that perform challenging physical reasoning tasks and Atari. SAVE consistently achieves higher rewards with fewer training steps, and---in contrast to typical model-based search approaches---yields strong performance with very small search budgets. By combining real experience with information computed during search, SAVE demonstrates that it is possible to improve on both the performance of model-free learning and the computational cost of planning.", "_bibtex": "@inproceedings{\nHamrick2020Combining,\ntitle={Combining Q-Learning and Search with Amortized Value Estimates},\nauthor={Jessica B. Hamrick and Victor Bapst and Alvaro Sanchez-Gonzalez and Tobias Pfaff and Theophane Weber and Lars Buesing and Peter W. Battaglia},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=SkeAaJrKDS}\n}", "authorids": ["jhamrick@google.com", "vbapst@google.com", "alvarosg@google.com", "tpfaff@google.com", "theophane@google.com", "lbuesing@google.com", "peterbattaglia@google.com"], "title": "Combining Q-Learning and Search with Amortized Value Estimates", "authors": ["Jessica B. Hamrick", "Victor Bapst", "Alvaro Sanchez-Gonzalez", "Tobias Pfaff", "Theophane Weber", "Lars Buesing", "Peter W. 
Battaglia"], "original_pdf": "/attachment/02781cbeabdbc845e39e1bc80df0dcab9c31d00e.pdf", "pdf": "/pdf/24f332c1667c84011b2b9eaa63a26cd70cf9e86f.pdf", "full_presentation_video": ""}, "forum": "SkeAaJrKDS", "id": "SkeAaJrKDS"}, "r1lF_CEYwS": {"content": {"appendix": "", "keywords": ["adversarial", "adversarial attacks", "generative models", "robust learning", "security"], "paperhash": "jang|on_the_need_for_topologyaware_generative_models_for_manifoldbased_defenses", "spotlight_video": "", "poster": "", "slides": "", "abstract": "ML algorithms or models, especially deep neural networks (DNNs), have shown significant promise in several areas. However, recently researchers have demonstrated that ML algorithms, especially DNNs, are vulnerable to adversarial examples (slightly perturbed samples that cause mis-classification). Existence of adversarial examples has hindered deployment of ML algorithms in safety-critical sectors, such as security. Several defenses for adversarial examples exist in the literature. One of the important classes of defenses are manifold-based defenses, where a sample is \"pulled back\" into the data manifold before classifying. These defenses rely on the manifold assumption (data lie in a manifold of lower dimension than the input space). These defenses use a generative model to approximate the input distribution. This paper asks the following question: do the generative models used in manifold-based defenses need to be topology-aware? Our paper suggests the answer is yes. We provide theoretical and empirical evidence to support our claim.", "_bibtex": "@inproceedings{\nJang2020On,\ntitle={On the Need for Topology-Aware Generative Models for Manifold-Based Defenses},\nauthor={Uyeong Jang and Susmit Jha and Somesh Jha},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=r1lF_CEYwS}\n}", "authorids": ["wjang@cs.wisc.edu", "susmit.jha@sri.com", "jha@cs.wisc.edu"], "title": "On the Need for Topology-Aware Generative Models for Manifold-Based Defenses", "authors": ["Uyeong Jang", "Susmit Jha", "Somesh Jha"], "original_pdf": "/attachment/b5599f72458355fabacab83fa9ba243395d54702.pdf", "pdf": "/pdf/26c44f3f0ebe4dfd3ad843ef7ec7ec2146c645e7.pdf", "full_presentation_video": ""}, "forum": "r1lF_CEYwS", "id": "r1lF_CEYwS"}, "S1esMkHYPr": {"content": {"appendix": "", "TL;DR": "A flow-based autoregressive model for molecular graph generation. 
Reaching state-of-the-art results on molecule generation and properties optimization.", "keywords": ["attention", "autoregressive models", "fine tuning", "generation", "generative models", "optimization", "reinforcement learning"], "paperhash": "shi|graphaf_a_flowbased_autoregressive_model_for_molecular_graph_generation", "code": "http://bit.ly/2lCkfsr", "spotlight_video": "", "authorids": ["chenceshi@pku.edu.cn", "mkxu@apex.sjtu.edu.cn", "zhaocheng.zhu@umontreal.ca", "wnzhang@sjtu.edu.cn", "mzhang_cs@pku.edu.cn", "jian.tang@hec.ca"], "poster": "", "slides": "", "authors": ["Chence Shi*", "Minkai Xu*", "Zhaocheng Zhu", "Weinan Zhang", "Ming Zhang", "Jian Tang"], "_bibtex": "@inproceedings{\nShi*2020GraphAF:,\ntitle={GraphAF: a Flow-based Autoregressive Model for Molecular Graph Generation},\nauthor={Chence Shi* and Minkai Xu* and Zhaocheng Zhu and Weinan Zhang and Ming Zhang and Jian Tang},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=S1esMkHYPr}\n}", "original_pdf": "/attachment/51ead0a922d808dff548cd24a53f34ad5e2e8ced.pdf", "title": "GraphAF: a Flow-based Autoregressive Model for Molecular Graph Generation", "pdf": "/pdf/e2ef8a6407f03fbdb526bf73073ba5c5c4d81678.pdf", "abstract": "Molecular graph generation is a fundamental problem for drug discovery and has been attracting growing attention. The problem is challenging since it requires not only generating chemically valid molecular structures but also optimizing their chemical properties in the meantime. Inspired by the recent progress in deep generative models, in this paper we propose a flow-based autoregressive model for graph generation called GraphAF. GraphAF combines the advantages of both autoregressive and flow-based approaches and enjoys: (1) high model flexibility for data density estimation; (2) efficient parallel computation for training; (3) an iterative sampling process, which allows leveraging chemical domain knowledge for valency checking. Experimental results show that GraphAF is able to generate 68\\% chemically valid molecules even without chemical knowledge rules and 100\\% valid molecules with chemical rules. The training process of GraphAF is two times faster than the existing state-of-the-art approach GCPN. After fine-tuning the model for goal-directed property optimization with reinforcement learning, GraphAF achieves state-of-the-art performance on both chemical property optimization and constrained property optimization. ", "full_presentation_video": ""}, "forum": "S1esMkHYPr", "id": "S1esMkHYPr"}, "Byg5ZANtvH": {"content": {"appendix": "", "keywords": ["acceleration", "optimization"], "paperhash": "lau|short_and_sparse_deconvolution_a_geometric_approach", "code": "https://github.com/qingqu06/sparse_deconvolution", "spotlight_video": "", "poster": "", "slides": "", "abstract": "Short-and-sparse deconvolution (SaSD) is the problem of extracting localized, recurring motifs in signals with spatial or temporal structure. Variants of this problem arise in applications such as image deblurring, microscopy, neural spike sorting, and more. The problem is challenging in both theory and practice, as natural optimization formulations are nonconvex. Moreover, practical deconvolution problems involve smooth motifs (kernels) whose spectra decay rapidly, resulting in poor conditioning and numerical challenges. 
This paper is motivated by recent theoretical advances \\citep{zhang2017global,kuo2019geometry}, which characterize the optimization landscape of a particular nonconvex formulation of SaSD. This is used to derive a provable algorithm that exactly solves certain non-practical instances of the SaSD problem. We leverage the key ideas from this theory (sphere constraints, data-driven initialization) to develop a practical algorithm, which performs well on data arising from a range of application areas. We highlight key additional challenges posed by the ill-conditioning of real SaSD problems and suggest heuristics (acceleration, continuation, reweighting) to mitigate them. Experiments demonstrate the performance and generality of the proposed method.", "_bibtex": "@inproceedings{\nLau2020Short,\ntitle={Short and Sparse Deconvolution --- A Geometric Approach},\nauthor={Yenson Lau and Qing Qu and Han-Wen Kuo and Pengcheng Zhou and Yuqian Zhang and John Wright},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=Byg5ZANtvH}\n}", "authorids": ["y.lau@columbia.edu", "qq213@nyu.edu", "hk2673@columbia.edu", "pz2230@columbia.edu", "yz2557@cornell.edu", "jw2966@columbia.edu"], "title": "Short and Sparse Deconvolution --- A Geometric Approach", "authors": ["Yenson Lau", "Qing Qu", "Han-Wen Kuo", "Pengcheng Zhou", "Yuqian Zhang", "John Wright"], "original_pdf": "/attachment/0df29318caee2f3f27c9d9b5d59e3f9602f66ba8.pdf", "pdf": "/pdf/8cb35e242b7d8e9629eb764ef53ab1bd262667ae.pdf", "full_presentation_video": ""}, "forum": "Byg5ZANtvH", "id": "Byg5ZANtvH"}, "HygOjhEYDH": {"content": {"appendix": "", "TL;DR": "Learn in temporal point processes by modeling the conditional density, not the conditional intensity.", "keywords": ["missing data"], "paperhash": "shchur|intensityfree_learning_of_temporal_point_processes", "code": "https://github.com/shchur/ifl-tpp", "spotlight_video": "", "authorids": ["shchur@in.tum.de", "bilos@in.tum.de", "guennemann@in.tum.de"], "poster": "", "slides": "", "authors": ["Oleksandr Shchur", "Marin Bilo\u0161", "Stephan G\u00fcnnemann"], "_bibtex": "@inproceedings{\nShchur2020Intensity-Free,\ntitle={Intensity-Free Learning of Temporal Point Processes},\nauthor={Oleksandr Shchur and Marin Bilo\u0161 and Stephan G\u00fcnnemann},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=HygOjhEYDH}\n}", "original_pdf": "/attachment/a179a8a248f1d64e104f269768fe258536ba0697.pdf", "title": "Intensity-Free Learning of Temporal Point Processes", "pdf": "/pdf/1e207a0a70e1401c49767367a73abdfbdfc47d39.pdf", "abstract": "Temporal point processes are the dominant paradigm for modeling sequences of events happening at irregular intervals. The standard way of learning in such models is by estimating the conditional intensity function. However, parameterizing the intensity function usually incurs several trade-offs. We show how to overcome the limitations of intensity-based approaches by directly modeling the conditional distribution of inter-event times. We draw on the literature on normalizing flows to design models that are flexible and efficient. We additionally propose a simple mixture model that matches the flexibility of flow-based models, but also permits sampling and computing moments in closed form. 
The proposed models achieve state-of-the-art performance in standard prediction tasks and are suitable for novel applications, such as learning sequence embeddings and imputing missing data.", "full_presentation_video": ""}, "forum": "HygOjhEYDH", "id": "HygOjhEYDH"}, "SJlpYJBKvH": {"content": {"appendix": "", "TL;DR": "A novel set of metrics for measuring reliability of reinforcement learning algorithms (+ accompanying statistical tests)", "keywords": ["attention", "reinforcement learning", "reliability"], "paperhash": "chan|measuring_the_reliability_of_reinforcement_learning_algorithms", "code": "https://github.com/google-research/rl-reliability-metrics", "spotlight_video": "", "authorids": ["scychan@google.com", "sfishman@google.com", "kbanoop@google.com", "canny@google.com", "sguada@google.com"], "poster": "", "slides": "", "authors": ["Stephanie C.Y. Chan", "Samuel Fishman", "Anoop Korattikara", "John Canny", "Sergio Guadarrama"], "_bibtex": "@inproceedings{\nChan2020Measuring,\ntitle={Measuring the Reliability of Reinforcement Learning Algorithms},\nauthor={Stephanie C.Y. Chan and Samuel Fishman and Anoop Korattikara and John Canny and Sergio Guadarrama},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=SJlpYJBKvH}\n}", "original_pdf": "/attachment/fdffc4b1c1da94b495237eb23c0482d0c3741a65.pdf", "title": "Measuring the Reliability of Reinforcement Learning Algorithms", "pdf": "/pdf/4b88278b4db6dff57af01390b2d4bc557699fbd7.pdf", "abstract": "Lack of reliability is a well-known issue for reinforcement learning (RL) algorithms. This problem has gained increasing attention in recent years, and efforts to improve it have grown substantially. To aid RL researchers and production users with the evaluation and improvement of reliability, we propose a set of metrics that quantitatively measure different aspects of reliability. In this work, we focus on variability and risk, both during training and after learning (on a fixed policy). We designed these metrics to be general-purpose, and we also designed complementary statistical tests to enable rigorous comparisons on these metrics. In this paper, we first describe the desired properties of the metrics and their design, the aspects of reliability that they measure, and their applicability to different scenarios. We then describe the statistical tests and make additional practical recommendations for reporting results. The metrics and accompanying statistical tools have been made available as an open-source library. 
We apply our metrics to a set of common RL algorithms and environments, compare them, and analyze the results.", "full_presentation_video": ""}, "forum": "SJlpYJBKvH", "id": "SJlpYJBKvH"}, "SJlHwkBYDH": {"content": {"appendix": "", "TL;DR": "We proposed a Nesterov Iterative Fast Gradient Sign Method (NI-FGSM) and a Scale-Invariant attack Method (SIM) that can boost the transferability of adversarial examples for image classification.", "keywords": ["adversarial", "adversarial attacks", "generation", "imagenet", "optimization", "overfitting", "perturbation", "transfer learning"], "paperhash": "lin|nesterov_accelerated_gradient_and_scale_invariance_for_adversarial_attacks", "code": "https://github.com/JHL-HUST/SI-NI-FGSM", "spotlight_video": "", "authorids": ["jdlin@hust.edu.cn", "cbsong@hust.edu.cn", "brooklet60@hust.edu.cn", "wanglw@cis.pku.edu.cn", "jeh@cs.cornell.edu"], "poster": "", "slides": "", "authors": ["Jiadong Lin", "Chuanbiao Song", "Kun He", "Liwei Wang", "John E. Hopcroft"], "_bibtex": "@inproceedings{\nLin2020Nesterov,\ntitle={Nesterov Accelerated Gradient and Scale Invariance for Adversarial Attacks},\nauthor={Jiadong Lin and Chuanbiao Song and Kun He and Liwei Wang and John E. Hopcroft},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=SJlHwkBYDH}\n}", "original_pdf": "/attachment/99337dfa794aff2735e8745833e5332a1c76c372.pdf", "title": "Nesterov Accelerated Gradient and Scale Invariance for Adversarial Attacks", "pdf": "/pdf/222a1a42bd9fdc01887c142f6bb482b6a25007fa.pdf", "abstract": "Deep learning models are vulnerable to adversarial examples crafted by applying human-imperceptible perturbations on benign inputs. However, under the black-box setting, most existing adversaries often have poor transferability when attacking other defense models. In this work, from the perspective of regarding adversarial example generation as an optimization process, we propose two new methods to improve the transferability of adversarial examples, namely Nesterov Iterative Fast Gradient Sign Method (NI-FGSM) and Scale-Invariant attack Method (SIM). NI-FGSM aims to adapt Nesterov accelerated gradient into the iterative attacks so as to effectively look ahead and improve the transferability of adversarial examples. SIM, in turn, is based on our discovery of the scale-invariant property of deep learning models, which we leverage to optimize the adversarial perturbations over scale copies of the input images so as to avoid \"overfitting\" on the white-box model being attacked and to generate more transferable adversarial examples. NI-FGSM and SIM can be naturally integrated to build a robust gradient-based attack to generate more transferable adversarial examples against the defense models. Empirical results on the ImageNet dataset demonstrate that our attack methods exhibit higher transferability and achieve higher attack success rates than state-of-the-art gradient-based attacks.", "full_presentation_video": ""}, "forum": "SJlHwkBYDH", "id": "SJlHwkBYDH"}, "rke7geHtwH": {"content": {"appendix": "", "TL;DR": "We develop a method for stable offline reinforcement learning from logged data. 
The key is to regularize the RL policy towards a learned \"advantage weighted\" model of the data.", "keywords": ["continuous control", "off policy", "reinforcement learning"], "paperhash": "siegel|keep_doing_what_worked_behavior_modelling_priors_for_offline_reinforcement_learning", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Noah Siegel", "Jost Tobias Springenberg", "Felix Berkenkamp", "Abbas Abdolmaleki", "Michael Neunert", "Thomas Lampe", "Roland Hafner", "Nicolas Heess", "Martin Riedmiller"], "_bibtex": "@inproceedings{\nSiegel2020Keep,\ntitle={Keep Doing What Worked: Behavior Modelling Priors for Offline Reinforcement Learning},\nauthor={Noah Siegel and Jost Tobias Springenberg and Felix Berkenkamp and Abbas Abdolmaleki and Michael Neunert and Thomas Lampe and Roland Hafner and Nicolas Heess and Martin Riedmiller},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=rke7geHtwH}\n}", "authorids": ["siegeln@google.com", "springenberg@google.com", "befelix@inf.ethz.ch", "aabdolmaleki@google.com", "neunertm@google.com", "thomaslampe@google.com", "rhafner@google.com", "heess@google.com", "riedmiller@google.com"], "title": "Keep Doing What Worked: Behavior Modelling Priors for Offline Reinforcement Learning", "original_pdf": "/attachment/f863c0327c3971cd1b142e1b92c75145a522a63f.pdf", "pdf": "/pdf/a61fb5ccb3c042ba9d9e0f74dc7b7ee44b8d96f8.pdf", "abstract": "Off-policy reinforcement learning algorithms promise to be applicable in settings where only a fixed data-set (batch) of environment interactions is available and no new experience can be acquired. This property makes these algorithms appealing for real world problems such as robot control. In practice, however, standard off-policy algorithms fail in the batch setting for continuous control. In this paper, we propose a simple solution to this problem. It admits the use of data generated by arbitrary behavior policies and uses a learned prior -- the advantage-weighted behavior model (ABM) -- to bias the RL policy towards actions that have previously been executed and are likely to be successful on the new task. Our method can be seen as an extension of recent work on batch-RL that enables stable learning from conflicting data-sources. We find improvements on competitive baselines in a variety of RL tasks -- including standard continuous control benchmarks and multi-task learning for simulated and real-world robots. 
", "full_presentation_video": ""}, "forum": "rke7geHtwH", "id": "rke7geHtwH"}, "HygpthEtvr": {"content": {"appendix": "", "TL;DR": "We propose a convergent proximal-type stochastic gradient descent algorithm for constrained nonsmooth nonconvex optimization problems", "keywords": ["gradient descent", "learning rate", "momentum", "optimization", "regularization"], "paperhash": "yang|proxsgd_training_structured_neural_networks_under_regularization_and_constraints", "code": "https://github.com/optyang/proxsgd; https://github.com/cc-hpc-itwm/proxsgd", "spotlight_video": "", "authorids": ["yang.yang@itwm.fraunhofer.de", "yaxiong.yuan@uni.lu", "avraam.chatzimichailidis@itwm.fraunhofer.de", "r.j.g.v.sloun@tue.nl", "lei.lei@uni.lu", "symeon.chatzinotas@uni.lu"], "poster": "", "slides": "", "authors": ["Yang Yang", "Yaxiong Yuan", "Avraam Chatzimichailidis", "Ruud JG van Sloun", "Lei Lei", "Symeon Chatzinotas"], "_bibtex": "@inproceedings{\nYang2020ProxSGD:,\ntitle={ProxSGD: Training Structured Neural Networks under Regularization and Constraints},\nauthor={Yang Yang and Yaxiong Yuan and Avraam Chatzimichailidis and Ruud JG van Sloun and Lei Lei and Symeon Chatzinotas},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=HygpthEtvr}\n}", "original_pdf": "/attachment/f783ab183448956c6b6042e8ca42b7ad65217a63.pdf", "title": "ProxSGD: Training Structured Neural Networks under Regularization and Constraints", "pdf": "/pdf/2e4a2c27aad4203712646c19389c185bbe43adc7.pdf", "abstract": "In this paper, we consider the problem of training neural networks (NN). To promote a NN with specific structures, we explicitly take into consideration the nonsmooth regularization (such as L1-norm) and constraints (such as interval constraint). This is formulated as a constrained nonsmooth nonconvex optimization problem, and we propose a convergent proximal-type stochastic gradient descent (Prox-SGD) algorithm. We show that under properly selected learning rates, momentum eventually resembles the unknown real gradient and thus is crucial in analyzing the convergence. We establish that with probability 1, every limit point of the sequence generated by the proposed Prox-SGD is a stationary point. Then the Prox-SGD is tailored to train a sparse neural network and a binary neural network, and the theoretical analysis is also supported by extensive numerical tests.", "full_presentation_video": ""}, "forum": "HygpthEtvr", "id": "HygpthEtvr"}, "SJe5P6EYvS": {"content": {"appendix": "", "TL;DR": "An LSTM extension with state-of-the-art language modelling results.", "keywords": ["generalization", "memory", "nlp", "rnn", "transformer"], "paperhash": "melis|mogrifier_lstm", "spotlight_video": "", "poster": "", "slides": "", "abstract": "Many advances in Natural Language Processing have been based upon more expressive models for how inputs interact with the context in which they occur. Recurrent networks, which have enjoyed a modicum of success, still lack the generalization and systematicity ultimately required for modelling language. In this work, we propose an extension to the venerable Long Short-Term Memory in the form of mutual gating of the current input and the previous output. This mechanism affords the modelling of a richer space of interactions between inputs and their context. Equivalently, our model can be viewed as making the transition function given by the LSTM context-dependent. 
Experiments demonstrate markedly improved generalization on language modelling in the range of 3\u20134 perplexity points on Penn Treebank and Wikitext-2, and 0.01\u20130.05 bpc on four character-based datasets. We establish a new state of the art on all datasets with the exception of Enwik8, where we close a large gap between the LSTM and Transformer models.\n", "_bibtex": "@inproceedings{\nMelis2020Mogrifier,\ntitle={Mogrifier LSTM},\nauthor={G\u00e1bor Melis and Tom\u00e1\u0161 Ko\u010disk\u00fd and Phil Blunsom},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=SJe5P6EYvS}\n}", "authorids": ["melisgl@google.com", "tkocisky@google.com", "pblunsom@google.com"], "title": "Mogrifier LSTM", "authors": ["G\u00e1bor Melis", "Tom\u00e1\u0161 Ko\u010disk\u00fd", "Phil Blunsom"], "original_pdf": "/attachment/f23f0239f57b453d7df6d7c2e128b691c9eb075a.pdf", "pdf": "/pdf/dec59d4add1086f58d1c0626c2f4a095c7e455c7.pdf", "full_presentation_video": ""}, "forum": "SJe5P6EYvS", "id": "SJe5P6EYvS"}, "rklnDgHtDS": {"content": {"appendix": "", "keywords": ["catastrophic forgetting", "compositionality", "continual learning", "lifelong learning", "machine translation", "nlp", "sequence modeling"], "paperhash": "li|compositional_language_continual_learning", "code": "https://github.com/yli1/CLCL", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Yuanpeng Li", "Liang Zhao", "Kenneth Church", "Mohamed Elhoseiny"], "_bibtex": "@inproceedings{\nLi2020Compositional,\ntitle={Compositional Language Continual Learning},\nauthor={Yuanpeng Li and Liang Zhao and Kenneth Church and Mohamed Elhoseiny},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=rklnDgHtDS}\n}", "authorids": ["yuanpeng16@gmail.com", "lzhao4ever@gmail.com", "kenneth.ward.church@gmail.com", "mohamed.elhoseiny@gmail.com"], "title": "Compositional Language Continual Learning", "original_pdf": "/attachment/9d0445817a704781fe49bcbb0371b963a7a82ab3.pdf", "pdf": "/pdf/4e2192bb77fa7bcaebfbef4e750c6f30c5ae38f0.pdf", "abstract": "Motivated by the human's ability to continually learn and gain knowledge over time, several research efforts have been pushing the limits of machines to constantly learn while alleviating catastrophic forgetting. Most of the existing methods have been focusing on continual learning of label prediction tasks, which have fixed input and output sizes. In this paper, we propose a new scenario of continual learning which handles sequence-to-sequence tasks common in language learning. We further propose an approach to use label prediction continual learning algorithm for sequence-to-sequence continual learning by leveraging compositionality. Experimental results show that the proposed method has significant improvement over state-of-the-art methods. It enables knowledge transfer and prevents catastrophic forgetting, resulting in more than 85% accuracy up to 100 stages, compared with less than 50% accuracy for baselines in instruction learning task. It also shows significant improvement in machine translation task. 
This is the first work to combine continual learning and compositionality for language learning, and we hope this work will make machines more helpful in various tasks.", "full_presentation_video": ""}, "forum": "rklnDgHtDS", "id": "rklnDgHtDS"}, "rkeu30EtvS": {"content": {"appendix": "", "TL;DR": "We propose a method called network deconvolution that resembles animal vision system to train convolution networks better.", "keywords": ["batch normalization", "cnn", "imagenet"], "paperhash": "ye|network_deconvolution", "code": "https://github.com/yechengxi/deconvolution", "spotlight_video": "", "authorids": ["yechengxi@gmail.com", "mevanusa@umd.edu", "huah@umd.edu", "amitrokh@umd.edu", "tomg@cs.umd.edu", "yorke@umd.edu", "fer@umiacs.umd.edu", "yiannis@cs.umd.edu"], "poster": "", "slides": "", "authors": ["Chengxi Ye", "Matthew Evanusa", "Hua He", "Anton Mitrokhin", "Tom Goldstein", "James A. Yorke", "Cornelia Fermuller", "Yiannis Aloimonos"], "_bibtex": "@inproceedings{\nYe2020Network,\ntitle={Network Deconvolution},\nauthor={Chengxi Ye and Matthew Evanusa and Hua He and Anton Mitrokhin and Tom Goldstein and James A. Yorke and Cornelia Fermuller and Yiannis Aloimonos},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=rkeu30EtvS}\n}", "original_pdf": "/attachment/375fdffad1a9a5f79fd2cd0174c1dd7016607793.pdf", "title": "Network Deconvolution", "pdf": "/pdf/27bd01ec15ec4358deecb8afd123a91a547c7ca4.pdf", "abstract": "Convolution is a central operation in Convolutional Neural Networks (CNNs), which applies a kernel to overlapping regions shifted across the image. However, because of the strong correlations in real-world image data, convolutional kernels are in effect re-learning redundant data. In this work, we show that this redundancy has made neural network training challenging, and propose network deconvolution, a procedure which optimally removes pixel-wise and channel-wise correlations before the data is fed into each layer. Network deconvolution can be efficiently calculated at a fraction of the computational cost of a convolution layer. We also show that the deconvolution filters in the first layer of the network resemble the center-surround structure found in biological neurons in the visual regions of the brain. Filtering with such kernels results in a sparse representation, a desired property that has been missing in the training of neural networks. Learning from the sparse representation promotes faster convergence and superior results without the use of batch normalization. We apply our network deconvolution operation to 10 modern neural network models by replacing batch normalization within each. 
Extensive experiments show that the network deconvolution operation is able to deliver performance improvement in all cases on the CIFAR-10, CIFAR-100, MNIST, Fashion-MNIST, Cityscapes, and ImageNet datasets.", "full_presentation_video": ""}, "forum": "rkeu30EtvS", "id": "rkeu30EtvS"}, "HylpqA4FwS": {"content": {"appendix": "", "TL;DR": "Incremental-RNNs resolves exploding/vanishing gradient problem by updating state vectors based on difference between previous state and that predicted by an ODE.", "keywords": ["rnn"], "paperhash": "kag|rnns_incrementally_evolving_on_an_equilibrium_manifold_a_panacea_for_vanishing_and_exploding_gradients", "spotlight_video": "", "poster": "", "slides": "", "abstract": "Recurrent neural networks (RNNs) are particularly well-suited for modeling long-term dependencies in sequential data, but are notoriously hard to train because the error backpropagated in time either vanishes or explodes at an exponential rate. While a number of works attempt to mitigate this effect through gated recurrent units, skip-connections, parametric constraints and design choices, we propose a novel incremental RNN (iRNN), where hidden state vectors keep track of incremental changes, and as such approximate state-vector increments of Rosenblatt's (1962) continuous-time RNNs. iRNN exhibits identity gradients and is able to account for long-term dependencies (LTD). We show that our method is computationally efficient overcoming overheads of many existing methods that attempt to improve RNN training, while suffering no performance degradation. We demonstrate the utility of our approach with extensive experiments and show competitive performance against standard LSTMs on LTD and other non-LTD tasks.\n", "_bibtex": "@inproceedings{\nKag2020RNNs,\ntitle={RNNs Incrementally Evolving on an Equilibrium Manifold: A Panacea for Vanishing and Exploding Gradients?},\nauthor={Anil Kag and Ziming Zhang and Venkatesh Saligrama},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=HylpqA4FwS}\n}", "authorids": ["anilkag@bu.edu", "zzhang@merl.com", "srv@bu.edu"], "title": "RNNs Incrementally Evolving on an Equilibrium Manifold: A Panacea for Vanishing and Exploding Gradients?", "authors": ["Anil Kag", "Ziming Zhang", "Venkatesh Saligrama"], "original_pdf": "/attachment/969eb2353a659999ebe45e8cf7ff52614661c926.pdf", "pdf": "/pdf/34194213dd8d50a2f1eef0ece3c53bfdefd05845.pdf", "full_presentation_video": ""}, "forum": "HylpqA4FwS", "id": "HylpqA4FwS"}, "SJxWS64FwH": {"content": {"appendix": "", "TL;DR": "A scattering transform followed by supervised dictionary learning reaches a higher accuracy than AlexNet on ImageNet.", "keywords": ["dictionary learning", "imagenet", "representation learning", "sparse coding"], "paperhash": "zarka|deep_network_classification_by_scattering_and_homotopy_dictionary_learning", "spotlight_video": "", "poster": "", "slides": "", "authors": ["John Zarka", "Louis Thiry", "Tomas Angles", "Stephane Mallat"], "_bibtex": "@inproceedings{\nZarka2020Deep,\ntitle={Deep Network Classification by Scattering and Homotopy Dictionary Learning},\nauthor={John Zarka and Louis Thiry and Tomas Angles and Stephane Mallat},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=SJxWS64FwH}\n}", "authorids": ["john.zarka@ens.fr", "louis.thiry@ens.fr", "tomas.angles@ens.fr", "stephane.mallat@ens.fr"], "title": "Deep Network Classification by Scattering and Homotopy 
Dictionary Learning", "original_pdf": "/attachment/29c8b04d248189e8466780ebd914005d9beb4aa5.pdf", "pdf": "/pdf/e6235a059a0929ea35206c487924a1f355bb8c7d.pdf", "abstract": "We introduce a sparse scattering deep convolutional neural network, which provides a simple model to analyze properties of deep representation learning for classification. Learning a single dictionary matrix with a classifier yields a higher classification accuracy than AlexNet over the ImageNet 2012 dataset. The network first applies a scattering transform that linearizes variabilities due to geometric transformations such as translations and small deformations.\nA sparse $\\ell^1$ dictionary coding reduces intra-class variability while preserving class separation through projections over unions of linear spaces. It is implemented in a deep convolutional network with a homotopy algorithm having an exponential convergence. A convergence proof is given in a general framework that includes ALISTA. Classification results are analyzed on ImageNet.", "full_presentation_video": ""}, "forum": "SJxWS64FwH", "id": "SJxWS64FwH"}, "Hye_V0NKwr": {"content": {"appendix": "", "TL;DR": "An analysis of the effects of compositionality and locality on representation learning for zero-shot learning.", "keywords": ["compositionality", "generalization", "imagenet", "representation learning", "zero shot learning"], "paperhash": "sylvain|locality_and_compositionality_in_zeroshot_learning", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Tristan Sylvain", "Linda Petrini", "Devon Hjelm"], "_bibtex": "@inproceedings{\nsylvain2020locality,\ntitle={Locality and Compositionality in Zero-Shot Learning},\nauthor={Tristan Sylvain and Linda Petrini and Devon Hjelm},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=Hye_V0NKwr}\n}", "authorids": ["tristan.sylvain@gmail.com", "lindapetrini@gmail.com", "devon.hjelm@microsoft.com"], "title": "Locality and Compositionality in Zero-Shot Learning", "original_pdf": "/attachment/d55a6ecb5843fdb65af828aff26c08d11c2d2ce0.pdf", "pdf": "/pdf/ae21cd01627929186c1673ccc2f9b1c9cc1dfc2e.pdf", "abstract": "In this work we study locality and compositionality in the context of learning representations for Zero Shot Learning (ZSL). \nIn order to well-isolate the importance of these properties in learned representations, we impose the additional constraint that, differently from most recent work in ZSL, no pre-training on different datasets (e.g. ImageNet) is performed.\nThe results of our experiment show how locality, in terms of small parts of the input, and compositionality, i.e. how well can the learned representations be expressed as a function of a smaller vocabulary, are both deeply related to generalization and motivate the focus on more local-aware models in future research directions for representation learning.", "full_presentation_video": ""}, "forum": "Hye_V0NKwr", "id": "Hye_V0NKwr"}, "rklB76EKPr": {"content": {"appendix": "", "TL;DR": "Gradient clipping doesn't endow robustness to label noise, but a simple loss-based variant does.", "keywords": ["gradient descent", "robustness"], "paperhash": "menon|can_gradient_clipping_mitigate_label_noise", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Aditya Krishna Menon", "Ankit Singh Rawat", "Sashank J. 
Reddi", "Sanjiv Kumar"], "_bibtex": "@inproceedings{\nMenon2020Can,\ntitle={Can gradient clipping mitigate label noise?},\nauthor={Aditya Krishna Menon and Ankit Singh Rawat and Sashank J. Reddi and Sanjiv Kumar},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=rklB76EKPr}\n}", "authorids": ["adityakmenon@google.com", "ankitsrawat@google.com", "sashank@google.com", "sanjivk@google.com"], "title": "Can gradient clipping mitigate label noise?", "original_pdf": "/attachment/c1a32a20347a7417e2f53e5308e3f0919e235561.pdf", "pdf": "/pdf/1bae93a14da558350cc4541835f41c3c07aeacf1.pdf", "abstract": "Gradient clipping is a widely-used technique in the training of deep networks, and is generally motivated from an optimisation lens: informally, it controls the dynamics of iterates, thus enhancing the rate of convergence to a local minimum. This intuition has been made precise in a line of recent works, which show that suitable clipping can yield significantly faster convergence than vanilla gradient descent. In this paper, we propose a new lens for studying gradient clipping, namely, robustness: informally, one expects clipping to provide robustness to noise, since one does not overly trust any single sample. Surprisingly, we prove that for the common problem of label noise in classification, standard gradient clipping does not in general provide robustness. On the other hand, we show that a simple variant of gradient clipping is provably robust, and corresponds to suitably modifying the underlying loss function. This yields a simple, noise-robust alternative to the standard cross-entropy loss which performs well empirically.", "full_presentation_video": ""}, "forum": "rklB76EKPr", "id": "rklB76EKPr"}, "Byg1v1HKDB": {"content": {"appendix": "", "keywords": ["generation", "natural language generation", "natural language inference", "nlp", "question answering", "reasoning"], "paperhash": "bhagavatula|abductive_commonsense_reasoning", "spotlight_video": "", "poster": "", "slides": "", "abstract": "Abductive reasoning is inference to the most plausible explanation. For example, if Jenny finds her house in a mess when she returns from work, and remembers that she left a window open, she can hypothesize that a thief broke into her house and caused the mess, as the most plausible explanation. While abduction has long been considered to be at the core of how people interpret and read between the lines in natural language (Hobbs et al., 1988), there has been relatively little research in support of abductive natural language inference and generation. We present the first study that investigates the viability of language-based abductive reasoning. We introduce a challenge dataset, ART, that consists of over 20k commonsense narrative contexts and 200k explanations. Based on this dataset, we conceptualize two new tasks \u2013 (i) Abductive NLI: a multiple-choice question answering task for choosing the more likely explanation, and (ii) Abductive NLG: a conditional generation task for explaining given observations in natural language. On Abductive NLI, the best model achieves 68.9% accuracy, well below human performance of 91.4%. On Abductive NLG, the current best language generators struggle even more, as they lack reasoning capabilities that are trivial for humans. 
Our analysis leads to new insights into the types of reasoning that deep pre-trained language models fail to perform\u2014despite their strong performance on the related but more narrowly defined task of entailment NLI\u2014pointing to interesting avenues for future research.", "_bibtex": "@inproceedings{\nBhagavatula2020Abductive,\ntitle={Abductive Commonsense Reasoning},\nauthor={Chandra Bhagavatula and Ronan Le Bras and Chaitanya Malaviya and Keisuke Sakaguchi and Ari Holtzman and Hannah Rashkin and Doug Downey and Wen-tau Yih and Yejin Choi},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=Byg1v1HKDB}\n}", "authorids": ["chandrab@allenai.org", "ronanlb@allenai.org", "chaitanyam@allenai.org", "keisukes@allenai.org", "arih@allenai.org", "hrashkin@uw.edu", "dougd@allenai.org", "scottyih@fb.com", "yejinc@allenai.org"], "title": "Abductive Commonsense Reasoning", "authors": ["Chandra Bhagavatula", "Ronan Le Bras", "Chaitanya Malaviya", "Keisuke Sakaguchi", "Ari Holtzman", "Hannah Rashkin", "Doug Downey", "Wen-tau Yih", "Yejin Choi"], "original_pdf": "/attachment/183c6ed66f0f87bfb8e7c632c47965c81588a687.pdf", "pdf": "/pdf/48fb3337a97a1f5f932d55dcebbbadf13ff197e8.pdf", "full_presentation_video": ""}, "forum": "Byg1v1HKDB", "id": "Byg1v1HKDB"}, "r1gRTCVFvB": {"content": {"appendix": "", "keywords": ["imagenet", "memory", "representation learning", "transfer learning"], "paperhash": "kang|decoupling_representation_and_classifier_for_longtailed_recognition", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Bingyi Kang", "Saining Xie", "Marcus Rohrbach", "Zhicheng Yan", "Albert Gordo", "Jiashi Feng", "Yannis Kalantidis"], "_bibtex": "@inproceedings{\nKang2020Decoupling,\ntitle={Decoupling Representation and Classifier for Long-Tailed Recognition},\nauthor={Bingyi Kang and Saining Xie and Marcus Rohrbach and Zhicheng Yan and Albert Gordo and Jiashi Feng and Yannis Kalantidis},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=r1gRTCVFvB}\n}", "authorids": ["kang@u.nus.edu", "xiesaining@gmail.com", "maroffm@gmail.com", "zhicheng.yan@live.com", "albert.gordo.s@gmail.com", "elefjia@nus.edu.sg", "ykalant@image.ntua.gr"], "title": "Decoupling Representation and Classifier for Long-Tailed Recognition", "original_pdf": "/attachment/be842a1f63c29829db33c951bba4fa6089f31ac6.pdf", "pdf": "/pdf/2be5582a3de2661961b882fb82da2c1937c67162.pdf", "abstract": "The long-tail distribution of the visual world poses great challenges for deep learning based classification models on how to handle the class imbalance problem. Existing solutions usually involve class-balancing strategies, e.g., by loss re-weighting, data re-sampling, or transfer learning from head- to tail-classes, but most of them adhere to the scheme of jointly learning representations and classifiers. In this work, we decouple the learning procedure into representation learning and classification, and systematically explore how different balancing strategies affect them for long-tailed recognition. The findings are surprising: (1) data imbalance might not be an issue in learning high-quality representations; (2) with representations learned with the simplest instance-balanced (natural) sampling, it is also possible to achieve strong long-tailed recognition ability by adjusting only the classifier. 
We conduct extensive experiments and set new state-of-the-art performance on common long-tailed benchmarks like ImageNet-LT, Places-LT and iNaturalist, showing that it is possible to outperform carefully designed losses, sampling strategies, even complex modules with memory, by using a straightforward approach that decouples representation and classification. Our code is available at https://github.com/facebookresearch/classifier-balancing.", "full_presentation_video": ""}, "forum": "r1gRTCVFvB", "id": "r1gRTCVFvB"}, "BylA_C4tPr": {"content": {"appendix": "", "TL;DR": "A Composition-based Graph Convolutional framework for multi-relational graphs.", "keywords": ["cnn", "graph embedding", "graph networks", "knowledge graph embeddings"], "paperhash": "vashishth|compositionbased_multirelational_graph_convolutional_networks", "code": "https://github.com/malllabiisc/CompGCN", "spotlight_video": "", "authorids": ["shikhar@iisc.ac.in", "sanyal.soumya8@gmail.com", "vikram.nitin@columbia.edu", "ppt@iisc.ac.in"], "poster": "", "slides": "", "authors": ["Shikhar Vashishth", "Soumya Sanyal", "Vikram Nitin", "Partha Talukdar"], "_bibtex": "@inproceedings{\nVashishth2020Composition-based,\ntitle={Composition-based Multi-Relational Graph Convolutional Networks},\nauthor={Shikhar Vashishth and Soumya Sanyal and Vikram Nitin and Partha Talukdar},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=BylA_C4tPr}\n}", "original_pdf": "/attachment/cfeddb7e2f31bd9b02ca2e777b0c1f8c6f137b07.pdf", "title": "Composition-based Multi-Relational Graph Convolutional Networks", "pdf": "/pdf/bf70ad4afed02db2b8a47eaa28685546b45a47c9.pdf", "abstract": "Graph Convolutional Networks (GCNs) have recently been shown to be quite successful in modeling graph-structured data. However, the primary focus has been on handling simple undirected graphs. Multi-relational graphs are a more general and prevalent form of graphs where each edge has a label and direction associated with it. Most of the existing approaches to handle such graphs suffer from over-parameterization and are restricted to learning representations of nodes only. In this paper, we propose CompGCN, a novel Graph Convolutional framework which jointly embeds both nodes and relations in a relational graph. CompGCN leverages a variety of entity-relation composition operations from Knowledge Graph Embedding techniques and scales with the number of relations. It also generalizes several of the existing multi-relational GCN methods. We evaluate our proposed method on multiple tasks such as node classification, link prediction, and graph classification, and achieve demonstrably superior results. 
We make the source code of CompGCN available to foster reproducible research.", "full_presentation_video": ""}, "forum": "BylA_C4tPr", "id": "BylA_C4tPr"}, "BJgr4kSFDS": {"content": {"appendix": "", "TL;DR": "Answering a wide class of logical queries over knowledge graphs with box embeddings in vector space", "keywords": ["graph embedding", "knowledge graph embeddings", "knowledge graphs", "logical reasoning", "reasoning"], "paperhash": "ren|query2box_reasoning_over_knowledge_graphs_in_vector_space_using_box_embeddings", "code": "https://github.com/hyren/query2box", "spotlight_video": "", "authorids": ["hyren@cs.stanford.edu", "weihuahu@stanford.edu", "jure@cs.stanford.edu"], "poster": "", "slides": "", "authors": ["Hongyu Ren*", "Weihua Hu*", "Jure Leskovec"], "_bibtex": "@inproceedings{\nRen*2020Query2box:,\ntitle={Query2box: Reasoning over Knowledge Graphs in Vector Space Using Box Embeddings},\nauthor={Hongyu Ren* and Weihua Hu* and Jure Leskovec},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=BJgr4kSFDS}\n}", "original_pdf": "/attachment/53afb23bbbf04c3cc738f034d3595515fd6f0860.pdf", "title": "Query2box: Reasoning over Knowledge Graphs in Vector Space Using Box Embeddings", "pdf": "/pdf/fe83d2016118d370eb96ed4781cda179f98f8b64.pdf", "abstract": "Answering complex logical queries on large-scale incomplete knowledge graphs (KGs) is a fundamental yet challenging task. Recently, a promising approach to this problem has been to embed KG entities as well as the query into a vector space such that entities that answer the query are embedded close to the query. However, prior work models queries as single points in the vector space, which is problematic because a complex query represents a potentially large set of its answer entities, but it is unclear how such a set can be represented as a single point. Furthermore, prior work can only handle queries that use conjunctions ($\\wedge$) and existential quantifiers ($\\exists$). Handling queries with logical disjunctions ($\\vee$) remains an open problem. Here we propose query2box, an embedding-based framework for reasoning over arbitrary queries with $\\wedge$, $\\vee$, and $\\exists$ operators in massive and incomplete KGs. Our main insight is that queries can be embedded as boxes (i.e., hyper-rectangles), where a set of points inside the box corresponds to a set of answer entities of the query. We show that conjunctions can be naturally represented as intersections of boxes and also prove a negative result that handling disjunctions would require embedding with dimension proportional to the number of KG entities. However, we show that by transforming queries into a Disjunctive Normal Form, query2box is capable of handling arbitrary logical queries with $\\wedge$, $\\vee$, $\\exists$ in a scalable manner. We demonstrate the effectiveness of query2box on two large KGs and show that query2box achieves up to 25% relative improvement over the state of the art.\n", "full_presentation_video": ""}, "forum": "BJgr4kSFDS", "id": "BJgr4kSFDS"}, "S1l8oANFDH": {"content": {"appendix": "", "TL;DR": "An approach to learn program policies for control tasks that inductively generalize. 
", "keywords": ["generalization", "imitation learning", "program synthesis", "reinforcement learning"], "paperhash": "inala|synthesizing_programmatic_policies_that_inductively_generalize", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Jeevana Priya Inala", "Osbert Bastani", "Zenna Tavares", "Armando Solar-Lezama"], "_bibtex": "@inproceedings{\nInala2020Synthesizing,\ntitle={Synthesizing Programmatic Policies that Inductively Generalize},\nauthor={Jeevana Priya Inala and Osbert Bastani and Zenna Tavares and Armando Solar-Lezama},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=S1l8oANFDH}\n}", "authorids": ["jinala@csail.mit.edu", "obastani@seas.upenn.edu", "zenna@mit.edu", "asolar@csail.mit.edu"], "title": "Synthesizing Programmatic Policies that Inductively Generalize", "original_pdf": "/attachment/0745d2c4f4adb77adc5fd66dd62f89bc5f1643c5.pdf", "pdf": "/pdf/098084d58f170afbfefc335ad4361a364376ed4b.pdf", "abstract": "Deep reinforcement learning has successfully solved a number of challenging control tasks. However, learned policies typically have difficulty generalizing to novel environments. We propose an algorithm for learning programmatic state machine policies that can capture repeating behaviors. By doing so, they have the ability to generalize to instances requiring an arbitrary number of repetitions, a property we call inductive generalization. However, state machine policies are hard to learn since they consist of a combination of continuous and discrete structures. We propose a learning framework called adaptive teaching, which learns a state machine policy by imitating a teacher; in contrast to traditional imitation learning, our teacher adaptively updates itself based on the structure of the student. We show that our algorithm can be used to learn policies that inductively generalize to novel environments, whereas traditional neural network policies fail to do so. ", "full_presentation_video": ""}, "forum": "S1l8oANFDH", "id": "S1l8oANFDH"}, "SJlsFpVtDB": {"content": {"appendix": "", "TL;DR": "This work addresses continual learning for non-stationary data, using Bayesian neural networks and memory-based online variational Bayes.", "keywords": ["bayesian neural networks", "continual learning", "episodic memory", "lifelong learning", "memory", "variational inference"], "paperhash": "kurle|continual_learning_with_bayesian_neural_networks_for_nonstationary_data", "spotlight_video": "", "poster": "", "slides": "", "abstract": "This work addresses continual learning for non-stationary data, using Bayesian neural networks and memory-based online variational Bayes. We represent the posterior approximation of the network weights by a diagonal Gaussian distribution and a complementary memory of raw data. This raw data corresponds to likelihood terms that cannot be well approximated by the Gaussian. We introduce a novel method for sequentially updating both components of the posterior approximation. Furthermore, we propose Bayesian forgetting and a Gaussian diffusion process for adapting to non-stationary data. The experimental results show that our update method improves on existing approaches for streaming data. Additionally, the adaptation methods lead to better predictive performance for non-stationary data. 
", "_bibtex": "@inproceedings{\nKurle2020Continual,\ntitle={Continual Learning with Bayesian Neural Networks for Non-Stationary Data},\nauthor={Richard Kurle and Botond Cseke and Alexej Klushyn and Patrick van der Smagt and Stephan G\u00fcnnemann},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=SJlsFpVtDB}\n}", "authorids": ["richard.kurle@tum.de", "botond.cseke@argmax.ai", "a.klushyn@tum.de", "smagt@argmax.ai", "guennemann@in.tum.de"], "title": "Continual Learning with Bayesian Neural Networks for Non-Stationary Data", "authors": ["Richard Kurle", "Botond Cseke", "Alexej Klushyn", "Patrick van der Smagt", "Stephan G\u00fcnnemann"], "original_pdf": "/attachment/ca360cecdcdb485f4ea655e00feaddee54574de3.pdf", "pdf": "/pdf/0fdd74c6a01317f2e8eaece257838f798a2776e0.pdf", "full_presentation_video": ""}, "forum": "SJlsFpVtDB", "id": "SJlsFpVtDB"}, "rkecl1rtwB": {"content": {"appendix": "", "TL;DR": "We proposed a normalization layer for GNN models to solve the oversmoothing problem.", "keywords": ["graph networks", "normalization"], "paperhash": "zhao|pairnorm_tackling_oversmoothing_in_gnns", "code": "https://github.com/LingxiaoShawn/PairNorm", "spotlight_video": "", "authorids": ["lingxiao@cmu.edu", "lakoglu@andrew.cmu.edu"], "poster": "", "slides": "", "authors": ["Lingxiao Zhao", "Leman Akoglu"], "_bibtex": "@inproceedings{\nZhao2020PairNorm:,\ntitle={PairNorm: Tackling Oversmoothing in GNNs},\nauthor={Lingxiao Zhao and Leman Akoglu},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=rkecl1rtwB}\n}", "original_pdf": "/attachment/fd5ca669512acda4d49eb02d8ea702f9bd4c513b.pdf", "title": "PairNorm: Tackling Oversmoothing in GNNs", "pdf": "/pdf/a535150110cc7304a239d96432f605554c435f29.pdf", "abstract": "The performance of graph neural nets (GNNs) is known to gradually decrease with increasing number of layers. This decay is partly attributed to oversmoothing, where repeated graph convolutions eventually make node embeddings indistinguishable. We take a closer look at two different interpretations, aiming to quantify oversmoothing. Our main contribution is PairNorm, a novel normalization layer that is based on a careful analysis of the graph convolution operator, which prevents all node embeddings from becoming too similar. What is more, PairNorm is fast, easy to implement without any change to network architecture nor any additional parameters, and is broadly applicable to any GNN. Experiments on real-world graphs demonstrate that PairNorm makes deeper GCN, GAT, and SGC models more robust against oversmoothing, and significantly boosts performance for a new problem setting that benefits from deeper GNNs. Code is available at https://github.com/LingxiaoShawn/PairNorm.", "full_presentation_video": ""}, "forum": "rkecl1rtwB", "id": "rkecl1rtwB"}, "B1l4SgHKDH": {"content": {"appendix": "", "TL;DR": "We show that Energy-Based models when trained on the residual of an auto-regressive language model can be used effectively and efficiently to generate text. 
", "keywords": ["energy based models", "generation", "importance sampling", "language modeling", "machine translation", "nlp", "text generation"], "paperhash": "deng|residual_energybased_models_for_text_generation", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Yuntian Deng", "Anton Bakhtin", "Myle Ott", "Arthur Szlam", "Marc'Aurelio Ranzato"], "_bibtex": "@inproceedings{\nDeng2020Residual,\ntitle={Residual Energy-Based Models for Text Generation},\nauthor={Yuntian Deng and Anton Bakhtin and Myle Ott and Arthur Szlam and Marc'Aurelio Ranzato},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=B1l4SgHKDH}\n}", "authorids": ["dengyuntian@seas.harvard.edu", "yolo@fb.com", "aszlam@fb.com", "ranzato@fb.com"], "title": "Residual Energy-Based Models for Text Generation", "original_pdf": "/attachment/06c202af4b0bd1738dea7afbb100bbb3cbe14e06.pdf", "pdf": "/pdf/86d8335d8d945dcfcddd7c5e931d0d6096ec32ee.pdf", "abstract": "Text generation is ubiquitous in many NLP tasks, from summarization, to dialogue and machine translation. The dominant parametric approach is based on locally normalized models which predict one word at a time. While these work remarkably well, they are plagued by exposure bias due to the greedy nature of the generation process. In this work, we investigate un-normalized energy-based models (EBMs) which operate not at the token but at the sequence level. In order to make training tractable, we first work in the residual of a pretrained locally normalized language model and second we train using noise contrastive estimation. Furthermore, since the EBM works at the sequence level, we can leverage pretrained bi-directional contextual representations, such as BERT and RoBERTa. Our experiments on two large language modeling datasets show that residual EBMs yield lower perplexity compared to locally normalized baselines. Moreover, generation via importance sampling is very efficient and of higher quality than the baseline models according to human evaluation.", "full_presentation_video": ""}, "forum": "B1l4SgHKDH", "id": "B1l4SgHKDH"}, "B1gX8kBtPr": {"content": {"appendix": "", "TL;DR": "We prove that for a large class of functions f there exists an interval certified robust network approximating f up to arbitrary precision.", "keywords": ["adversarial", "adversarial attacks", "interval bound propagation", "relu networks", "robustness", "universal approximation"], "paperhash": "baader|universal_approximation_with_certified_networks", "code": "https://github.com/eth-sri/UniversalCertificationTheory", "spotlight_video": "", "authorids": ["mbaader@inf.ethz.ch", "matthew.mirman@inf.ethz.ch", "martin.vechev@inf.ethz.ch"], "poster": "", "slides": "", "authors": ["Maximilian Baader", "Matthew Mirman", "Martin Vechev"], "_bibtex": "@inproceedings{\nBaader2020Universal,\ntitle={Universal Approximation with Certified Networks},\nauthor={Maximilian Baader and Matthew Mirman and Martin Vechev},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=B1gX8kBtPr}\n}", "original_pdf": "/attachment/3c090e92685f5aa6424c60635ad34704bb5f54c9.pdf", "title": "Universal Approximation with Certified Networks", "pdf": "/pdf/478fda14f0b96fc087736e95a0678671065df1bc.pdf", "abstract": "Training neural networks to be certifiably robust is critical to ensure their safety against adversarial attacks. 
However, it is currently very difficult to train a neural network that is both accurate and certifiably robust. In this work we take a step towards addressing this challenge. We prove that for every continuous function $f$, there exists a network $n$ such that:\n(i) $n$ approximates $f$ arbitrarily close, and (ii) simple interval bound propagation of a region $B$ through $n$ yields a result that is arbitrarily close to the optimal output of $f$ on $B$. Our result can be seen as a Universal Approximation Theorem for interval-certified ReLU networks. To the best of our knowledge, this is the first work to prove the existence of accurate, interval-certified networks.", "full_presentation_video": ""}, "forum": "B1gX8kBtPr", "id": "B1gX8kBtPr"}, "rke3TJrtPS": {"content": {"appendix": "", "TL;DR": "We propose a new algorithm that learns constraint-satisfying policies, and provide theoretical analysis and empirical demonstration in the context of reinforcement learning with constraints.", "keywords": ["fairness", "optimization", "reinforcement learning"], "paperhash": "yang|projectionbased_constrained_policy_optimization", "code": "https://sites.google.com/view/iclr2020-pcpo", "spotlight_video": "", "authorids": ["ty3@princeton.edu", "justinian.rosca@siemens.com", "karthikn@cs.princeton.edu", "ramadge@princeton.edu"], "poster": "", "slides": "", "authors": ["Tsung-Yen Yang", "Justinian Rosca", "Karthik Narasimhan", "Peter J. Ramadge"], "_bibtex": "@inproceedings{\nYang2020Projection-Based,\ntitle={Projection-Based Constrained Policy Optimization},\nauthor={Tsung-Yen Yang and Justinian Rosca and Karthik Narasimhan and Peter J. Ramadge},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=rke3TJrtPS}\n}", "original_pdf": "/attachment/cdf10e9612feb89be0f318b8342c12a0e62d6757.pdf", "title": "Projection-Based Constrained Policy Optimization", "pdf": "/pdf/02b7eaafb9b63b62aa428d07c9053b52134fff5e.pdf", "abstract": "We consider the problem of learning control policies that optimize a reward function while satisfying constraints due to considerations of safety, fairness, or other costs. We propose a new algorithm - Projection-Based Constrained Policy Optimization (PCPO), an iterative method for optimizing policies in a two-step process - the first step performs an unconstrained update while the second step reconciles the constraint violation by projecting the policy back onto the constraint set. We theoretically analyze PCPO and provide a lower bound on reward improvement, as well as an upper bound on constraint violation for each policy update. We further characterize the convergence of PCPO with projection based on two different metrics - L2 norm and Kullback-Leibler divergence. 
Our empirical results over several control tasks demonstrate that our algorithm achieves superior performance, averaging more than 3.5 times less constraint violation and around 15% higher reward compared to state-of-the-art methods.", "full_presentation_video": ""}, "forum": "rke3TJrtPS", "id": "rke3TJrtPS"}, "HJgcvJBFvB": {"content": {"appendix": "", "TL;DR": "We propose a simple randomization technique for improving generalization in deep reinforcement learning across tasks with various unseen visual patterns.", "keywords": ["generalization", "regularization", "reinforcement learning", "robotics"], "paperhash": "lee|network_randomization_a_simple_technique_for_generalization_in_deep_reinforcement_learning", "code": "https://github.com/pokaxpoka/netrand", "spotlight_video": "", "authorids": ["kiminlee@kaist.ac.kr", "kibok@umich.edu", "jinwoos@kaist.ac.kr", "honglak@eecs.umich.edu"], "poster": "", "slides": "", "authors": ["Kimin Lee", "Kibok Lee", "Jinwoo Shin", "Honglak Lee"], "_bibtex": "@inproceedings{\nLee2020Network,\ntitle={Network Randomization: A Simple Technique for Generalization in Deep Reinforcement Learning},\nauthor={Kimin Lee and Kibok Lee and Jinwoo Shin and Honglak Lee},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=HJgcvJBFvB}\n}", "original_pdf": "/attachment/e89b2046777f937f875939216edb24ce9efc1111.pdf", "title": "Network Randomization: A Simple Technique for Generalization in Deep Reinforcement Learning", "pdf": "/pdf/e6b7edd03b697541e4457dfbba7ed1e361cc329d.pdf", "abstract": "Deep reinforcement learning (RL) agents often fail to generalize to unseen environments (yet semantically similar to trained agents), particularly when they are trained on high-dimensional state spaces, such as images. In this paper, we propose a simple technique to improve a generalization ability of deep RL agents by introducing a randomized (convolutional) neural network that randomly perturbs input observations. It enables trained agents to adapt to new domains by learning robust features invariant across varied and randomized environments. Furthermore, we consider an inference method based on the Monte Carlo approximation to reduce the variance induced by this randomization. 
We demonstrate the superiority of our method across 2D CoinRun, 3D DeepMind Lab exploration and 3D robotics control tasks: it significantly outperforms various regularization and data augmentation methods for the same purpose.", "full_presentation_video": ""}, "forum": "HJgcvJBFvB", "id": "HJgcvJBFvB"}, "H1lK_lBtvS": {"content": {"appendix": "", "TL;DR": "Anomaly detection method that uses: openset techniques for better generalization, random-transformation classification for non-image data.", "keywords": ["anomaly detection", "generalization"], "paperhash": "bergman|classificationbased_anomaly_detection_for_general_data", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Liron Bergman", "Yedid Hoshen"], "_bibtex": "@inproceedings{\nBergman2020Classification-Based,\ntitle={Classification-Based Anomaly Detection for General Data},\nauthor={Liron Bergman and Yedid Hoshen},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=H1lK_lBtvS}\n}", "authorids": ["liron.bergman@mail.huji.ac.il", "yedid@cs.huji.ac.il"], "title": "Classification-Based Anomaly Detection for General Data", "original_pdf": "/attachment/cec379b3b1744b2c2d99353999187f79807fa9f5.pdf", "pdf": "/pdf/0cc1c7474fb7a5d646d371f84aef56b3e920caa7.pdf", "abstract": "Anomaly detection, finding patterns that substantially deviate from those seen previously, is one of the fundamental problems of artificial intelligence. Recently, classification-based methods were shown to achieve superior results on this task. In this work, we present a unifying view and propose an open-set method, GOAD, to relax current generalization assumptions. Furthermore, we extend the applicability of transformation-based methods to non-image data using random affine transformations. Our method is shown to obtain state-of-the-art accuracy and is applicable to broad data types. The strong performance of our method is extensively validated on multiple datasets from different domains. 
", "full_presentation_video": ""}, "forum": "H1lK_lBtvS", "id": "H1lK_lBtvS"}, "Skgxcn4YDS": {"content": {"appendix": "", "TL;DR": "Language modeling for lifelong language learning.", "keywords": ["capacity", "catastrophic forgetting", "language modeling", "lifelong learning", "memory", "nlp"], "paperhash": "sun|lamol_language_modeling_for_lifelong_language_learning", "code": "https://github.com/jojotenya/LAMOL", "spotlight_video": "", "authorids": ["fankeng@mit.edu", "jojotenya@gmail.com", "hungyilee@ntu.edu.tw"], "poster": "", "slides": "", "authors": ["Fan-Keng Sun*", "Cheng-Hao Ho*", "Hung-Yi Lee"], "_bibtex": "@inproceedings{\nsun2020lamal,\ntitle={{\\{}LAMAL{\\}}: {\\{}LA{\\}}nguage Modeling Is All You Need for Lifelong Language Learning},\nauthor={Fan-Keng Sun and Cheng-Hao Ho and Hung-Yi Lee},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=Skgxcn4YDS}\n}", "original_pdf": "/attachment/5cc6f2730274eb18aec8de902bc6eee8106c6595.pdf", "title": "LAMOL: LAnguage MOdeling for Lifelong Language Learning", "pdf": "/pdf/d2073f654646aa2c3b2cf36d5c3201cab5dbc7bf.pdf", "abstract": "Most research on lifelong learning applies to images or games, but not language.\nWe present LAMOL, a simple yet effective method for lifelong language learning (LLL) based on language modeling.\nLAMOL replays pseudo-samples of previous tasks while requiring no extra memory or model capacity.\nSpecifically, LAMOL is a language model that simultaneously learns to solve the tasks and generate training samples.\nWhen the model is trained for a new task, it generates pseudo-samples of previous tasks for training alongside data for the new task.\nThe results show that LAMOL prevents catastrophic forgetting without any sign of intransigence and can perform five very different language tasks sequentially with only one model. \nOverall, LAMOL outperforms previous methods by a considerable margin and is only 2-3% worse than multitasking, which is usually considered the LLL upper bound.\nThe source code is available at https://github.com/jojotenya/LAMOL.", "full_presentation_video": ""}, "forum": "Skgxcn4YDS", "id": "Skgxcn4YDS"}, "BJgQ4lSFPH": {"content": {"appendix": "", "keywords": ["attention", "nlp", "question answering"], "paperhash": "wang|structbert_incorporating_language_structures_into_pretraining_for_deep_language_understanding", "spotlight_video": "", "poster": "", "slides": "", "abstract": "Recently, the pre-trained language model, BERT (and its robustly optimized version RoBERTa), has attracted a lot of attention in natural language understanding (NLU), and achieved state-of-the-art accuracy in various NLU tasks, such as sentiment classification, natural language inference, semantic textual similarity and question answering. Inspired by the linearization exploration work of Elman, we extend BERT to a new model, StructBERT, by incorporating language structures into pre-training. Specifically, we pre-train StructBERT with two auxiliary tasks to make the most of the sequential order of words and sentences, which leverage language structures at the word and sentence levels, respectively. 
As a result, the new model is adapted to different levels of language understanding required by downstream tasks.\n\nThe StructBERT with structural pre-training gives surprisingly good empirical results on a variety of downstream tasks, including pushing the state-of-the-art on the GLUE benchmark to 89.0 (outperforming all published models at the time of model submission), the F1 score on SQuAD v1.1 question answering to 93.0, the accuracy on SNLI to 91.7.", "_bibtex": "@inproceedings{\nWang2020StructBERT:,\ntitle={StructBERT: Incorporating Language Structures into Pre-training for Deep Language Understanding},\nauthor={Wei Wang and Bin Bi and Ming Yan and Chen Wu and Jiangnan Xia and Zuyi Bao and Liwei Peng and Luo Si},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=BJgQ4lSFPH}\n}", "authorids": ["hebian.ww@alibaba-inc.com", "b.bi@alibaba-inc.com", "ym119608@alibaba-inc.com", "wuchen.wc@alibaba-inc.com", "jiangnan.xjn@alibaba-inc.com", "zuyi.bzy@alibaba-inc.com", "liwei.peng@alibaba-inc.com", "luo.si@alibaba-inc.com"], "title": "StructBERT: Incorporating Language Structures into Pre-training for Deep Language Understanding", "authors": ["Wei Wang", "Bin Bi", "Ming Yan", "Chen Wu", "Jiangnan Xia", "Zuyi Bao", "Liwei Peng", "Luo Si"], "original_pdf": "/attachment/5c636b602c138c3b16b505bc5c06d9b812373538.pdf", "pdf": "/pdf/f735ac06bc2574a936d0b04f0dfb84e8a26078af.pdf", "full_presentation_video": ""}, "forum": "BJgQ4lSFPH", "id": "BJgQ4lSFPH"}, "Syx4wnEtvH": {"content": {"appendix": "", "TL;DR": "A fast optimizer for general applications and large-batch training.", "keywords": ["attention", "distributed", "imagenet", "learning rate", "memory", "optimization"], "paperhash": "you|large_batch_optimization_for_deep_learning_training_bert_in_76_minutes", "code": "https://github.com/tensorflow/addons/blob/master/tensorflow_addons/optimizers/lamb.py", "spotlight_video": "", "authorids": ["youyang@cs.berkeley.edu", "jingli@google.com", "sashank@google.com", "jhseu@google.com", "sanjivk@google.com", "bsrinadh@google.com", "xiaodansong@google.com", "demmel@berkeley.edu", "keutzer@berkeley.edu", "chohsieh@cs.ucla.edu"], "poster": "", "slides": "", "authors": ["Yang You", "Jing Li", "Sashank Reddi", "Jonathan Hseu", "Sanjiv Kumar", "Srinadh Bhojanapalli", "Xiaodan Song", "James Demmel", "Kurt Keutzer", "Cho-Jui Hsieh"], "_bibtex": "@inproceedings{\nYou2020Large,\ntitle={Large Batch Optimization for Deep Learning: Training BERT in 76 minutes},\nauthor={Yang You and Jing Li and Sashank Reddi and Jonathan Hseu and Sanjiv Kumar and Srinadh Bhojanapalli and Xiaodan Song and James Demmel and Kurt Keutzer and Cho-Jui Hsieh},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=Syx4wnEtvH}\n}", "original_pdf": "/attachment/fba406f629ae63d9ffc7183dc68397ea2cb3955c.pdf", "title": "Large Batch Optimization for Deep Learning: Training BERT in 76 minutes", "pdf": "/pdf/857c7f5fd13b287d2aa9ecea6712812c11f75f37.pdf", "abstract": "Training large deep neural networks on massive datasets is computationally very challenging. There has been recent surge in interest in using large batch stochastic optimization methods to tackle this issue. The most prominent algorithm in this line of research is LARS, which by employing layerwise adaptive learning rates trains ResNet on ImageNet in a few minutes. 
However, LARS performs poorly for attention models like BERT, indicating that its performance gains are not consistent across tasks. In this paper, we first study a principled layerwise adaptation strategy to accelerate training of deep neural networks using large mini-batches. Using this strategy, we develop a new layerwise adaptive large batch optimization technique called LAMB; we then provide convergence analysis of LAMB as well as LARS, showing convergence to a stationary point in general nonconvex settings. Our empirical results demonstrate the superior performance of LAMB across various tasks such as BERT and ResNet-50 training with very little hyperparameter tuning. In particular, for BERT training, our optimizer enables use of very large batch sizes of 32868 without any degradation of performance. By increasing the batch size to the memory limit of a TPUv3 Pod, BERT training time can be reduced from 3 days to just 76 minutes.", "full_presentation_video": ""}, "forum": "Syx4wnEtvH", "id": "Syx4wnEtvH"}, "SJgob6NKvH": {"content": {"appendix": "", "TL;DR": "We show language understanding via reading is promising way to learn policies that generalise to new environments.", "keywords": ["nlp", "reading comprehension", "reasoning", "reinforcement learning"], "paperhash": "zhong|rtfm_generalising_to_new_environment_dynamics_via_reading", "spotlight_video": "", "poster": "", "slides": "", "abstract": "Obtaining policies that can generalise to new environments in reinforcement learning is challenging. In this work, we demonstrate that language understanding via a reading policy learner is a promising vehicle for generalisation to new environments. We propose a grounded policy learning problem, Read to Fight Monsters (RTFM), in which the agent must jointly reason over a language goal, relevant dynamics described in a document, and environment observations. We procedurally generate environment dynamics and corresponding language descriptions of the dynamics, such that agents must read to understand new environment dynamics instead of memorising any particular information. In addition, we propose txt2\u03c0, a model that captures three-way interactions between the goal, document, and observations. On RTFM, txt2\u03c0 generalises to new environments with dynamics not seen during training via reading. Furthermore, our model outperforms baselines such as FiLM and language-conditioned CNNs on RTFM. 
Through curriculum learning, txt2\u03c0 produces policies that excel on complex RTFM tasks requiring several reasoning and coreference steps.", "_bibtex": "@inproceedings{\nZhong2020RTFM:,\ntitle={RTFM: Generalising to New Environment Dynamics via Reading},\nauthor={Victor Zhong and Tim Rockt\u00e4schel and Edward Grefenstette},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=SJgob6NKvH}\n}", "authorids": ["victor@victorzhong.com", "tim.rocktaeschel@gmail.com", "egrefen@gmail.com"], "title": "RTFM: Generalising to New Environment Dynamics via Reading", "authors": ["Victor Zhong", "Tim Rockt\u00e4schel", "Edward Grefenstette"], "original_pdf": "/attachment/5f657ebe4e0ed1345106f9b34940ed06b4aac69c.pdf", "pdf": "/pdf/3d8753316d9acf5fccd77e6b1674e7e8b857c7c7.pdf", "full_presentation_video": ""}, "forum": "SJgob6NKvH", "id": "SJgob6NKvH"}, "SJg5J6NtDr": {"content": {"appendix": "", "keywords": ["imitation learning", "meta learning", "reinforcement learning"], "paperhash": "zhou|watch_try_learn_metalearning_from_demonstrations_and_rewards", "code": "https://drive.google.com/open?id=1f1LzO0fe1m-kINY8DTgL6JGimVGiQOuz", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Allan Zhou", "Eric Jang", "Daniel Kappler", "Alex Herzog", "Mohi Khansari", "Paul Wohlhart", "Yunfei Bai", "Mrinal Kalakrishnan", "Sergey Levine", "Chelsea Finn"], "_bibtex": "@inproceedings{\nZhou2020Watch,,\ntitle={Watch, Try, Learn: Meta-Learning from Demonstrations and Rewards},\nauthor={Allan Zhou and Eric Jang and Daniel Kappler and Alex Herzog and Mohi Khansari and Paul Wohlhart and Yunfei Bai and Mrinal Kalakrishnan and Sergey Levine and Chelsea Finn},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=SJg5J6NtDr}\n}", "authorids": ["ayz@stanford.edu", "ejang@google.com", "kappler@google.com", "alexherzog@google.com", "khansari@google.com", "wohlhart@google.com", "yunfeibai@google.com", "kalakris@google.com", "slevine@google.com", "cbfinn@cs.stanford.edu"], "title": "Watch, Try, Learn: Meta-Learning from Demonstrations and Rewards", "original_pdf": "/attachment/b96e1af00f52f18c33d9a8e29af5b32feb9ed813.pdf", "pdf": "/pdf/dbe739d65d07b15df28fc4f40e29d648966e4165.pdf", "abstract": "Imitation learning allows agents to learn complex behaviors from demonstrations. However, learning a complex vision-based task may require an impractical number of demonstrations. Meta-imitation learning is a promising approach towards enabling agents to learn a new task from one or a few demonstrations by leveraging experience from learning similar tasks. In the presence of task ambiguity or unobserved dynamics, demonstrations alone may not provide enough information; an agent must also try the task to successfully infer a policy. In this work, we propose a method that can learn to learn from both demonstrations and trial-and-error experience with sparse reward feedback. In comparison to meta-imitation, this approach enables the agent to effectively and efficiently improve itself autonomously beyond the demonstration data. In comparison to meta-reinforcement learning, we can scale to substantially broader distributions of tasks, as the demonstration reduces the burden of exploration. 
Our experiments show that our method significantly outperforms prior approaches on a set of challenging, vision-based control tasks.", "full_presentation_video": ""}, "forum": "SJg5J6NtDr", "id": "SJg5J6NtDr"}, "Hye1RJHKwB": {"content": {"appendix": "", "TL;DR": "We decompose the discriminator in a GAN in a principled way so that each component can be independently trained on different parts of the input. The resulting \"FactorGAN\" can be used for semi-supervised learning and in missing data scenarios.", "keywords": ["adversarial", "gan", "generation", "image generation", "missing data", "semi supervised learning"], "paperhash": "stoller|training_generative_adversarial_networks_from_incomplete_observations_using_factorised_discriminators", "code": "https://www.dropbox.com/s/gtc7m7pc4n2yt05/source.zip?dl=1", "spotlight_video": "", "authorids": ["d.stoller@qmul.ac.uk", "sewert@spotify.com", "s.e.dixon@qmul.ac.uk"], "poster": "", "slides": "", "authors": ["Daniel Stoller", "Sebastian Ewert", "Simon Dixon"], "_bibtex": "@inproceedings{\nStoller2020Training,\ntitle={Training Generative Adversarial Networks from Incomplete Observations using Factorised Discriminators},\nauthor={Daniel Stoller and Sebastian Ewert and Simon Dixon},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=Hye1RJHKwB}\n}", "original_pdf": "/attachment/858be1a64692ebfcc0ee8131c227274b3ba8b89c.pdf", "title": "Training Generative Adversarial Networks from Incomplete Observations using Factorised Discriminators", "pdf": "/pdf/dd2f617b749f8375bb46bfd434be0338b70e4cd5.pdf", "abstract": "Generative adversarial networks (GANs) have shown great success in applications such as image generation and inpainting.\nHowever, they typically require large datasets, which are often not available, especially in the context of prediction tasks such as image segmentation that require labels. Therefore, methods such as the CycleGAN use more easily available unlabelled data, but do not offer a way to leverage additional labelled data for improved performance. To address this shortcoming, we show how to factorise the joint data distribution into a set of lower-dimensional distributions along with their dependencies. This allows splitting the discriminator in a GAN into multiple \"sub-discriminators\" that can be independently trained from incomplete observations. Their outputs can be combined to estimate the density ratio between the joint real and the generator distribution, which enables training generators as in the original GAN framework. We apply our method to image generation, image segmentation and audio source separation, and obtain improved performance over a standard GAN when additional incomplete training examples are available. 
For the Cityscapes segmentation task in particular, our method also improves accuracy by an absolute 14.9% over CycleGAN while using only 25 additional paired examples.", "full_presentation_video": ""}, "forum": "Hye1RJHKwB", "id": "Hye1RJHKwB"}, "r1lL4a4tDB": {"content": {"appendix": "", "TL;DR": "A deep RL algorithm for solving POMDPs by auto-encoding the underlying states using a variational recurrent model", "keywords": ["continuous control", "reinforcement learning", "rnn", "variational inference"], "paperhash": "han|variational_recurrent_models_for_solving_partially_observable_control_tasks", "code": "https://github.com/oist-cnru/Variational-Recurrent-Models", "spotlight_video": "", "authorids": ["dongqi.han@oist.jp", "doya@oist.jp", "jun.tani@oist.jp"], "poster": "", "slides": "", "authors": ["Dongqi Han", "Kenji Doya", "Jun Tani"], "_bibtex": "@inproceedings{\nhan2020variational,\ntitle={Variational Recurrent Models for Solving Partially Observable Control Tasks},\nauthor={Dongqi Han and Kenji Doya and Jun Tani},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=r1lL4a4tDB}\n}", "original_pdf": "/attachment/092631903d35e7768f1afbc03180ac37733724cb.pdf", "title": "Variational Recurrent Models for Solving Partially Observable Control Tasks", "pdf": "/pdf/008bf8ee5b9f826cd28cccde848cfe092052030e.pdf", "abstract": "In partially observable (PO) environments, deep reinforcement learning (RL) agents often suffer from unsatisfactory performance, since two problems need to be tackled together: how to extract information from the raw observations to solve the task, and how to improve the policy. In this study, we propose an RL algorithm for solving PO tasks. Our method comprises two parts: a variational recurrent model (VRM) for modeling the environment, and an RL controller that has access to both the environment and the VRM. The proposed algorithm was tested in two types of PO robotic control tasks, those in which either coordinates or velocities were not observable and those that require long-term memorization. Our experiments show that the proposed algorithm achieved better data efficiency and/or learned more optimal policy than other alternative approaches in tasks in which unobserved states cannot be inferred from raw observations in a simple manner.", "full_presentation_video": ""}, "forum": "r1lL4a4tDB", "id": "r1lL4a4tDB"}, "SylL0krYPS": {"content": {"appendix": "", "TL;DR": "We study the problem of continuous control agents in deep RL with adversarial attacks and proposed a two-step algorithm based on learned model dynamics. 
", "keywords": ["adversarial", "adversarial attacks", "continuous control", "perturbation", "reinforcement learning", "robustness"], "paperhash": "weng|toward_evaluating_robustness_of_deep_reinforcement_learning_with_continuous_control", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Tsui-Wei Weng", "Krishnamurthy (Dj) Dvijotham*", "Jonathan Uesato*", "Kai Xiao*", "Sven Gowal*", "Robert Stanforth*", "Pushmeet Kohli"], "_bibtex": "@inproceedings{\nWeng2020Toward,\ntitle={Toward Evaluating Robustness of Deep Reinforcement Learning with Continuous Control},\nauthor={Tsui-Wei Weng and Krishnamurthy (Dj) Dvijotham* and Jonathan Uesato* and Kai Xiao* and Sven Gowal* and Robert Stanforth* and Pushmeet Kohli},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=SylL0krYPS}\n}", "authorids": ["twweng@mit.edu", "dvij@google.com", "juesato@google.com", "kaix@mit.edu", "sgowal@google.com", "stanforth@google.com", "pushmeet@google.com"], "title": "Toward Evaluating Robustness of Deep Reinforcement Learning with Continuous Control", "original_pdf": "/attachment/8e5924e5256c7cae23c3b35025b871775b7349dc.pdf", "pdf": "/pdf/e21e669d2db82623943d5305efdb02983e5f1781.pdf", "abstract": "Deep reinforcement learning has achieved great success in many previously difficult reinforcement learning tasks, yet recent studies show that deep RL agents are also unavoidably susceptible to adversarial perturbations, similar to deep neural networks in classification tasks. Prior works mostly focus on model-free adversarial attacks and agents with discrete actions. In this work, we study the problem of continuous control agents in deep RL with adversarial attacks and propose the first two-step algorithm based on learned model dynamics. Extensive experiments on various MuJoCo domains (Cartpole, Fish, Walker, Humanoid) demonstrate that our proposed framework is much more effective and efficient than model-free based attacks baselines in degrading agent performance as well as driving agents to unsafe states. ", "full_presentation_video": ""}, "forum": "SylL0krYPS", "id": "SylL0krYPS"}, "SJeq9JBFvH": {"content": {"appendix": "", "keywords": ["optimization"], "paperhash": "huijben|deep_probabilistic_subsampling_for_taskadaptive_compressed_sensing", "spotlight_video": "", "poster": "", "slides": "", "abstract": "The field of deep learning is commonly concerned with optimizing predictive models using large pre-acquired datasets of densely sampled datapoints or signals. In this work, we demonstrate that the deep learning paradigm can be extended to incorporate a subsampling scheme that is jointly optimized under a desired minimum sample rate. We present Deep Probabilistic Subsampling (DPS), a widely applicable framework for task-adaptive compressed sensing that enables end-to end optimization of an optimal subset of signal samples with a subsequent model that performs a required task. We demonstrate strong performance on reconstruction and classification tasks of a toy dataset, MNIST, and CIFAR10 under stringent subsampling rates in both the pixel and the spatial frequency domain. Due to the task-agnostic nature of the framework, DPS is directly applicable to all real-world domains that benefit from sample rate reduction.", "_bibtex": "@inproceedings{\nHuijben2020Deep,\ntitle={Deep probabilistic subsampling for task-adaptive compressed sensing},\nauthor={Iris A.M. Huijben and Bastiaan S. Veeling and Ruud J.G. 
van Sloun},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=SJeq9JBFvH}\n}", "authorids": ["i.a.m.huijben@tue.nl", "basveeling@gmail.com", "r.j.g.v.sloun@tue.nl"], "title": "Deep probabilistic subsampling for task-adaptive compressed sensing", "authors": ["Iris A.M. Huijben", "Bastiaan S. Veeling", "Ruud J.G. van Sloun"], "original_pdf": "/attachment/f18a417b057bd6a27850e7b4eb3a4af77097ae26.pdf", "pdf": "/pdf/46be97c7ee83cc19aa7e13c83340c04aa4fd5c5a.pdf", "full_presentation_video": ""}, "forum": "SJeq9JBFvH", "id": "SJeq9JBFvH"}, "H1emfT4twB": {"content": {"appendix": "", "TL;DR": "Meta-learning methods used for vision, directly applied to NLP, perform worse than nearest neighbors on new classes; we can do better with distributional signatures.", "keywords": ["attention", "computer vision", "fewshot learning", "meta learning", "text classification"], "paperhash": "bao|fewshot_text_classification_with_distributional_signatures", "code": "https://github.com/YujiaBao/Distributional-Signatures", "spotlight_video": "", "authorids": ["yujia@csail.mit.edu", "rmwu@mit.edu", "shiyu.chang@ibm.com", "regina@csail.mit.edu"], "poster": "", "slides": "", "authors": ["Yujia Bao", "Menghua Wu", "Shiyu Chang", "Regina Barzilay"], "_bibtex": "@inproceedings{\nBao2020Few-shot,\ntitle={Few-shot Text Classification with Distributional Signatures},\nauthor={Yujia Bao and Menghua Wu and Shiyu Chang and Regina Barzilay},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=H1emfT4twB}\n}", "original_pdf": "/attachment/a90b4b7b3d3a5baa79570d60c725c959d431870d.pdf", "title": "Few-shot Text Classification with Distributional Signatures", "pdf": "/pdf/4fa31bfd21e94185de50157b87f30279d335676f.pdf", "abstract": "In this paper, we explore meta-learning for few-shot text classification. Meta-learning has shown strong performance in computer vision, where low-level patterns are transferable across learning tasks. However, directly applying this approach to text is challenging--lexical features highly informative for one task may be insignificant for another. Thus, rather than learning solely from words, our model also leverages their distributional signatures, which encode pertinent word occurrence patterns. Our model is trained within a meta-learning framework to map these signatures into attention scores, which are then used to weight the lexical representations of words. 
We demonstrate that our model consistently outperforms prototypical networks learned on lexical knowledge (Snell et al., 2017) in both few-shot text classification and relation classification by a significant margin across six benchmark datasets (20.0% on average in 1-shot classification).", "full_presentation_video": ""}, "forum": "H1emfT4twB", "id": "H1emfT4twB"}, "rJehNT4YPr": {"content": {"appendix": "", "TL;DR": "We present an efficient and adaptive framework for comparing image classifiers to maximize the discrepancies between the classifiers, in place of comparing on fixed test sets.", "keywords": ["distributed", "imagenet"], "paperhash": "wang|i_am_going_mad_maximum_discrepancy_competition_for_comparing_classifiers_adaptively", "code": "https://github.com/TAMU-VITA/MAD", "spotlight_video": "", "authorids": ["htwang@tamu.edu", "wiwjp619@tamu.edu", "atlaswang@tamu.edu", "kede.ma@cityu.edu.hk"], "poster": "", "slides": "", "authors": ["Haotao Wang", "Tianlong Chen", "Zhangyang Wang", "Kede Ma"], "_bibtex": "@inproceedings{\nWang2020I,\ntitle={I Am Going MAD: Maximum Discrepancy Competition for Comparing Classifiers Adaptively},\nauthor={Haotao Wang and Tianlong Chen and Zhangyang Wang and Kede Ma},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=rJehNT4YPr}\n}", "original_pdf": "/attachment/165a542ec11bae3c4467b696ebaa9a135fb5b8d7.pdf", "title": "I Am Going MAD: Maximum Discrepancy Competition for Comparing Classifiers Adaptively", "pdf": "/pdf/6829b219f87f9b9f7572e56e63ff8c28f4937e56.pdf", "abstract": "The learning of hierarchical representations for image classification has experienced an impressive series of successes due in part to the availability of large-scale labeled data for training. On the other hand, the trained classifiers have traditionally been evaluated on small and fixed sets of test images, which are deemed to be extremely sparsely distributed in the space of all natural images. It is thus questionable whether recent performance improvements on the excessively re-used test sets generalize to real-world natural images with much richer content variations. Inspired by efficient stimulus selection for testing perceptual models in psychophysical and physiological studies, we present an alternative framework for comparing image classifiers, which we name the MAximum Discrepancy (MAD) competition. Rather than comparing image classifiers using fixed test images, we adaptively sample a small test set from an arbitrarily large corpus of unlabeled images so as to maximize the discrepancies between the classifiers, measured by the distance over WordNet hierarchy. Human labeling on the resulting model-dependent image sets reveals the relative performance of the competing classifiers, and provides useful insights on potential ways to improve them. We report the MAD competition results of eleven ImageNet classifiers while noting that the framework is readily extensible and cost-effective to add future classifiers into the competition. 
Codes can be found at https://github.com/TAMU-VITA/MAD.", "full_presentation_video": ""}, "forum": "rJehNT4YPr", "id": "rJehNT4YPr"}, "SJexHkSFPS": {"content": {"appendix": "", "TL;DR": "Reinforcement learning formulation that allows agents to think and act at the same time, demonstrated on real-world robotic grasping.", "keywords": ["reinforcement learning", "robotics"], "paperhash": "xiao|thinking_while_moving_deep_reinforcement_learning_with_concurrent_control", "spotlight_video": "", "poster": "", "slides": "", "abstract": "We study reinforcement learning in settings where sampling an action from the policy must be done concurrently with the time evolution of the controlled system, such as when a robot must decide on the next action while still performing the previous action. Much like a person or an animal, the robot must think and move at the same time, deciding on its next action before the previous one has completed. In order to develop an algorithmic framework for such concurrent control problems, we start with a continuous-time formulation of the Bellman equations, and then discretize them in a way that is aware of system delays. We instantiate this new class of approximate dynamic programming methods via a simple architectural extension to existing value-based deep reinforcement learning algorithms. We evaluate our methods on simulated benchmark tasks and a large-scale robotic grasping task where the robot must \"think while moving.\"", "_bibtex": "@inproceedings{\nXiao2020Thinking,\ntitle={Thinking While Moving: Deep Reinforcement Learning with Concurrent Control},\nauthor={Ted Xiao and Eric Jang and Dmitry Kalashnikov and Sergey Levine and Julian Ibarz and Karol Hausman and Alexander Herzog},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=SJexHkSFPS}\n}", "authorids": ["tedxiao@google.com", "ejang@google.com", "dkalashnikov@google.com", "slevine@google.com", "julianibarz@google.com", "karolhausman@google.com", "alexherzog@google.com"], "title": "Thinking While Moving: Deep Reinforcement Learning with Concurrent Control", "authors": ["Ted Xiao", "Eric Jang", "Dmitry Kalashnikov", "Sergey Levine", "Julian Ibarz", "Karol Hausman", "Alexander Herzog"], "original_pdf": "/attachment/57af3c4d7e744b2ba8886acf9d10a089c96cba2c.pdf", "pdf": "/pdf/0b01c152c145c317d0f64e522bc8badcfd6d2f29.pdf", "full_presentation_video": ""}, "forum": "SJexHkSFPS", "id": "SJexHkSFPS"}, "rkenmREFDr": {"content": {"appendix": "", "TL;DR": "We use supervised learning (and in particular deep learning) to produce better space partitions for fast nearest neighbor search.", "keywords": ["locality sensitive hashing", "quantization"], "paperhash": "dong|learning_space_partitions_for_nearest_neighbor_search", "code": "https://anonymous.4open.science/r/cdd789a8-818c-4675-98fd-39f8da656129/", "spotlight_video": "", "authorids": ["yihedong@gmail.com", "indyk@mit.edu", "ilyaraz@microsoft.com", "tal.wagner@gmail.com"], "poster": "", "slides": "", "authors": ["Yihe Dong", "Piotr Indyk", "Ilya Razenshteyn", "Tal Wagner"], "_bibtex": "@inproceedings{\nDong2020Learning,\ntitle={Learning Space Partitions for Nearest Neighbor Search},\nauthor={Yihe Dong and Piotr Indyk and Ilya Razenshteyn and Tal Wagner},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=rkenmREFDr}\n}", "original_pdf": "/attachment/0f0beb2ed855c6760a66929f01898c14b4c29269.pdf", "title": "Learning Space Partitions for 
Nearest Neighbor Search", "pdf": "/pdf/f51865310de5e0e9859b34d6a9ce1151193cd247.pdf", "abstract": "Space partitions of $\\mathbb{R}^d$ underlie a vast and important\nclass of fast nearest neighbor search (NNS) algorithms. Inspired by recent theoretical work on NNS for general metric spaces (Andoni et al. 2018b,c), we develop a new framework for building space partitions reducing the problem to balanced graph partitioning followed by supervised classification.\nWe instantiate this general approach with the KaHIP graph partitioner (Sanders and Schulz 2013) and neural networks, respectively, to obtain a new partitioning procedure called Neural Locality-Sensitive Hashing (Neural LSH). On several standard benchmarks for NNS (Aumuller et al. 2017), our experiments show that the partitions obtained by Neural LSH consistently outperform partitions found by quantization-based and tree-based methods as well as classic, data-oblivious LSH.", "full_presentation_video": ""}, "forum": "rkenmREFDr", "id": "rkenmREFDr"}, "HJli2hNKDH": {"content": {"appendix": "", "TL;DR": "We isolate one factor of RL generalization by analyzing the case when the agent only overfits to the observations. We show that architectural implicit regularizations occur in this regime.", "keywords": ["generalization", "overfitting", "regularization", "reinforcement learning"], "paperhash": "song|observational_overfitting_in_reinforcement_learning", "spotlight_video": "", "poster": "", "slides": "", "abstract": "A major component of overfitting in model-free reinforcement learning (RL) involves the case where the agent may mistakenly correlate reward with certain spurious features from the observations generated by the Markov Decision Process (MDP). We provide a general framework for analyzing this scenario, which we use to design multiple synthetic benchmarks from only modifying the observation space of an MDP. When an agent overfits to different observation spaces even if the underlying MDP dynamics is fixed, we term this observational overfitting. Our experiments expose intriguing properties especially with regards to implicit regularization, and also corroborate results from previous works in RL generalization and supervised learning (SL). 
", "_bibtex": "@inproceedings{\nSong2020Observational,\ntitle={Observational Overfitting in Reinforcement Learning},\nauthor={Xingyou Song and Yiding Jiang and Stephen Tu and Yilun Du and Behnam Neyshabur},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=HJli2hNKDH}\n}", "authorids": ["xsong@berkeley.edu", "ydjiang@google.com", "stephentu@google.com", "yilundu@mit.edu", "neyshabur@google.com"], "title": "Observational Overfitting in Reinforcement Learning", "authors": ["Xingyou Song", "Yiding Jiang", "Stephen Tu", "Yilun Du", "Behnam Neyshabur"], "original_pdf": "/attachment/29e48ebeb1a4e3cc9f942522044de1eb3595d38c.pdf", "pdf": "/pdf/414967553544dd06517120ef72ad27b1bffaf61b.pdf", "full_presentation_video": ""}, "forum": "HJli2hNKDH", "id": "HJli2hNKDH"}, "SkeyppEFvS": {"content": {"appendix": "", "keywords": ["capacity", "reasoning", "video prediction"], "paperhash": "baradel|cophy_counterfactual_learning_of_physical_dynamics", "code": "https://github.com/fabienbaradel/cophy", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Fabien Baradel", "Natalia Neverova", "Julien Mille", "Greg Mori", "Christian Wolf"], "_bibtex": "@inproceedings{\nBaradel2020CoPhy:,\ntitle={CoPhy: Counterfactual Learning of Physical Dynamics},\nauthor={Fabien Baradel and Natalia Neverova and Julien Mille and Greg Mori and Christian Wolf},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=SkeyppEFvS}\n}", "authorids": ["fabien.baradel@insa-lyon.fr", "nneverova@fb.com", "julien.mille@insa-cvl.fr", "mori@cs.sfu.ca", "christian.wolf@insa-lyon.fr"], "title": "CoPhy: Counterfactual Learning of Physical Dynamics", "original_pdf": "/attachment/503b179d2367c446d221d0f628acaf9a4497bd85.pdf", "pdf": "/pdf/f00a0eeedb2a7f893adea8ba02e04d6836be26b0.pdf", "abstract": "Understanding causes and effects in mechanical systems is an essential component of reasoning in the physical world. This work poses a new problem of counterfactual learning of object mechanics from visual input. We develop the CoPhy benchmark to assess the capacity of the state-of-the-art models for causal physical reasoning in a synthetic 3D environment and propose a model for learning the physical dynamics in a counterfactual setting. Having observed a mechanical experiment that involves, for example, a falling tower of blocks, a set of bouncing balls or colliding objects, we learn to predict how its outcome is affected by an arbitrary intervention on its initial conditions, such as displacing one of the objects in the scene. The alternative future is predicted given the altered past and a latent representation of the confounders learned by the model in an end-to-end fashion with no supervision. 
We compare against feedforward video prediction baselines and show how observing alternative experiences allows the network to capture latent physical properties of the environment, which results in significantly more accurate predictions at the level of super human performance.", "full_presentation_video": ""}, "forum": "SkeyppEFvS", "id": "SkeyppEFvS"}, "rJg76kStwH": {"content": {"appendix": "", "TL;DR": "We employ graph neural networks in the variational EM framework for efficient inference and learning of Markov Logic Networks.", "keywords": ["graph networks", "reasoning", "variational inference"], "paperhash": "zhang|efficient_probabilistic_logic_reasoning_with_graph_neural_networks", "code": "https://github.com/expressGNN/ExpressGNN", "spotlight_video": "", "authorids": ["yuyu@gatech.edu", "xinshi.chen@gatech.edu", "yuanyang@gatech.edu", "arun.ramamurthy@siemens.com", "lbo@illinois.edu", "yuan.qi@antfin.com", "lsong@cc.gatech.edu"], "poster": "", "slides": "", "authors": ["Yuyu Zhang", "Xinshi Chen", "Yuan Yang", "Arun Ramamurthy", "Bo Li", "Yuan Qi", "Le Song"], "_bibtex": "@inproceedings{\nZhang2020Efficient,\ntitle={Efficient Probabilistic Logic Reasoning with Graph Neural Networks},\nauthor={Yuyu Zhang and Xinshi Chen and Yuan Yang and Arun Ramamurthy and Bo Li and Yuan Qi and Le Song},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=rJg76kStwH}\n}", "original_pdf": "/attachment/eb1cdf178a89bd0c9b9ddc89d283f6b3ced11760.pdf", "title": "Efficient Probabilistic Logic Reasoning with Graph Neural Networks", "pdf": "/pdf/0bca62b49b86b863a0581bd84402302041f4a94c.pdf", "abstract": "Markov Logic Networks (MLNs), which elegantly combine logic rules and probabilistic graphical models, can be used to address many knowledge graph problems. However, inference in MLN is computationally intensive, making the industrial-scale application of MLN very difficult. In recent years, graph neural networks (GNNs) have emerged as efficient and effective tools for large-scale graph problems. Nevertheless, GNNs do not explicitly incorporate prior logic rules into the models, and may require many labeled examples for a target task. In this paper, we explore the combination of MLNs and GNNs, and use graph neural networks for variational inference in MLN. We propose a GNN variant, named ExpressGNN, which strikes a nice balance between the representation power and the simplicity of the model. 
Our extensive experiments on several benchmark datasets demonstrate that ExpressGNN leads to effective and efficient probabilistic logic reasoning.", "full_presentation_video": ""}, "forum": "rJg76kStwH", "id": "rJg76kStwH"}, "S1erpeBFPB": {"content": {"appendix": "", "TL;DR": "We design an algorithm that reconstructs the key components of a novel deep learning system by exploiting a small amount of information leakage from a cache side-channel attack, Flush+Reload.", "keywords": ["imagenet", "neural architecture search", "security"], "paperhash": "hong|how_to_0wn_the_nas_in_your_spare_time", "code": "https://github.com/Sanghyun-Hong/How-to-0wn-NAS-in-Your-Spare-Time", "spotlight_video": "", "authorids": ["shhong@cs.umd.edu", "michael.davinroy@gmail.com", "cankaya@umiacs.umd.edu", "danadach@ece.umd.edu", "tdumitra@umiacs.umd.edu"], "poster": "", "slides": "", "authors": ["Sanghyun Hong", "Michael Davinroy", "Yi\u01e7itcan Kaya", "Dana Dachman-Soled", "Tudor Dumitra\u015f"], "_bibtex": "@inproceedings{\nHong2020How,\ntitle={How to 0wn the NAS in Your Spare Time},\nauthor={Sanghyun Hong and Michael Davinroy and Yi\u01e7itcan Kaya and Dana Dachman-Soled and Tudor Dumitra\u015f},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=S1erpeBFPB}\n}", "original_pdf": "/attachment/8ba37a02b709c86d3fde26c4a39f301a4f2c35a3.pdf", "title": "How to 0wn the NAS in Your Spare Time", "pdf": "/pdf/598041527370f8bd568a4493447d8e208055cea4.pdf", "abstract": "New data processing pipelines and novel network architectures increasingly drive the success of deep learning. In consequence, the industry considers top-performing architectures as intellectual property and devotes considerable computational resources to discovering such architectures through neural architecture search (NAS). This provides an incentive for adversaries to steal these novel architectures; when used in the cloud, to provide Machine Learning as a Service (MLaaS), the adversaries also have an opportunity to reconstruct the architectures by exploiting a range of hardware side-channels. However, it is challenging to reconstruct novel architectures and pipelines without knowing the computational graph (e.g., the layers, branches or skip connections), the architectural parameters (e.g., the number of filters in a convolutional layer) or the specific pre-processing steps (e.g. embeddings). In this paper, we design an algorithm that reconstructs the key components of a novel deep learning system by exploiting a small amount of information leakage from a cache side-channel attack, Flush+Reload. We use Flush+Reload to infer the trace of computations and the timing for each computation. Our algorithm then generates candidate computational graphs from the trace and eliminates incompatible candidates through a parameter estimation process. We implement our algorithm in PyTorch and Tensorflow. We demonstrate experimentally that we can reconstruct MalConv, a novel data pre-processing pipeline for malware detection, and ProxylessNAS-CPU, a novel network architecture for the ImageNet classification optimized to run on CPUs, without knowing the architecture family. In both cases, we achieve 0% error. 
These results suggest hardware side channels are a practical attack vector against MLaaS, and more efforts should be devoted to understanding their impact on the security of deep learning systems.", "full_presentation_video": ""}, "forum": "S1erpeBFPB", "id": "S1erpeBFPB"}, "SkgGCkrKvH": {"content": {"appendix": "", "TL;DR": "We propose Choco-SGD---decentralized SGD with compressed communication---for non-convex objectives and show its strong performance in various deep learning applications (on-device learning, datacenter case).", "keywords": ["compression", "privacy"], "paperhash": "koloskova|decentralized_deep_learning_with_arbitrary_communication_compression", "code": "https://github.com/epfml/ChocoSGD", "spotlight_video": "", "authorids": ["anastasia.koloskova@epfl.ch", "tao.lin@epfl.ch", "sebastian.stich@epfl.ch", "martin.jaggi@epfl.ch"], "poster": "", "slides": "", "authors": ["Anastasia Koloskova*", "Tao Lin*", "Sebastian U Stich", "Martin Jaggi"], "_bibtex": "@inproceedings{\nKoloskova*2020Decentralized,\ntitle={Decentralized Deep Learning with Arbitrary Communication Compression},\nauthor={Anastasia Koloskova* and Tao Lin* and Sebastian U Stich and Martin Jaggi},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=SkgGCkrKvH}\n}", "original_pdf": "/attachment/4aff8cfc96d7cc90e7d4d9aeb7a2f5216b68509c.pdf", "title": "Decentralized Deep Learning with Arbitrary Communication Compression", "pdf": "/pdf/3934e547bc85fc7fb307d0440ebfc85da4efec1d.pdf", "abstract": "Decentralized training of deep learning models is a key element for enabling data privacy and on-device learning over networks, as well as for efficient scaling to large compute clusters. As current approaches are limited by network bandwidth, we propose the use of communication compression in the decentralized training context. We show that Choco-SGD achieves linear speedup in the number of workers for arbitrary high compression ratios on general non-convex functions, and non-IID training data. We demonstrate the practical performance of the algorithm in two key scenarios: the training of deep learning models (i) over decentralized user devices, connected by a peer-to-peer network and (ii) in a datacenter. 
", "full_presentation_video": ""}, "forum": "SkgGCkrKvH", "id": "SkgGCkrKvH"}, "SkxJ8REYPH": {"content": {"appendix": "", "TL;DR": "SlowMo improves the optimization and generalization performance of communication-efficient decentralized algorithms without sacrificing speed.", "keywords": ["distributed", "distributed optimization", "generalization", "machine translation", "momentum", "optimization"], "paperhash": "wang|slowmo_improving_communicationefficient_distributed_sgd_with_slow_momentum", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Jianyu Wang", "Vinayak Tantia", "Nicolas Ballas", "Michael Rabbat"], "_bibtex": "@inproceedings{\nWang2020SlowMo:,\ntitle={SlowMo: Improving Communication-Efficient Distributed SGD with Slow Momentum},\nauthor={Jianyu Wang and Vinayak Tantia and Nicolas Ballas and Michael Rabbat},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=SkxJ8REYPH}\n}", "authorids": ["jianyuw1@andrew.cmu.edu", "tantia@fb.com", "ballasn@fb.com", "mikerabbat@fb.com"], "title": "SlowMo: Improving Communication-Efficient Distributed SGD with Slow Momentum", "original_pdf": "/attachment/5d0d0fb3364cb6e46d0e07221220adede3442eb0.pdf", "pdf": "/pdf/e8558035e2690cc168ce3d147df58cab1cb0d93e.pdf", "abstract": "Distributed optimization is essential for training large models on large datasets. Multiple approaches have been proposed to reduce the communication overhead in distributed training, such as synchronizing only after performing multiple local SGD steps, and decentralized methods (e.g., using gossip algorithms) to decouple communications among workers. Although these methods run faster than AllReduce-based methods, which use blocking communication before every update, the resulting models may be less accurate after the same number of updates. Inspired by the BMUF method of Chen & Huo (2016), we propose a slow momentum (SlowMo) framework, where workers periodically synchronize and perform a momentum update, after multiple iterations of a base optimization algorithm. Experiments on image classification and machine translation tasks demonstrate that SlowMo consistently yields improvements in optimization and generalization performance relative to the base optimizer, even when the additional overhead is amortized over many updates so that the SlowMo runtime is on par with that of the base optimizer. We provide theoretical convergence guarantees showing that SlowMo converges to a stationary point of smooth non-convex losses. Since BMUF can be expressed through the SlowMo framework, our results also correspond to the first theoretical convergence guarantees for BMUF.", "full_presentation_video": ""}, "forum": "SkxJ8REYPH", "id": "SkxJ8REYPH"}, "r1ecqn4YwB": {"content": {"appendix": "", "TL;DR": "A novel deep interpretable architecture that achieves state of the art on three large scale univariate time series forecasting datasets ", "keywords": [], "paperhash": "oreshkin|nbeats_neural_basis_expansion_analysis_for_interpretable_time_series_forecasting", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Boris N. Oreshkin", "Dmitri Carpov", "Nicolas Chapados", "Yoshua Bengio"], "_bibtex": "@inproceedings{\nOreshkin2020N-BEATS:,\ntitle={N-BEATS: Neural basis expansion analysis for interpretable time series forecasting},\nauthor={Boris N. 
Oreshkin and Dmitri Carpov and Nicolas Chapados and Yoshua Bengio},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=r1ecqn4YwB}\n}", "authorids": ["boris@elementai.com", "dmitri.carpov@elementai.com", "chapados@elementai.com", "yoshua.bengio@mila.quebec"], "title": "N-BEATS: Neural basis expansion analysis for interpretable time series forecasting", "original_pdf": "/attachment/a872edceb8a91ee81e87029e955b07a98d1bff82.pdf", "pdf": "/pdf/8e7978f4e9295d1f68d2bc32a2643507a0d9546a.pdf", "abstract": "We focus on solving the univariate times series point forecasting problem using deep learning. We propose a deep neural architecture based on backward and forward residual links and a very deep stack of fully-connected layers. The architecture has a number of desirable properties, being interpretable, applicable without modification to a wide array of target domains, and fast to train. We test the proposed architecture on several well-known datasets, including M3, M4 and TOURISM competition datasets containing time series from diverse domains. We demonstrate state-of-the-art performance for two configurations of N-BEATS for all the datasets, improving forecast accuracy by 11% over a statistical benchmark and by 3% over last year's winner of the M4 competition, a domain-adjusted hand-crafted hybrid between neural network and statistical time series models. The first configuration of our model does not employ any time-series-specific components and its performance on heterogeneous datasets strongly suggests that, contrarily to received wisdom, deep learning primitives such as residual blocks are by themselves sufficient to solve a wide range of forecasting problems. Finally, we demonstrate how the proposed architecture can be augmented to provide outputs that are interpretable without considerable loss in accuracy.", "full_presentation_video": ""}, "forum": "r1ecqn4YwB", "id": "r1ecqn4YwB"}, "rklOg6EFwS": {"content": {"appendix": "", "TL;DR": "By differentiating misclassified and correctly classified data, we propose a new misclassification aware defense that improves the state-of-the-art adversarial robustness.", "keywords": ["adversarial", "adversarial defense", "optimization", "perturbation", "robustness"], "paperhash": "wang|improving_adversarial_robustness_requires_revisiting_misclassified_examples", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Yisen Wang", "Difan Zou", "Jinfeng Yi", "James Bailey", "Xingjun Ma", "Quanquan Gu"], "_bibtex": "@inproceedings{\nWang2020Improving,\ntitle={Improving Adversarial Robustness Requires Revisiting Misclassified Examples},\nauthor={Yisen Wang and Difan Zou and Jinfeng Yi and James Bailey and Xingjun Ma and Quanquan Gu},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=rklOg6EFwS}\n}", "authorids": ["eewangyisen@gmail.com", "knowzou@ucla.edu", "jinfengyi.ustc@gmail.com", "baileyj@unimelb.edu.au", "xingjun.ma@unimelb.edu.au", "qgu@cs.ucla.edu"], "title": "Improving Adversarial Robustness Requires Revisiting Misclassified Examples", "original_pdf": "/attachment/65ea259447dcf990685ad388e977ec2598d98839.pdf", "pdf": "/pdf/08f03663964e4c8da165be864d32eed9723ef6fa.pdf", "abstract": "Deep neural networks (DNNs) are vulnerable to adversarial examples crafted by imperceptible perturbations. 
A range of defense techniques have been proposed to improve DNN robustness to adversarial examples, among which adversarial training has been demonstrated to be the most effective. Adversarial training is often formulated as a min-max optimization problem, with the inner maximization for generating adversarial examples. However, there exists a simple, yet easily overlooked fact that adversarial examples are only defined on correctly classified (natural) examples, but inevitably, some (natural) examples will be misclassified during training. In this paper, we investigate the distinctive influence of misclassified and correctly classified examples on the final robustness of adversarial training. Specifically, we find that misclassified examples indeed have a significant impact on the final robustness. More surprisingly, we find that different maximization techniques on misclassified examples may have a negligible influence on the final robustness, while different minimization techniques are crucial. Motivated by the above discovery, we propose a new defense algorithm called {\\em Misclassification Aware adveRsarial Training} (MART), which explicitly differentiates the misclassified and correctly classified examples during the training. We also propose a semi-supervised extension of MART, which can leverage the unlabeled data to further improve the robustness. Experimental results show that MART and its variant could significantly improve the state-of-the-art adversarial robustness.", "full_presentation_video": ""}, "forum": "rklOg6EFwS", "id": "rklOg6EFwS"}, "ryx6WgStPB": {"content": {"appendix": "", "TL;DR": "Hypermodels can encode posterior distributions similar to large ensembles at much smaller computational cost. This can facilitate significant improvements in exploration.", "keywords": ["ensembles", "hypernetworks", "reinforcement learning", "uncertainty"], "paperhash": "dwaracherla|hypermodels_for_exploration", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Vikranth Dwaracherla", "Xiuyuan Lu", "Morteza Ibrahimi", "Ian Osband", "Zheng Wen", "Benjamin Van Roy"], "_bibtex": "@inproceedings{\nDwaracherla2020Hypermodels,\ntitle={Hypermodels for Exploration},\nauthor={Vikranth Dwaracherla and Xiuyuan Lu and Morteza Ibrahimi and Ian Osband and Zheng Wen and Benjamin Van Roy},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=ryx6WgStPB}\n}", "authorids": ["vikranthd@google.com", "lxlu@google.com", "mibrahimi@google.com", "iosband@google.com", "zhengwen@google.com", "benvanroy@google.com"], "title": "Hypermodels for Exploration", "original_pdf": "/attachment/8ed6e8f16570848a6485d4cdee64ccc03c4e9786.pdf", "pdf": "/pdf/9e6d021fdb54902bcc899e0974dfa15f3b18a1a2.pdf", "abstract": "We study the use of hypermodels to represent epistemic uncertainty and guide exploration.\nThis generalizes and extends the use of ensembles to approximate Thompson sampling. The computational cost of training an ensemble grows with its size, and as such, prior work has typically been limited to ensembles with tens of elements. We show that alternative hypermodels can enjoy dramatic efficiency gains, enabling behavior that would otherwise require hundreds or thousands of elements, and even succeed in situations where ensemble methods fail to learn regardless of size.\nThis allows more accurate approximation of Thompson sampling as well as use of more sophisticated exploration schemes. 
In particular, we consider an approximate form of information-directed sampling and demonstrate performance gains relative to Thompson sampling. As alternatives to ensembles, we consider linear and neural network hypermodels, also known as hypernetworks.\nWe prove that, with neural network base models, a linear hypermodel can represent essentially any distribution over functions, and as such, hypernetworks do not extend what can be represented.", "full_presentation_video": ""}, "forum": "ryx6WgStPB", "id": "ryx6WgStPB"}, "rklTmyBKPH": {"content": {"appendix": "", "keywords": ["computer vision", "imagenet", "neural architecture search", "semantic segmentation"], "paperhash": "fang|fast_neural_network_adaptation_via_parameter_remapping_and_architecture_search", "code": "https://github.com/JaminFong/FNA", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Jiemin Fang*", "Yuzhu Sun*", "Kangjian Peng*", "Qian Zhang", "Yuan Li", "Wenyu Liu", "Xinggang Wang"], "_bibtex": "@inproceedings{\nFang*2020Fast,\ntitle={Fast Neural Network Adaptation via Parameter Remapping and Architecture Search},\nauthor={Jiemin Fang* and Yuzhu Sun* and Kangjian Peng* and Qian Zhang and Yuan Li and Wenyu Liu and Xinggang Wang},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=rklTmyBKPH}\n}", "authorids": ["jaminfong@hust.edu.cn", "yzsun@hust.edu.cn", "kangjian.peng@horizon.ai", "qian01.zhang@horizon.ai", "yuan.li@horizon.ai", "liuwy@hust.edu.cn", "xgwang@hust.edu.cn"], "title": "Fast Neural Network Adaptation via Parameter Remapping and Architecture Search", "original_pdf": "/attachment/61f43016613ce63040e2c710889a878197816e40.pdf", "pdf": "/pdf/138c331613acec38935934038d3d61b23d844d91.pdf", "abstract": "Deep neural networks achieve remarkable performance in many computer vision tasks. Most state-of-the-art~(SOTA) semantic segmentation and object detection approaches reuse neural network architectures designed for image classification as the backbone, commonly pre-trained on ImageNet. However, performance gains can be achieved by designing network architectures specifically for detection and segmentation, as shown by recent neural architecture search (NAS) research for detection and segmentation. One major challenge though, is that ImageNet pre-training of the search space representation (a.k.a. super network) or the searched networks incurs huge computational cost. In this paper, we propose a Fast Neural Network Adaptation (FNA) method, which can adapt both the architecture and parameters of a seed network (e.g. a high performing manually designed backbone) to become a network with different depth, width, or kernels via a Parameter Remapping technique, making it possible to utilize NAS for detection/segmentation tasks a lot more efficiently. In our experiments, we conduct FNA on MobileNetV2 to obtain new networks for both segmentation and detection that clearly out-perform existing networks designed both manually and by NAS. The total computation cost of FNA is significantly less than SOTA segmentation/detection NAS approaches: 1737$\\times$ less than DPC, 6.8$\\times$ less than Auto-DeepLab and 7.4$\\times$ less than DetNAS. The code is available at https://github.com/JaminFong/FNA .", "full_presentation_video": ""}, "forum": "rklTmyBKPH", "id": "rklTmyBKPH"}, "rJgzzJHtDB": {"content": {"appendix": "", "TL;DR": "Is it possible to co-design model accuracy, robustness and efficiency to achieve their triple wins? 
Yes!", "keywords": ["adversarial", "adversarial attacks", "capacity", "efficient inference", "robustness"], "paperhash": "hu|triple_wins_boosting_accuracy_robustness_and_efficiency_together_by_enabling_inputadaptive_inference", "code": "https://github.com/TAMU-VITA/triple-wins", "spotlight_video": "", "authorids": ["tkhu@tamu.edu", "wiwjp619@tamu.edu", "htwang@tamu.edu", "atlaswang@tamu.edu"], "poster": "", "slides": "", "authors": ["Ting-Kuei Hu", "Tianlong Chen", "Haotao Wang", "Zhangyang Wang"], "_bibtex": "@inproceedings{\nHu2020Triple,\ntitle={Triple Wins: Boosting Accuracy, Robustness and Efficiency Together by Enabling Input-Adaptive Inference},\nauthor={Ting-Kuei Hu and Tianlong Chen and Haotao Wang and Zhangyang Wang},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=rJgzzJHtDB}\n}", "original_pdf": "/attachment/9176e996154d129d884e4ea4d7d57a2a9fa54803.pdf", "title": "Triple Wins: Boosting Accuracy, Robustness and Efficiency Together by Enabling Input-Adaptive Inference", "pdf": "/pdf/56607f5a22da68d17a77097c6f3a5785777e7fd4.pdf", "abstract": "Deep networks were recently suggested to face the odds between accuracy (on clean natural images) and robustness (on adversarially perturbed images) (Tsipras et al., 2019). Such a dilemma is shown to be rooted in the inherently higher sample complexity (Schmidt et al., 2018) and/or model capacity (Nakkiran, 2019), for learning a high-accuracy and robust classifier. In view of that, give a classification task, growing the model capacity appears to help draw a win-win between accuracy and robustness, yet at the expense of model size and latency, therefore posing challenges for resource-constrained applications. Is it possible to co-design model accuracy, robustness and efficiency to achieve their triple wins? This paper studies multi-exit networks associated with input-adaptive efficient inference, showing their strong promise in achieving a \u201csweet point\" in co-optimizing model accuracy, robustness, and efficiency. Our proposed solution, dubbed Robust Dynamic Inference Networks (RDI-Nets), allows for each input (either clean or adversarial) to adaptively choose one of the multiple output layers (early branches or the final one) to output its prediction. That multi-loss adaptivity adds new variations and flexibility to adversarial attacks and defenses, on which we present a systematical investigation. 
We show experimentally that by equipping existing backbones with such robust adaptive inference, the resulting RDI-Nets can achieve better accuracy and robustness, yet with over 30% computational savings, compared to the defended original models.\n", "full_presentation_video": ""}, "forum": "rJgzzJHtDB", "id": "rJgzzJHtDB"}, "Hyl7ygStwB": {"content": {"appendix": "", "keywords": ["attention", "fine tuning", "machine translation", "nlp", "reading comprehension", "text classification", "transformer", "unsupervised"], "paperhash": "zhu|incorporating_bert_into_neural_machine_translation", "code": "https://github.com/bert-nmt/bert-nmt", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Jinhua Zhu", "Yingce Xia", "Lijun Wu", "Di He", "Tao Qin", "Wengang Zhou", "Houqiang Li", "Tieyan Liu"], "_bibtex": "@inproceedings{\nZhu2020Incorporating,\ntitle={Incorporating BERT into Neural Machine Translation},\nauthor={Jinhua Zhu and Yingce Xia and Lijun Wu and Di He and Tao Qin and Wengang Zhou and Houqiang Li and Tieyan Liu},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=Hyl7ygStwB}\n}", "authorids": ["teslazhu@mail.ustc.edu.cn", "yingce.xia@gmail.com", "wulijun3@mail2.sysu.edu.cn", "di_he@pku.edu.cn", "taoqin@microsoft.com", "zhwg@ustc.edu.cn", "lihq@ustc.edu.cn", "tyliu@microsoft.com"], "title": "Incorporating BERT into Neural Machine Translation", "original_pdf": "/attachment/3bb215521f2fe3f44cecff721c1bf5ae57655021.pdf", "pdf": "/pdf/d131542710841297d9e981e433c86120b31486bd.pdf", "abstract": "The recently proposed BERT (Devlin et al., 2019) has shown great power on a variety of natural language understanding tasks, such as text classification, reading comprehension, etc. However, how to effectively apply BERT to neural machine translation (NMT) lacks enough exploration. While BERT is more commonly used as fine-tuning instead of contextual embedding for downstream language understanding tasks, in NMT, our preliminary exploration of using BERT as contextual embedding is better than using for fine-tuning. This motivates us to think how to better leverage BERT for NMT along this direction. We propose a new algorithm named BERT-fused model, in which we first use BERT to extract representations for an input sequence, and then the representations are fused with each layer of the encoder and decoder of the NMT model through attention mechanisms. We conduct experiments on supervised (including sentence-level and document-level translations), semi-supervised and unsupervised machine translation, and achieve state-of-the-art results on seven benchmark datasets. Our code is available at https://github.com/bert-nmt/bert-nmt", "full_presentation_video": ""}, "forum": "Hyl7ygStwB", "id": "Hyl7ygStwB"}, "BylVcTNtDS": {"content": {"appendix": "", "keywords": ["security", "transfer learning"], "paperhash": "rezaei|a_targetagnostic_attack_on_deep_models_exploiting_security_vulnerabilities_of_transfer_learning", "code": "https://github.com/shrezaei/Target-Agnostic-Attack", "spotlight_video": "", "poster": "", "slides": "", "abstract": "Due to insufficient training data and the high computational cost to train a deep neural network from scratch, transfer learning has been extensively used in many deep-neural-network-based applications. A commonly used transfer learning approach involves taking a part of a pre-trained model, adding a few layers at the end, and re-training the new layers with a small dataset. 
This approach, while efficient and widely used, imposes a security vulnerability because the pre-trained model used in transfer learning is usually publicly available, including to potential attackers. In this paper, we show that without any additional knowledge other than the pre-trained model, an attacker can launch an effective and efficient brute force attack that can craft instances of input to trigger each target class with high confidence. We assume that the attacker has no access to any target-specific information, including samples from target classes, the re-trained model, and the probabilities assigned by Softmax to each class, thus making the attack target-agnostic. These assumptions render all previous attack models inapplicable, to the best of our knowledge. To evaluate the proposed attack, we perform a set of experiments on face recognition and speech recognition tasks and show the effectiveness of the attack. Our work reveals a fundamental security weakness of the Softmax layer when used in transfer learning settings.", "_bibtex": "@inproceedings{\nRezaei2020A,\ntitle={A Target-Agnostic Attack on Deep Models: Exploiting Security Vulnerabilities of Transfer Learning},\nauthor={Shahbaz Rezaei and Xin Liu},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=BylVcTNtDS}\n}", "authorids": ["srezaei@ucdavis.edu", "xinliu@ucdavis.edu"], "title": "A Target-Agnostic Attack on Deep Models: Exploiting Security Vulnerabilities of Transfer Learning", "authors": ["Shahbaz Rezaei", "Xin Liu"], "original_pdf": "/attachment/a19f4100610f907d689c339a5bb84d8312864f56.pdf", "pdf": "/pdf/114cb3e7e93573002951610e4809837571a3939f.pdf", "full_presentation_video": ""}, "forum": "BylVcTNtDS", "id": "BylVcTNtDS"}, "HJgfDREKDB": {"content": {"appendix": "", "TL;DR": "Neural nets can encode complex 3D objects into the parameters of other (surprisingly small) neural nets", "keywords": ["3d reconstruction", "computer vision", "representation learning"], "paperhash": "mitchell|higherorder_function_networks_for_learning_composable_3d_object_representations", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Eric Mitchell", "Selim Engin", "Volkan Isler", "Daniel D Lee"], "_bibtex": "@inproceedings{\nMitchell2020Higher-Order,\ntitle={Higher-Order Function Networks for Learning Composable 3D Object Representations},\nauthor={Eric Mitchell and Selim Engin and Volkan Isler and Daniel D Lee},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=HJgfDREKDB}\n}", "authorids": ["eric.anthony.mitchell95@gmail.com", "engin003@umn.edu", "isler@umn.edu", "ddlee@seas.upenn.edu"], "title": "Higher-Order Function Networks for Learning Composable 3D Object Representations", "original_pdf": "/attachment/e4c108994d2a7681b043de4b1132e51bf8524d40.pdf", "pdf": "/pdf/aa1d65118055944102c9de56cbada2966f55de6f.pdf", "abstract": "We present a new approach to 3D object representation where a neural network encodes the geometry of an object directly into the weights and biases of a second 'mapping' network. This mapping network can be used to reconstruct an object by applying its encoded transformation to points randomly sampled from a simple geometric space, such as the unit sphere. We study the effectiveness of our method through various experiments on subsets of the ShapeNet dataset. 
We find that the proposed approach can reconstruct encoded objects with accuracy equal to or exceeding state-of-the-art methods with orders of magnitude fewer parameters. Our smallest mapping network has only about 7000 parameters and shows reconstruction quality on par with state-of-the-art object decoder architectures with millions of parameters. Further experiments on feature mixing through the composition of learned functions show that the encoding captures a meaningful subspace of objects.", "full_presentation_video": ""}, "forum": "HJgfDREKDB", "id": "HJgfDREKDB"}, "B1lnbRNtwr": {"content": {"appendix": "", "TL;DR": "Models of source code that combine global and structural features learn more powerful representations of programs.", "keywords": ["attention", "distributed", "graph networks", "inductive bias", "transformer"], "paperhash": "hellendoorn|global_relational_models_of_source_code", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Vincent J. Hellendoorn", "Charles Sutton", "Rishabh Singh", "Petros Maniatis", "David Bieber"], "_bibtex": "@inproceedings{\nHellendoorn2020Global,\ntitle={Global Relational Models of Source Code},\nauthor={Vincent J. Hellendoorn and Charles Sutton and Rishabh Singh and Petros Maniatis and David Bieber},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=B1lnbRNtwr}\n}", "authorids": ["vjhellendoorn@gmail.com", "charlessutton@google.com", "rising@google.com", "maniatis@google.com", "dbieber@google.com"], "title": "Global Relational Models of Source Code", "original_pdf": "/attachment/eeffeebe48ecb72497dbbe5e2d5b5f373fe10737.pdf", "pdf": "/pdf/1a70bffc358e61847e6c4b29f824a9204f8ce4c3.pdf", "abstract": "Models of code can learn distributed representations of a program's syntax and semantics to predict many non-trivial properties of a program. Recent state-of-the-art models leverage highly structured representations of programs, such as trees, graphs and paths therein (e.g. data-flow relations), which are precise and abundantly available for code. This provides a strong inductive bias towards semantically meaningful relations, yielding more generalizable representations than classical sequence-based models. Unfortunately, these models primarily rely on graph-based message passing to represent relations in code, which makes them de facto local due to the high cost of message-passing steps, quite in contrast to modern, global sequence-based models, such as the Transformer. In this work, we bridge this divide between global and structured models by introducing two new hybrid model families that are both global and incorporate structural bias: Graph Sandwiches, which wrap traditional (gated) graph message-passing layers in sequential message-passing layers; and Graph Relational Embedding Attention Transformers (GREAT for short), which bias traditional Transformers with relational information from graph edge types. By studying a popular, non-trivial program repair task, variable-misuse identification, we explore the relative merits of traditional and hybrid model families for code representation. 
Starting with a graph-based model that already improves upon the prior state-of-the-art for this task by 20%, we show that our proposed hybrid models improve an additional 10-15%, while training both faster and using fewer parameters.", "full_presentation_video": ""}, "forum": "B1lnbRNtwr", "id": "B1lnbRNtwr"}, "rkeS1RVtPS": {"content": {"appendix": "", "keywords": ["bayesian inference", "imagenet"], "paperhash": "zhang|cyclical_stochastic_gradient_mcmc_for_bayesian_deep_learning", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Ruqi Zhang", "Chunyuan Li", "Jianyi Zhang", "Changyou Chen", "Andrew Gordon Wilson"], "_bibtex": "@inproceedings{\nZhang2020Cyclical,\ntitle={Cyclical Stochastic Gradient MCMC for Bayesian Deep Learning},\nauthor={Ruqi Zhang and Chunyuan Li and Jianyi Zhang and Changyou Chen and Andrew Gordon Wilson},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=rkeS1RVtPS}\n}", "authorids": ["rz297@cornell.edu", "chunyuan.li@duke.edu", "jz318@duke.edu", "cchangyou@gmail.com", "andrewgw@cims.nyu.edu"], "title": "Cyclical Stochastic Gradient MCMC for Bayesian Deep Learning", "original_pdf": "/attachment/8f643a3875c689194374550ce0b86e6db26462e5.pdf", "pdf": "/pdf/64c55a88ec29d1a60c09e7c647867c3056e1cd83.pdf", "abstract": "The posteriors over neural network weights are high dimensional and multimodal. Each mode typically characterizes a meaningfully different representation of the data. We develop Cyclical Stochastic Gradient MCMC (SG-MCMC) to automatically explore such distributions. In particular, we propose a cyclical stepsize schedule, where larger steps discover new modes, and smaller steps characterize each mode. We prove non-asymptotic convergence theory of our proposed algorithm. Moreover, we provide extensive experimental results, including ImageNet, to demonstrate the effectiveness of cyclical SG-MCMC in learning complex multimodal distributions, especially for fully Bayesian inference with modern deep neural networks.", "full_presentation_video": ""}, "forum": "rkeS1RVtPS", "id": "rkeS1RVtPS"}, "rkg1ngrFPr": {"content": {"appendix": "", "TL;DR": "nearly isometric DNN initializations imply low parameter space curvature, and a lower condition number, but that's not always great", "keywords": ["gradient descent", "information geometry", "optimization"], "paperhash": "sok\u00f3|information_geometry_of_orthogonal_initializations_and_training", "code": "https://github.com/PiotrSokol/info-geom", "spotlight_video": "", "authorids": ["piotr.sokol@stonybrook.edu", "memming.park@stonybrook.edu"], "poster": "", "slides": "", "authors": ["Piotr Aleksander Sok\u00f3\u0142", "Il Memming Park"], "_bibtex": "@inproceedings{\nSok\u00f3\u01422020Information,\ntitle={Information Geometry of Orthogonal Initializations and Training},\nauthor={Piotr Aleksander Sok\u00f3\u0142 and Il Memming Park},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=rkg1ngrFPr}\n}", "original_pdf": "/attachment/7d9aea264b15581eb028196cf425aadd5040e7d7.pdf", "title": "Information Geometry of Orthogonal Initializations and Training", "pdf": "/pdf/518fe69dd850c85be75561849db48ae9cbadf710.pdf", "abstract": " Recently mean field theory has been successfully used to analyze properties\n of wide, random neural networks. 
It gave rise to a prescriptive theory for\n initializing feed-forward neural networks with orthogonal weights, which\n ensures that both the forward propagated activations and the backpropagated\n gradients are near \\(\\ell_2\\) isometries and as a consequence training is\n orders of magnitude faster. Despite strong empirical performance, the\n mechanisms by which critical initializations confer an advantage in the\n optimization of deep neural networks are poorly understood. Here we show a\n novel connection between the maximum curvature of the optimization landscape\n (gradient smoothness) as measured by the Fisher information matrix (FIM) and\n the spectral radius of the input-output Jacobian, which partially explains\n why more isometric networks can train much faster. Furthermore, given that\n orthogonal weights are necessary to ensure that gradient norms are\n approximately preserved at initialization, we experimentally investigate the\n benefits of maintaining orthogonality throughout training, and we conclude\n that manifold optimization of weights performs well regardless of the\n smoothness of the gradients. Moreover, we observe a surprising yet robust\n behavior of highly isometric initializations --- even though such networks\n have a lower FIM condition number \\emph{at initialization}, and therefore by\n analogy to convex functions should be easier to optimize, experimentally\n they prove to be much harder to train with stochastic gradient descent. We\n conjecture the FIM condition number plays a non-trivial role in the optimization.", "full_presentation_video": ""}, "forum": "rkg1ngrFPr", "id": "rkg1ngrFPr"}, "rke2P1BFwS": {"content": {"appendix": "", "TL;DR": "We propose new tensor decompositions and associated regularizers to obtain state of the art performances on temporal knowledge base completion.", "keywords": ["knowledge base completion", "regularization", "representation learning"], "paperhash": "lacroix|tensor_decompositions_for_temporal_knowledge_base_completion", "code": "http://s000.tinyupload.com/?file_id=37064871945432677939", "spotlight_video": "", "authorids": ["timothee.lax@gmail.com", "guillaume.obozinski@epfl.ch", "usunier@fb.com"], "poster": "", "slides": "", "authors": ["Timoth\u00e9e Lacroix", "Guillaume Obozinski", "Nicolas Usunier"], "_bibtex": "@inproceedings{\nLacroix2020Tensor,\ntitle={Tensor Decompositions for Temporal Knowledge Base Completion},\nauthor={Timoth\u00e9e Lacroix and Guillaume Obozinski and Nicolas Usunier},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=rke2P1BFwS}\n}", "original_pdf": "/attachment/1e7451b67261edb78fe1567e5e61bd167ea487c8.pdf", "title": "Tensor Decompositions for Temporal Knowledge Base Completion", "pdf": "/pdf/2db709046c244a5c17887a7423aead1ba56005e3.pdf", "abstract": "Most algorithms for representation learning and link prediction in relational data have been designed for static data. However, the data they are applied to usually evolves with time, such as friend graphs in social networks or user interactions with items in recommender systems. This is also the case for knowledge bases, which contain facts such as (US, has president, B. Obama, [2009-2017]) that are valid only at certain points in time. 
For the problem of link prediction under temporal constraints, i.e., answering queries of the form (US, has president, ?, 2012), we propose a solution inspired by the canonical decomposition of tensors of order 4.\nWe introduce new regularization schemes and present an extension of ComplEx that achieves state-of-the-art performance. Additionally, we propose a new dataset for knowledge base completion constructed from Wikidata, larger than previous benchmarks by an order of magnitude, as a new reference for evaluating temporal and non-temporal link prediction methods. ", "full_presentation_video": ""}, "forum": "rke2P1BFwS", "id": "rke2P1BFwS"}, "SyxL2TNtvr": {"content": {"appendix": "", "TL;DR": "We introduce a method for unsupervised disentangled model selection for VAE-based disentangled representation learning approaches.", "keywords": ["disentanglement", "fairness", "reinforcement learning", "representation learning", "unsupervised"], "paperhash": "duan|unsupervised_model_selection_for_variational_disentangled_representation_learning", "spotlight_video": "", "poster": "", "slides": "", "abstract": "Disentangled representations have recently been shown to improve fairness, data efficiency and generalisation in simple supervised and reinforcement learning tasks. To extend the benefits of disentangled representations to more complex domains and practical applications, it is important to enable hyperparameter tuning and model selection of existing unsupervised approaches without requiring access to ground truth attribute labels, which are not available for most datasets. This paper addresses this problem by introducing a simple yet robust and reliable method for unsupervised disentangled model selection. We show that our approach performs comparably to the existing supervised alternatives across 5400 models from six state of the art unsupervised disentangled representation learning model classes. 
Furthermore, we show that the ranking produced by our approach correlates well with the final task performance on two different domains.", "_bibtex": "@inproceedings{\nDuan2020Unsupervised,\ntitle={Unsupervised Model Selection for Variational Disentangled Representation Learning},\nauthor={Sunny Duan and Loic Matthey and Andre Saraiva and Nick Watters and Chris Burgess and Alexander Lerchner and Irina Higgins},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=SyxL2TNtvr}\n}", "authorids": ["sunnyd@google.com", "lmatthey@google.com", "andresnds@google.com", "nwatters@google.com", "cpburgess@google.com", "lerchner@google.com", "irinah@google.com"], "title": "Unsupervised Model Selection for Variational Disentangled Representation Learning", "authors": ["Sunny Duan", "Loic Matthey", "Andre Saraiva", "Nick Watters", "Chris Burgess", "Alexander Lerchner", "Irina Higgins"], "original_pdf": "/attachment/e47c3a7f17a8bcdefb5fcfcf0ed148c4bba80cbf.pdf", "pdf": "/pdf/e7d741db6e2baa1fb3b937144540ebd6f0e70681.pdf", "full_presentation_video": ""}, "forum": "SyxL2TNtvr", "id": "SyxL2TNtvr"}, "ryxmb1rKDS": {"content": {"appendix": "", "TL;DR": "This work enforces Hamiltonian dynamics with control to learn system models from embedded position and velocity data, and exploits this physically-consistent dynamics to synthesize model-based control via energy shaping.", "keywords": ["generalization", "inductive bias", "momentum"], "paperhash": "zhong|symplectic_odenet_learning_hamiltonian_dynamics_with_control", "code": "https://github.com/d-biswa/Symplectic-ODENet", "spotlight_video": "", "authorids": ["y.zhong@princeton.edu", "biswadip.dey@siemens.com", "amit.chakraborty@siemens.com"], "poster": "", "slides": "", "authors": ["Yaofeng Desmond Zhong", "Biswadip Dey", "Amit Chakraborty"], "_bibtex": "@inproceedings{\nZhong2020Symplectic,\ntitle={Symplectic ODE-Net: Learning Hamiltonian Dynamics with Control},\nauthor={Yaofeng Desmond Zhong and Biswadip Dey and Amit Chakraborty},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=ryxmb1rKDS}\n}", "original_pdf": "/attachment/26b8ca152e7652572fc5c8dfd9d630cca29ea2d1.pdf", "title": "Symplectic ODE-Net: Learning Hamiltonian Dynamics with Control", "pdf": "/pdf/2b517e0ae387e89e94d818b31518e9e61e0b532d.pdf", "abstract": "In this paper, we introduce Symplectic ODE-Net (SymODEN), a deep learning framework which can infer the dynamics of a physical system, given by an ordinary differential equation (ODE), from observed state trajectories. To achieve better generalization with fewer training samples, SymODEN incorporates appropriate inductive bias by designing the associated computation graph in a physics-informed manner. In particular, we enforce Hamiltonian dynamics with control to learn the underlying dynamics in a transparent way, which can then be leveraged to draw insight about relevant physical aspects of the system, such as mass and potential energy. In addition, we propose a parametrization which can enforce this Hamiltonian formalism even when the generalized coordinate data is embedded in a high-dimensional space or we can only access velocity data instead of generalized momentum. 
This framework, by offering interpretable, physically-consistent models for physical systems, opens up new possibilities for synthesizing model-based control strategies.", "full_presentation_video": ""}, "forum": "ryxmb1rKDS", "id": "ryxmb1rKDS"}, "rklHqRVKvH": {"content": {"appendix": "", "TL;DR": "We propose a generic framework that allows for exploiting the low-rank structure in both planning and deep reinforcement learning.", "keywords": ["planning", "reinforcement learning"], "paperhash": "yang|harnessing_structures_for_valuebased_planning_and_reinforcement_learning", "code": "https://github.com/YyzHarry/SV-RL", "spotlight_video": "", "authorids": ["yuzhe@mit.edu", "guozhang@mit.edu", "zhixu@mit.edu", "dina@csail.mit.edu"], "poster": "", "slides": "", "authors": ["Yuzhe Yang", "Guo Zhang", "Zhi Xu", "Dina Katabi"], "_bibtex": "@inproceedings{\nYang2020Harnessing,\ntitle={Harnessing Structures for Value-Based Planning and Reinforcement Learning},\nauthor={Yuzhe Yang and Guo Zhang and Zhi Xu and Dina Katabi},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=rklHqRVKvH}\n}", "original_pdf": "/attachment/116a89c0e44aaabad099e651eb83258ea774d5fe.pdf", "title": "Harnessing Structures for Value-Based Planning and Reinforcement Learning", "pdf": "/pdf/dd681213028d05d82ebd6e2b76a0a33733cf4209.pdf", "abstract": "Value-based methods constitute a fundamental methodology in planning and deep reinforcement learning (RL). In this paper, we propose to exploit the underlying structures of the state-action value function, i.e., Q function, for both planning and deep RL. In particular, if the underlying system dynamics lead to some global structures of the Q function, one should be capable of inferring the function better by leveraging such structures. Specifically, we investigate the low-rank structure, which widely exists for big data matrices. We verify empirically the existence of low-rank Q functions in the context of control and deep RL tasks. As our key contribution, by leveraging Matrix Estimation (ME) techniques, we propose a general framework to exploit the underlying low-rank structure in Q functions. This leads to a more efficient planning procedure for classical control, and additionally, a simple scheme that can be applied to value-based RL techniques to consistently achieve better performance on \"low-rank\" tasks. 
Extensive experiments on control tasks and Atari games confirm the efficacy of our approach.", "full_presentation_video": ""}, "forum": "rklHqRVKvH", "id": "rklHqRVKvH"}, "H1gBhkBFDH": {"content": {"appendix": "", "TL;DR": "The paper describes a flexible framework for building CNNs that are equivariant to a large class of transformations groups.", "keywords": ["cnn", "computer vision", "equivariance"], "paperhash": "bekkers|bspline_cnns_on_lie_groups", "code": "https://github.com/ebekkers/gsplinets", "spotlight_video": "", "authorids": ["e.j.bekkers@tue.nl"], "poster": "", "slides": "", "authors": ["Erik J Bekkers"], "_bibtex": "@inproceedings{\nBekkers2020B-Spline,\ntitle={B-Spline CNNs on Lie groups},\nauthor={Erik J Bekkers},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=H1gBhkBFDH}\n}", "original_pdf": "/attachment/9b80bf115e277ac2a7f609e0ffe00dbadafc1ff8.pdf", "title": "B-Spline CNNs on Lie groups", "pdf": "/pdf/3cc76c692f77db46f58ca54e21ab5ce74859942b.pdf", "abstract": "Group convolutional neural networks (G-CNNs) can be used to improve classical CNNs by equipping them with the geometric structure of groups. Central in the success of G-CNNs is the lifting of feature maps to higher dimensional disentangled representations, in which data characteristics are effectively learned, geometric data-augmentations are made obsolete, and predictable behavior under geometric transformations (equivariance) is guaranteed via group theory. Currently, however, the practical implementations of G-CNNs are limited to either discrete groups (that leave the grid intact) or continuous compact groups such as rotations (that enable the use of Fourier theory). In this paper we lift these limitations and propose a modular framework for the design and implementation of G-CNNs for arbitrary Lie groups. In our approach the differential structure of Lie groups is used to expand convolution kernels in a generic basis of B-splines that is defined on the Lie algebra. This leads to a flexible framework that enables localized, atrous, and deformable convolutions in G-CNNs by means of respectively localized, sparse and non-uniform B-spline expansions. The impact and potential of our approach is studied on two benchmark datasets: cancer detection in histopathology slides (PCam dataset) in which rotation equivariance plays a key role and facial landmark localization (CelebA dataset) in which scale equivariance is important. In both cases, G-CNN architectures outperform their classical 2D counterparts and the added value of atrous and localized group convolutions is studied in detail.", "full_presentation_video": ""}, "forum": "H1gBhkBFDH", "id": "H1gBhkBFDH"}, "B1lGU64tDr": {"content": {"appendix": "", "TL;DR": "A deep hierarchical state-space model in which the state transitions of correlated objects are coordinated by graph neural networks.", "keywords": ["graph networks", "time series"], "paperhash": "yang|relational_statespace_model_for_stochastic_multiobject_systems", "spotlight_video": "", "poster": "", "slides": "", "abstract": "Real-world dynamical systems often consist of multiple stochastic subsystems that interact with each other. Modeling and forecasting the behavior of such dynamics are generally not easy, due to the inherent hardness in understanding the complicated interactions and evolutions of their constituents. 
This paper introduces the relational state-space model (R-SSM), a sequential hierarchical latent variable model that makes use of graph neural networks (GNNs) to simulate the joint state transitions of multiple correlated objects. By letting GNNs cooperate with SSM, R-SSM provides a flexible way to incorporate relational information into the modeling of multi-object dynamics. We further suggest augmenting the model with normalizing flows instantiated for vertex-indexed random variables and propose two auxiliary contrastive objectives to facilitate the learning. The utility of R-SSM is empirically evaluated on synthetic and real time series datasets.", "_bibtex": "@inproceedings{\nYang2020Relational,\ntitle={Relational State-Space Model for Stochastic Multi-Object Systems},\nauthor={Fan Yang and Ling Chen and Fan Zhou and Yusong Gao and Wei Cao},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=B1lGU64tDr}\n}", "authorids": ["fanyang01@zju.edu.cn", "lingchen@cs.zju.edu.cn", "fanzhou@zju.edu.cn", "jianchuan.gys@alibaba-inc.com", "mingsong.cw@alibaba-inc.com"], "title": "Relational State-Space Model for Stochastic Multi-Object Systems", "authors": ["Fan Yang", "Ling Chen", "Fan Zhou", "Yusong Gao", "Wei Cao"], "original_pdf": "/attachment/1a5c6d25d9e38523ae97069275579568f629f549.pdf", "pdf": "/pdf/de07d28ba461bc0a4cc08286cfc79cb112ffee03.pdf", "full_presentation_video": ""}, "forum": "B1lGU64tDr", "id": "B1lGU64tDr"}, "BJge3TNKwH": {"content": {"appendix": "", "TL;DR": "\"A novel framework for overcoming catastrophic forgetting by preserving the distribution of the network's output at an arbitrary layer.\"", "keywords": ["capacity", "catastrophic forgetting", "incremental learning", "memory", "unsupervised"], "paperhash": "kolouri|sliced_cramer_synaptic_consolidation_for_preserving_deeply_learned_representations", "spotlight_video": "", "poster": "", "slides": "", "abstract": "Deep neural networks suffer from the inability to preserve the learned data representation (i.e., catastrophic forgetting) in domains where the input data distribution is non-stationary, and it changes during training. Various selective synaptic plasticity approaches have been recently proposed to preserve network parameters, which are crucial for previously learned tasks while learning new tasks. We explore such selective synaptic plasticity approaches through a unifying lens of memory replay and show the close relationship between methods like Elastic Weight Consolidation (EWC) and Memory-Aware-Synapses (MAS). We then propose a fundamentally different class of preservation methods that aim at preserving the distribution of internal neural representations for previous tasks while learning a new one. We propose the sliced Cram\\'{e}r distance as a suitable choice for such preservation and evaluate our Sliced Cramer Preservation (SCP) algorithm through extensive empirical investigations on various network architectures in both supervised and unsupervised learning settings. We show that SCP consistently utilizes the learning capacity of the network better than online-EWC and MAS methods on various incremental learning tasks.", "_bibtex": "@inproceedings{\nKolouri2020Sliced,\ntitle={Sliced Cramer Synaptic Consolidation for Preserving Deeply Learned Representations},\nauthor={Soheil Kolouri and Nicholas A. Ketz and Andrea Soltoggio and Praveen K. 
Pilly},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=BJge3TNKwH}\n}", "authorids": ["skolouri@hrl.com", "naketz@hrl.com", "a.soltoggio@lboro.ac.uk", "pkpilly@hrl.com"], "title": "Sliced Cramer Synaptic Consolidation for Preserving Deeply Learned Representations", "authors": ["Soheil Kolouri", "Nicholas A. Ketz", "Andrea Soltoggio", "Praveen K. Pilly"], "original_pdf": "/attachment/b8575645ec0eb906c5bf32145d733f11bdc7cb52.pdf", "pdf": "/pdf/94dc267f1eb2462a0e0706040c156c440b3a27ae.pdf", "full_presentation_video": ""}, "forum": "BJge3TNKwH", "id": "BJge3TNKwH"}, "HkxYzANYDB": {"content": {"appendix": "", "TL;DR": "We present a diagnostic dataset for systematic study of temporal and causal reasoning in videos. ", "keywords": ["nlp", "reasoning"], "paperhash": "yi|clevrer_collision_events_for_video_representation_and_reasoning", "code": "http://clevrer.csail.mit.edu/", "spotlight_video": "", "authorids": ["kyi@g.harvard.edu", "ganchuang1990@gmail.com", "liyunzhu@mit.edu", "pushmeet@google.com", "jiajunwu@mit.edu", "torralba@mit.edu", "jbt@mit.edu"], "poster": "", "slides": "", "authors": ["Kexin Yi*", "Chuang Gan*", "Yunzhu Li", "Pushmeet Kohli", "Jiajun Wu", "Antonio Torralba", "Joshua B. Tenenbaum"], "_bibtex": "@inproceedings{\nYi*2020CLEVRER:,\ntitle={CLEVRER: Collision Events for Video Representation and Reasoning},\nauthor={Kexin Yi* and Chuang Gan* and Yunzhu Li and Pushmeet Kohli and Jiajun Wu and Antonio Torralba and Joshua B. Tenenbaum},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=HkxYzANYDB}\n}", "original_pdf": "/attachment/1a880a42da32cdc43cfc8f4d5fdb95c1c1236450.pdf", "title": "CLEVRER: Collision Events for Video Representation and Reasoning", "pdf": "/pdf/5c16bdbb2bb062be7c809cb259b10585a45dec5b.pdf", "abstract": "The ability to reason about temporal and causal events from videos lies at the core of human intelligence. Most video reasoning benchmarks, however, focus on pattern recognition from complex visual and language input, instead of on causal structure. We study the complementary problem, exploring the temporal and causal structures behind videos of objects with simple visual appearance. To this end, we introduce the CoLlision Events for Video REpresentation and Reasoning (CLEVRER) dataset, a diagnostic video dataset for systematic evaluation of computational models on a wide range of reasoning tasks. Motivated by the theory of human causal judgment, CLEVRER includes four types of question: descriptive (e.g., \u2018what color\u2019), explanatory (\u2018what\u2019s responsible for\u2019), predictive (\u2018what will happen next\u2019), and counterfactual (\u2018what if\u2019). We evaluate various state-of-the-art models for visual reasoning on our benchmark. While these models thrive on the perception-based task (descriptive), they perform poorly on the causal tasks (explanatory, predictive and counterfactual), suggesting that a principled approach for causal reasoning should incorporate the capability of both perceiving complex visual and language inputs, and understanding the underlying dynamics and causal relations. We also study an oracle model that explicitly combines these components via symbolic representations. 
", "full_presentation_video": ""}, "forum": "HkxYzANYDB", "id": "HkxYzANYDB"}, "rJehVyrKwH": {"content": {"appendix": "", "TL;DR": "Using a structured quantization technique aiming at better in-domain reconstruction to compress convolutional neural networks", "keywords": ["compression", "imagenet", "memory", "quantization"], "paperhash": "stock|and_the_bit_goes_down_revisiting_the_quantization_of_neural_networks", "code": "https://drive.google.com/file/d/12QK7onizf2ArpEBK706ly8bNfiM9cPzp/view?usp=sharing", "spotlight_video": "", "authorids": ["pstock@fb.com", "ajoulin@fb.com", "remi.gribonval@inria.fr", "benjamingraham@fb.com", "rvj@fb.com"], "poster": "", "slides": "", "authors": ["Pierre Stock", "Armand Joulin", "R\u00e9mi Gribonval", "Benjamin Graham", "Herv\u00e9 J\u00e9gou"], "_bibtex": "@inproceedings{\nStock2020And,\ntitle={And the Bit Goes Down: Revisiting the Quantization of Neural Networks},\nauthor={Pierre Stock and Armand Joulin and R\u00e9mi Gribonval and Benjamin Graham and Herv\u00e9 J\u00e9gou},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=rJehVyrKwH}\n}", "original_pdf": "/attachment/217e5e8b7ca40637ee4bd88af0b918acbad24305.pdf", "title": "And the Bit Goes Down: Revisiting the Quantization of Neural Networks", "pdf": "/pdf/b2bdf3ea140d4beb3be2a63f813a0e04da7088d0.pdf", "abstract": "In this paper, we address the problem of reducing the memory footprint of convolutional network architectures. We introduce a vector quantization method that aims at preserving the quality of the reconstruction of the network outputs rather than its weights. The principle of our approach is that it minimizes the loss reconstruction error for in-domain inputs. Our method only requires a set of unlabelled data at quantization time and allows for efficient inference on CPU by using byte-aligned codebooks to store the compressed weights. We validate our approach by quantizing a high performing ResNet-50 model to a memory size of 5MB (20x compression factor) while preserving a top-1 accuracy of 76.1% on ImageNet object classification and by compressing a Mask R-CNN with a 26x factor.", "full_presentation_video": ""}, "forum": "rJehVyrKwH", "id": "rJehVyrKwH"}, "rJxe3xSYDS": {"content": {"appendix": "", "TL;DR": "An efficient, unbiased approximation of the softmax loss function for extreme classification", "keywords": ["adversarial", "regression"], "paperhash": "bamler|extreme_classification_via_adversarial_softmax_approximation", "code": "https://github.com/mandt-lab/adversarial-negative-sampling", "spotlight_video": "", "authorids": ["rbamler@uci.edu", "stephan.mandt@gmail.com"], "poster": "", "slides": "", "authors": ["Robert Bamler", "Stephan Mandt"], "_bibtex": "@inproceedings{\nBamler2020Extreme,\ntitle={Extreme Classification via Adversarial Softmax Approximation},\nauthor={Robert Bamler and Stephan Mandt},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=rJxe3xSYDS}\n}", "original_pdf": "/attachment/086ab53658adf7398a9d3465357d40a5d917f048.pdf", "title": "Extreme Classification via Adversarial Softmax Approximation", "pdf": "/pdf/a27d71bd050d73f92c63ca09118c8d644e3b68e6.pdf", "abstract": "Training a classifier over a large number of classes, known as 'extreme classification', has become a topic of major interest with applications in technology, science, and e-commerce. 
Traditional softmax regression induces a gradient cost proportional to the number of classes C, which often is prohibitively expensive. A popular scalable softmax approximation relies on uniform negative sampling, which suffers from slow convergence due to a poor signal-to-noise ratio. In this paper, we propose a simple training method for drastically enhancing the gradient signal by drawing negative samples from an adversarial model that mimics the data distribution. Our contributions are three-fold: (i) an adversarial sampling mechanism that produces negative samples at a cost only logarithmic in C, thus still resulting in cheap gradient updates; (ii) a mathematical proof that this adversarial sampling minimizes the gradient variance while any bias due to non-uniform sampling can be removed; (iii) experimental results on large scale data sets that show a reduction of the training time by an order of magnitude relative to several competitive baselines.\n", "full_presentation_video": ""}, "forum": "rJxe3xSYDS", "id": "rJxe3xSYDS"}, "BkepbpNFwr": {"content": {"appendix": "", "TL;DR": "We present a neural memory-based architecture for incremental domain adaptation, and provide theoretical and empirical results.", "keywords": ["attention", "capacity", "domain adaptation", "fine tuning", "memory", "nlp"], "paperhash": "asghar|progressive_memory_banks_for_incremental_domain_adaptation", "code": "https://github.com/nabihach/IDA", "spotlight_video": "", "authorids": ["nasghar@uwaterloo.ca", "doublepower.mou@gmail.com", "kaselby@uwaterloo.ca", "kevin.pantasdo@uwaterloo.ca", "ppoupart@uwaterloo.ca", "jiang.xin@huawei.com"], "poster": "", "slides": "", "authors": ["Nabiha Asghar", "Lili Mou", "Kira A. Selby", "Kevin D. Pantasdo", "Pascal Poupart", "Xin Jiang"], "_bibtex": "@inproceedings{\nAsghar2020Progressive,\ntitle={Progressive Memory Banks for Incremental Domain Adaptation},\nauthor={Nabiha Asghar and Lili Mou and Kira A. Selby and Kevin D. Pantasdo and Pascal Poupart and Xin Jiang},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=BkepbpNFwr}\n}", "original_pdf": "/attachment/7e3fb050f0162f9dd7abb3a28728cd8b05276c25.pdf", "title": "Progressive Memory Banks for Incremental Domain Adaptation", "pdf": "/pdf/09cab4009ea99ae5ec0339eca94cdcb853cfba52.pdf", "abstract": "This paper addresses the problem of incremental domain adaptation (IDA) in natural language processing (NLP). We assume each domain comes one after another, and that we could only access data in the current domain. The goal of IDA is to build a unified model performing well on all the domains that we have encountered. We adopt the recurrent neural network (RNN) widely used in NLP, but augment it with a directly parameterized memory bank, which is retrieved by an attention mechanism at each step of RNN transition. The memory bank provides a natural way of IDA: when adapting our model to a new domain, we progressively add new slots to the memory bank, which increases the number of parameters, and thus the model capacity. We learn the new memory slots and fine-tune existing parameters by back-propagation. Experimental results show that our approach achieves significantly better performance than fine-tuning alone. Compared with expanding hidden states, our approach is more robust for old domains, shown by both empirical and theoretical results. 
Our model also outperforms previous work of IDA including elastic weight consolidation and progressive neural networks in the experiments.", "full_presentation_video": ""}, "forum": "BkepbpNFwr", "id": "BkepbpNFwr"}, "HJedXaEtvS": {"content": {"appendix": "", "TL;DR": "Training neural networks so you can efficiently patch them later.", "keywords": ["machine translation", "maml", "meta learning"], "paperhash": "sinitsin|editable_neural_networks", "code": "https://github.com/editable-ICLR2020/editable", "spotlight_video": "", "authorids": ["ant.sinitsin@gmail.com", "vsevolod-pl@yandex.ru", "alagaster@yandex.ru", "sapopov@yandex-team.ru", "artem.babenko@phystech.edu"], "poster": "", "slides": "", "authors": ["Anton Sinitsin", "Vsevolod Plokhotnyuk", "Dmitry Pyrkin", "Sergei Popov", "Artem Babenko"], "_bibtex": "@inproceedings{\nSinitsin2020Editable,\ntitle={Editable Neural Networks},\nauthor={Anton Sinitsin and Vsevolod Plokhotnyuk and Dmitry Pyrkin and Sergei Popov and Artem Babenko},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=HJedXaEtvS}\n}", "original_pdf": "/attachment/00f3ba7c452311995e3e88637184366541494c8a.pdf", "title": "Editable Neural Networks", "pdf": "/pdf/99693462793c1e5fb613755206e3113d362988a8.pdf", "abstract": "These days deep neural networks are ubiquitously used in a wide range of tasks, from image classification and machine translation to face identification and self-driving cars. In many applications, a single model error can lead to devastating financial, reputational and even life-threatening consequences. Therefore, it is crucially important to correct model mistakes quickly as they appear. In this work, we investigate the problem of neural network editing - how one can efficiently patch a mistake of the model on a particular sample, without influencing the model behavior on other samples. Namely, we propose Editable Training, a model-agnostic training technique that encourages fast editing of the trained model. We empirically demonstrate the effectiveness of this method on large-scale image classification and machine translation tasks.", "full_presentation_video": ""}, "forum": "HJedXaEtvS", "id": "HJedXaEtvS"}, "rkllGyBFPH": {"content": {"appendix": "", "TL;DR": "Wide neural networks can escape the NTK regime and couple with quadratic models, with provably nice optimization landscape and better generalization.", "keywords": ["deep learning theory", "generalization", "neural tangent kernel", "optimization"], "paperhash": "bai|beyond_linearization_on_quadratic_and_higherorder_approximation_of_wide_neural_networks", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Yu Bai", "Jason D. Lee"], "_bibtex": "@inproceedings{\nBai2020Beyond,\ntitle={Beyond Linearization: On Quadratic and Higher-Order Approximation of Wide Neural Networks},\nauthor={Yu Bai and Jason D. Lee},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=rkllGyBFPH}\n}", "authorids": ["yubai.pku@gmail.com", "jasondlee88@gmail.com"], "title": "Beyond Linearization: On Quadratic and Higher-Order Approximation of Wide Neural Networks", "original_pdf": "/attachment/6fea625dc7c0856f6deb265b093c40c33b9b60f9.pdf", "pdf": "/pdf/cf05a398eaa2e3ce8f03535b88e6582fb8c4b167.pdf", "abstract": "Recent theoretical work has established connections between over-parametrized neural networks and linearized models governed by the Neural Tangent Kernels (NTKs). 
NTK theory leads to concrete convergence and generalization results, yet the empirical performance of neural networks is observed to exceed their linearized models, suggesting insufficiency of this theory.\nTowards closing this gap, we investigate the training of over-parametrized neural networks that are beyond the NTK regime yet still governed by the Taylor expansion of the network. We bring forward the idea of randomizing the neural networks, which allows them to escape their NTK and couple with quadratic models. We show that the optimization landscape of randomized two-layer networks is nice and amenable to escaping-saddle algorithms. We prove concrete generalization and expressivity results on these randomized networks, which lead to sample complexity bounds (of learning certain simple functions) that match the NTK and can in addition be better by a dimension factor when mild distributional assumptions are present. We demonstrate that our randomization technique can be generalized systematically beyond the quadratic case, by using it to find networks that are coupled with higher-order terms in their Taylor series.\n", "full_presentation_video": ""}, "forum": "rkllGyBFPH", "id": "rkllGyBFPH"}, "HJgpugrKPS": {"content": {"appendix": "", "keywords": ["attention", "cnn", "equivariance", "stability"], "paperhash": "sosnovik|scaleequivariant_steerable_networks", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Ivan Sosnovik", "Micha\u0142 Szmaja", "Arnold Smeulders"], "_bibtex": "@inproceedings{\nSosnovik2020Scale-Equivariant,\ntitle={Scale-Equivariant Steerable Networks},\nauthor={Ivan Sosnovik and Micha\u0142 Szmaja and Arnold Smeulders},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=HJgpugrKPS}\n}", "authorids": ["sosnovikivan@gmail.com", "szmajamichal@gmail.com", "a.w.m.smeulders@uva.nl"], "title": "Scale-Equivariant Steerable Networks", "original_pdf": "/attachment/063c866136f90482a841be7fcb7a723d5de4164b.pdf", "pdf": "/pdf/ce6c00156479706a1ff9e63741b0551427f47911.pdf", "abstract": "The effectiveness of Convolutional Neural Networks (CNNs) has been substantially attributed to their built-in property of translation equivariance. However, CNNs do not have embedded mechanisms to handle other types of transformations. In this work, we pay attention to scale changes, which regularly appear in various tasks due to the changing distances between the objects and the camera. First, we introduce the general theory for building scale-equivariant convolutional networks with steerable filters. We develop scale-convolution and generalize other common blocks to be scale-equivariant. We demonstrate the computational efficiency and numerical stability of the proposed method. We compare the proposed models to the previously developed methods for scale equivariance and local scale invariance. 
We demonstrate state-of-the-art results on the MNIST-scale dataset and on the STL-10 dataset in the supervised learning setting.", "full_presentation_video": ""}, "forum": "HJgpugrKPS", "id": "HJgpugrKPS"}, "r1lOgyrKDS": {"content": {"appendix": "", "keywords": ["generation", "policy gradient", "program synthesis", "reinforcement learning", "uncertainty", "variance reduction"], "paperhash": "fan|adaptive_correlated_monte_carlo_for_contextual_categorical_sequence_generation", "code": "https://github.com/xinjiefan/ACMC_ICLR", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Xinjie Fan", "Yizhe Zhang", "Zhendong Wang", "Mingyuan Zhou"], "_bibtex": "@inproceedings{\nFan2020Adaptive,\ntitle={Adaptive Correlated Monte Carlo for Contextual Categorical Sequence Generation},\nauthor={Xinjie Fan and Yizhe Zhang and Zhendong Wang and Mingyuan Zhou},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=r1lOgyrKDS}\n}", "authorids": ["xfan@utexas.edu", "yizhe.zhang@microsoft.com", "zw2533@columbia.edu", "mingyuan.zhou@mccombs.utexas.edu"], "title": "Adaptive Correlated Monte Carlo for Contextual Categorical Sequence Generation", "original_pdf": "/attachment/0f5b0d11ea39b008736d97e93ca3a0a24cf2e495.pdf", "pdf": "/pdf/bce84f5d9b005b6f1a001d55324bced7650439be.pdf", "abstract": "Sequence generation models are commonly refined with reinforcement learning over user-defined metrics. However, high gradient variance hinders the practical use of this method. To stabilize this method, we adapt to contextual generation of categorical sequences a policy gradient estimator, which evaluates a set of correlated Monte Carlo (MC) rollouts for variance control. Due to the correlation, the number of unique rollouts is random and adaptive to model uncertainty; those rollouts naturally become baselines for each other, and hence are combined to effectively reduce gradient variance. We also demonstrate the use of correlated MC rollouts for binary-tree softmax models, which reduce the high generation cost in large vocabulary scenarios by decomposing each categorical action into a sequence of binary actions. We evaluate our methods on both neural program synthesis and image captioning. The proposed methods yield lower gradient variance and consistent improvement over related baselines. 
", "full_presentation_video": ""}, "forum": "r1lOgyrKDS", "id": "r1lOgyrKDS"}, "ByeWogStDS": {"content": {"appendix": "", "TL;DR": "We propose HiPPO, a stable Hierarchical Reinforcement Learning algorithm that can train several levels of the hierarchy simultaneously, giving good performance both in skill discovery and adaptation.", "keywords": ["hierarchical reinforcement learning", "optimization", "policy gradient", "reinforcement learning", "robustness", "skill discovery", "transfer learning"], "paperhash": "li|subpolicy_adaptation_for_hierarchical_reinforcement_learning", "code": "https://anonymous.4open.science/r/de105a6d-8f8b-405e-b90a-54ab74adcb17/", "spotlight_video": "", "authorids": ["alexli1@berkeley.edu", "florensa@berkeley.edu", "iclavera@berkeley.edu", "pabbeel@berkeley.edu"], "poster": "", "slides": "", "authors": ["Alexander Li", "Carlos Florensa", "Ignasi Clavera", "Pieter Abbeel"], "_bibtex": "@inproceedings{\nLi2020Sub-policy,\ntitle={Sub-policy Adaptation for Hierarchical Reinforcement Learning},\nauthor={Alexander Li and Carlos Florensa and Ignasi Clavera and Pieter Abbeel},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=ByeWogStDS}\n}", "original_pdf": "/attachment/93c3f94143e2f8aab3e80380d4490fe0dfddc7da.pdf", "title": "Sub-policy Adaptation for Hierarchical Reinforcement Learning", "pdf": "/pdf/3182ba9f57df7bad5db6033a3217209b9606dd6e.pdf", "abstract": "Hierarchical reinforcement learning is a promising approach to tackle long-horizon decision-making problems with sparse rewards. Unfortunately, most methods still decouple the lower-level skill acquisition process and the training of a higher level that controls the skills in a new task. Leaving the skills fixed can lead to significant sub-optimality in the transfer setting. In this work, we propose a novel algorithm to discover a set of skills, and continuously adapt them along with the higher level even when training on a new task. Our main contributions are two-fold. First, we derive a new hierarchical policy gradient with an unbiased latent-dependent baseline, and we introduce Hierarchical Proximal Policy Optimization (HiPPO), an on-policy method to efficiently train all levels of the hierarchy jointly. Second, we propose a method of training time-abstractions that improves the robustness of the obtained skills to environment changes. Code and videos are available at sites.google.com/view/hippo-rl.", "full_presentation_video": ""}, "forum": "ByeWogStDS", "id": "ByeWogStDS"}, "BJlzm64tDH": {"content": {"appendix": "", "keywords": ["language modeling", "nlp", "question answering", "self supervised learning"], "paperhash": "xiong|pretrained_encyclopedia_weakly_supervised_knowledgepretrained_language_model", "spotlight_video": "", "poster": "", "slides": "", "abstract": " Recent breakthroughs of pretrained language models have shown the effectiveness of self-supervised learning for a wide range of natural language processing (NLP) tasks. In addition to standard syntactic and semantic NLP tasks, pretrained models achieve strong improvements on tasks that involve real-world knowledge, suggesting that large-scale language modeling could be an implicit method to capture knowledge. In this work, we further investigate the extent to which pretrained models such as BERT capture knowledge using a zero-shot fact completion task. 
Moreover, we propose a simple yet effective weakly supervised pretraining objective, which explicitly forces the model to incorporate knowledge about real-world entities. Models trained with our new objective yield significant improvements on the fact completion task. When applied to downstream tasks, our model consistently outperforms BERT on four entity-related question answering datasets (i.e., WebQuestions, TriviaQA, SearchQA and Quasar-T) with an average 2.7 F1 improvements and a standard fine-grained entity typing dataset (i.e., FIGER) with 5.7 accuracy gains.", "_bibtex": "@inproceedings{\nXiong2020Pretrained,\ntitle={Pretrained Encyclopedia: Weakly Supervised Knowledge-Pretrained Language Model},\nauthor={Wenhan Xiong and Jingfei Du and William Yang Wang and Veselin Stoyanov},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=BJlzm64tDH}\n}", "authorids": ["xwhan@cs.ucsb.edu", "jingfeidu@fb.com", "william@cs.ucsb.edu", "ves@fb.com"], "title": "Pretrained Encyclopedia: Weakly Supervised Knowledge-Pretrained Language Model", "authors": ["Wenhan Xiong", "Jingfei Du", "William Yang Wang", "Veselin Stoyanov"], "original_pdf": "/attachment/1e75e8838c2c2e0e80ab04ad9923219214a06c94.pdf", "pdf": "/pdf/0ffdcc09ff094f333d60c31cf0f41479ee47ecc9.pdf", "full_presentation_video": ""}, "forum": "BJlzm64tDH", "id": "BJlzm64tDH"}, "S1g7tpEYDS": {"content": {"appendix": "", "TL;DR": "Deterministic regularized autoencoders can learn a smooth, meaningful latent space as VAEs without having to force some arbitrarily chosen prior (i.e., Gaussian).", "keywords": ["autoencoder", "generative models", "regularization", "unsupervised", "variational autoencoders"], "paperhash": "ghosh|from_variational_to_deterministic_autoencoders", "code": "https://github.com/ParthaEth/Regularized_autoencoders-RAE-", "spotlight_video": "", "authorids": ["partha.ghosh@tuebingen.mpg.de", "msajjadi@tue.mpg.de", "antonio.vergari@tuebingen.mpg.de", "black@tue.mpg.de", "bs@tue.mpg.de"], "poster": "", "slides": "", "authors": ["Partha Ghosh", "Mehdi S. M. Sajjadi", "Antonio Vergari", "Michael Black", "Bernhard Scholkopf"], "_bibtex": "@inproceedings{\nGhosh2020From,\ntitle={From Variational to Deterministic Autoencoders},\nauthor={Partha Ghosh and Mehdi S. M. Sajjadi and Antonio Vergari and Michael Black and Bernhard Scholkopf},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=S1g7tpEYDS}\n}", "original_pdf": "/attachment/e347fb88b9b7a25ba052d329ecd0971365223b05.pdf", "title": "From Variational to Deterministic Autoencoders", "pdf": "/pdf/4855d3d40c16853fcf3844059592f0c5783a60fe.pdf", "abstract": " Variational Autoencoders (VAEs) provide a theoretically-backed and popular framework for deep generative models. However, learning a VAE from data poses still unanswered theoretical questions and considerable practical challenges. In this work, we propose an alternative framework for generative modeling that is simpler, easier to train, and deterministic, yet has many of the advantages of the VAE. We observe that sampling a stochastic encoder in a Gaussian VAE can be interpreted as simply injecting noise into the input of a deterministic decoder. We investigate how substituting this kind of stochasticity, with other explicit and implicit regularization schemes, can lead to an equally smooth and meaningful latent space without having to force it to conform to an arbitrarily chosen prior. 
To retrieve a generative mechanism to sample new data points, we introduce an ex-post density estimation step that can be readily applied to the proposed framework as well as existing VAEs, improving their sample quality. We show, in a rigorous empirical study, that the proposed regularized deterministic autoencoders are able to generate samples that are comparable to, or better than, those of VAEs and more powerful alternatives when applied to images as well as to structured data such as molecules. ", "full_presentation_video": ""}, "forum": "S1g7tpEYDS", "id": "S1g7tpEYDS"}, "rJlUt0EYwS": {"content": {"appendix": "", "keywords": ["generalization", "nlp", "question answering", "text classification"], "paperhash": "wang|learning_from_explanations_with_neural_execution_tree", "code": "https://www.dropbox.com/sh/zkp19yr44yr8idt/AABpjFN3r2COIOub33L7DtfLa?dl=0", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Ziqi Wang*", "Yujia Qin*", "Wenxuan Zhou", "Jun Yan", "Qinyuan Ye", "Leonardo Neves", "Zhiyuan Liu", "Xiang Ren"], "_bibtex": "@inproceedings{\nWang*2020Learning,\ntitle={Learning from Explanations with Neural Execution Tree},\nauthor={Ziqi Wang* and Yujia Qin* and Wenxuan Zhou and Jun Yan and Qinyuan Ye and Leonardo Neves and Zhiyuan Liu and Xiang Ren},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=rJlUt0EYwS}\n}", "authorids": ["ziqi-wan16@mails.tsinghua.edu.cn", "qinyj16@mails.tsinghua.edu.cn", "zhouwenx@usc.edu", "yanjun@usc.edu", "qinyuany@usc.edu", "lneves@snap.com", "liuzy@tsinghua.edu.cn", "xiangren@usc.edu"], "title": "Learning from Explanations with Neural Execution Tree", "original_pdf": "/attachment/ecb996e12720b878c35679607709088e29d1933f.pdf", "pdf": "/pdf/384b941e7fb13502fee54b50ee3a59950f4beae9.pdf", "abstract": "While deep neural networks have achieved impressive performance on a range of NLP tasks, these data-hungry models heavily rely on labeled data, which restricts their applications in scenarios where data annotation is expensive. Natural language (NL) explanations have been demonstrated to be very useful additional supervision, which can provide sufficient domain knowledge for generating more labeled data over new instances, while the annotation time only doubles. However, directly applying them for augmenting model learning encounters two challenges: (1) NL explanations are unstructured and inherently compositional, which asks for a modularized model to represent their semantics, (2) NL explanations often have large numbers of linguistic variants, resulting in low recall and limited generalization ability. In this paper, we propose a novel Neural Execution Tree (NExT) framework to augment training data for text classification using NL explanations. After transforming NL explanations into executable logical forms by semantic parsing, NExT generalizes different types of actions specified by the logical forms for labeling data instances, which substantially increases the coverage of each NL explanation. Experiments on two NLP tasks (relation extraction and sentiment analysis) demonstrate its superiority over baseline methods. 
Its extension to multi-hop question answering achieves performance gain with light annotation effort.", "full_presentation_video": ""}, "forum": "rJlUt0EYwS", "id": "rJlUt0EYwS"}, "HJgK0h4Ywr": {"content": {"appendix": "", "keywords": ["disentanglement", "interpretability", "unsupervised"], "paperhash": "do|theory_and_evaluation_metrics_for_learning_disentangled_representations", "spotlight_video": "", "poster": "", "slides": "", "abstract": "We make two theoretical contributions to disentanglement learning by (a) defining precise semantics of disentangled representations, and (b) establishing robust metrics for evaluation. First, we characterize the concept \u201cdisentangled representations\u201d used in supervised and unsupervised methods along three dimensions\u2013informativeness, separability and interpretability\u2013which can be expressed and quantified explicitly using information-theoretic constructs. This helps explain the behaviors of several well-known disentanglement learning models. We then propose robust metrics for measuring informativeness, separability and interpretability. Through a comprehensive suite of experiments, we show that our metrics correctly characterize the representations learned by different methods and are consistent with qualitative (visual) results. Thus, the metrics allow disentanglement learning methods to be compared on a fair ground. We also empirically uncovered new interesting properties of VAE-based methods and interpreted them with our formulation. These findings are promising and hopefully will encourage the design of more theoretically driven models for learning disentangled representations. ", "_bibtex": "@inproceedings{\nDo2020Theory,\ntitle={Theory and Evaluation Metrics for Learning Disentangled Representations},\nauthor={Kien Do and Truyen Tran},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=HJgK0h4Ywr}\n}", "authorids": ["dkdo@deakin.edu.au", "truyen.tran@deakin.edu.au"], "title": "Theory and Evaluation Metrics for Learning Disentangled Representations", "authors": ["Kien Do", "Truyen Tran"], "original_pdf": "/attachment/b1acd533cf9dffd0cfa586201ac67c8561f9ba1b.pdf", "pdf": "/pdf/2e192160d2c979003aa25256ddab01689a94c2b7.pdf", "full_presentation_video": ""}, "forum": "HJgK0h4Ywr", "id": "HJgK0h4Ywr"}, "SkxxtgHKPS": {"content": {"appendix": "", "TL;DR": "We give some generalization error bounds of noisy gradient methods such as SGLD, Langevin dynamics, noisy momentum and so forth.", "keywords": ["acceleration", "attention", "deep learning theory", "generalization", "gradient descent", "momentum", "regularization", "stability"], "paperhash": "li|on_generalization_error_bounds_of_noisy_gradient_methods_for_nonconvex_learning", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Jian Li", "Xuanyuan Luo", "Mingda Qiao"], "_bibtex": "@inproceedings{\nLi2020On,\ntitle={On Generalization Error Bounds of Noisy Gradient Methods for Non-Convex Learning},\nauthor={Jian Li and Xuanyuan Luo and Mingda Qiao},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=SkxxtgHKPS}\n}", "authorids": ["ljiian83@mail.tsinghua.edu.cn", "luo-xy19@mails.tsinghua.edu.cn", "mqiao@stanford.edu"], "title": "On Generalization Error Bounds of Noisy Gradient Methods for Non-Convex Learning", "original_pdf": "/attachment/08be5dda128ae3d9dcdba8dc966d38ed3402992f.pdf", "pdf": "/pdf/9020c8430bc5dee65249d687606e6bee5e08ed56.pdf", 
"abstract": "Generalization error (also known as the out-of-sample error) measures how well the hypothesis learned from training data generalizes to previously unseen data. Proving tight generalization error bounds is a central question in statistical learning theory. In this paper, we obtain generalization error bounds for learning general non-convex objectives, which has attracted significant attention in recent years. We develop a new framework, termed Bayes-Stability, for proving algorithm-dependent generalization error bounds. The new framework combines ideas from both the PAC-Bayesian theory and the notion of algorithmic stability. Applying the Bayes-Stability method, we obtain new data-dependent generalization bounds for stochastic gradient Langevin dynamics (SGLD) and several other noisy gradient methods (e.g., with momentum, mini-batch and acceleration, Entropy-SGD). Our result recovers (and is typically tighter than) a recent result in Mou et al. (2018) and improves upon the results in Pensia et al. (2018). Our experiments demonstrate that our data-dependent bounds can distinguish randomly labelled data from normal data, which provides an explanation to the intriguing phenomena observed in Zhang et al. (2017a). We also study the setting where the total loss is the sum of a bounded loss and an additiona l`2 regularization term. We obtain new generalization bounds for the continuous Langevin dynamic in this setting by developing a new Log-Sobolev inequality for the parameter distribution at any time. Our new bounds are more desirable when the noise level of the processis not very small, and do not become vacuous even when T tends to infinity.", "full_presentation_video": ""}, "forum": "SkxxtgHKPS", "id": "SkxxtgHKPS"}, "SJx1URNKwH": {"content": {"appendix": "", "TL;DR": "Video retargeting typically requires large amount of target data to be effective, which may not always be available; we propose a metalearning approach that improves over popular baselines while producing temporally coherent frames.", "keywords": ["adversarial", "fewshot learning", "gan", "generative models", "meta learning", "unsupervised"], "paperhash": "lee|metapix_fewshot_video_retargeting", "spotlight_video": "", "poster": "", "slides": "", "abstract": "We address the task of unsupervised retargeting of human actions from one video to another. We consider the challenging setting where only a few frames of the target is available. The core of our approach is a conditional generative model that can transcode input skeletal poses (automatically extracted with an off-the-shelf pose estimator) to output target frames. However, it is challenging to build a universal transcoder because humans can appear wildly different due to clothing and background scene geometry. Instead, we learn to adapt \u2013 or personalize \u2013 a universal generator to the particular human and background in the target. To do so, we make use of meta-learning to discover effective strategies for on-the-fly personalization. One significant benefit of meta-learning is that the personalized transcoder naturally enforces temporal coherence across its generated frames; all frames contain consistent clothing and background geometry of the target. 
We experiment on in-the-wild internet videos and images and show our approach improves over widely-used baselines for the task.\n", "_bibtex": "@inproceedings{\nLee2020MetaPix:,\ntitle={MetaPix: Few-Shot Video Retargeting},\nauthor={Jessica Lee and Deva Ramanan and Rohit Girdhar},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=SJx1URNKwH}\n}", "authorids": ["jl5@cs.cmu.edu", "deva@cs.cmu.edu", "rgirdhar@cs.cmu.edu"], "title": "MetaPix: Few-Shot Video Retargeting", "authors": ["Jessica Lee", "Deva Ramanan", "Rohit Girdhar"], "original_pdf": "/attachment/860134d07533593abcbb74b5e4d3b3389484db1e.pdf", "pdf": "/pdf/a63a19c5457caf7a6647d054cd03a139124fb7b4.pdf", "full_presentation_video": ""}, "forum": "SJx1URNKwH", "id": "SJx1URNKwH"}, "BygXFkSYDH": {"content": {"appendix": "", "keywords": ["autoencoder", "generalization", "regularization", "representation learning", "stability", "unsupervised"], "paperhash": "jarrett|targetembedding_autoencoders_for_supervised_representation_learning", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Daniel Jarrett", "Mihaela van der Schaar"], "_bibtex": "@inproceedings{\nJarrett2020Target-Embedding,\ntitle={Target-Embedding Autoencoders for Supervised Representation Learning},\nauthor={Daniel Jarrett and Mihaela van der Schaar},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=BygXFkSYDH}\n}", "authorids": ["daniel.jarrett@eng.ox.ac.uk", "mv472@damtp.cam.ac.uk"], "title": "Target-Embedding Autoencoders for Supervised Representation Learning", "original_pdf": "/attachment/e226648ed2a0cce6da22ae24106432c7d9df87f0.pdf", "pdf": "/pdf/faa226acb48cee96a11ff11e7382d4999f80dc00.pdf", "abstract": "Autoencoder-based learning has emerged as a staple for disciplining representations in unsupervised and semi-supervised settings. This paper analyzes a framework for improving generalization in a purely supervised setting, where the target space is high-dimensional. We motivate and formalize the general framework of target-embedding autoencoders (TEA) for supervised prediction, learning intermediate latent representations jointly optimized to be both predictable from features as well as predictive of targets---encoding the prior that variations in targets are driven by a compact set of underlying factors. As our theoretical contribution, we provide a guarantee of generalization for linear TEAs by demonstrating uniform stability, interpreting the benefit of the auxiliary reconstruction task as a form of regularization. As our empirical contribution, we extend validation of this approach beyond existing static classification applications to multivariate sequence forecasting, verifying their advantage on both linear and nonlinear recurrent architectures---thereby underscoring the further generality of this framework beyond feedforward instantiations.", "full_presentation_video": ""}, "forum": "BygXFkSYDH", "id": "BygXFkSYDH"}, "Hye1kTVFDS": {"content": {"appendix": "", "TL;DR": "Training agents with adaptive computation based on information bottleneck can promote generalization. 
", "keywords": ["compression", "generalization", "information bottleneck", "optimization", "planning", "reinforcement learning", "variational inference", "variational information bottleneck"], "paperhash": "goyal|the_variational_bandwidth_bottleneck_stochastic_evaluation_on_an_information_budget", "spotlight_video": "", "poster": "", "slides": "", "abstract": "In many applications, it is desirable to extract only the relevant information from complex input data, which involves making a decision about which input features are relevant.\nThe information bottleneck method formalizes this as an information-theoretic optimization problem by maintaining an optimal tradeoff between compression (throwing away irrelevant input information), and predicting the target. In many problem settings, including the reinforcement learning problems we consider in this work, we might prefer to compress only part of the input. This is typically the case when we have a standard conditioning input, such as a state observation, and a ``privileged'' input, which might correspond to the goal of a task, the output of a costly planning algorithm, or communication with another agent. In such cases, we might prefer to compress the privileged input, either to achieve better generalization (e.g., with respect to goals) or to minimize access to costly information (e.g., in the case of communication). Practical implementations of the information bottleneck based on variational inference require access to the privileged input in order to compute the bottleneck variable, so although they perform compression, this compression operation itself needs unrestricted, lossless access. In this work, we propose the variational bandwidth bottleneck, which decides for each example on the estimated value of the privileged information before seeing it, i.e., only based on the standard input, and then accordingly chooses stochastically, whether to access the privileged input or not. 
We formulate a tractable approximation to this framework and demonstrate in a series of reinforcement learning experiments that it can improve generalization and reduce access to computationally costly information.", "_bibtex": "@inproceedings{\nGoyal2020The,\ntitle={The Variational Bandwidth Bottleneck: Stochastic Evaluation on an Information Budget},\nauthor={Anirudh Goyal and Yoshua Bengio and Matthew Botvinick and Sergey Levine},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=Hye1kTVFDS}\n}", "authorids": ["anirudhgoyal9119@gmail.com", "yoshua.bengio@mila.quebec", "botvinick@google.com", "svlevine@eecs.berkeley.edu"], "title": "The Variational Bandwidth Bottleneck: Stochastic Evaluation on an Information Budget", "authors": ["Anirudh Goyal", "Yoshua Bengio", "Matthew Botvinick", "Sergey Levine"], "original_pdf": "/attachment/109951b019651e7044742206321a257ae68e94a1.pdf", "pdf": "/pdf/b40be7a245f8e1c10d5e434f960b0a68288a91b5.pdf", "full_presentation_video": ""}, "forum": "Hye1kTVFDS", "id": "Hye1kTVFDS"}, "H1gX8C4YPr": {"content": {"appendix": "", "keywords": ["distributed", "fine tuning", "imagenet", "navigation", "optimization", "reinforcement learning"], "paperhash": "wijmans|ddppo_learning_nearperfect_pointgoal_navigators_from_25_billion_frames", "code": "https://github.com/facebookresearch/habitat-api", "spotlight_video": "", "poster": "", "slides": "", "abstract": "We present Decentralized Distributed Proximal Policy Optimization (DD-PPO), a method for distributed reinforcement learning in resource-intensive simulated environments. DD-PPO is distributed (uses multiple machines), decentralized (lacks a centralized server), and synchronous (no computation is ever \"stale\"), making it conceptually simple and easy to implement. In our experiments on training virtual robots to navigate in Habitat-Sim, DD-PPO exhibits near-linear scaling -- achieving a speedup of 107x on 128 GPUs over a serial implementation. We leverage this scaling to train an agent for 2.5 Billion steps of experience (the equivalent of 80 years of human experience) -- over 6 months of GPU-time training in under 3 days of wall-clock time with 64 GPUs. \n\nThis massive-scale training not only sets the state of art on Habitat Autonomous Navigation Challenge 2019, but essentially \"solves\" the task -- near-perfect autonomous navigation in an unseen environment without access to a map, directly from an RGB-D camera and a GPS+Compass sensor. Fortuitously, error vs computation exhibits a power-law-like distribution; thus, 90% of peak performance is obtained relatively early (at 100 million steps) and relatively cheaply (under 1 day with 8 GPUs). Finally, we show that the scene understanding and navigation policies learned can be transferred to other navigation tasks -- the analog of \"ImageNet pre-training + task-specific fine-tuning\" for embodied AI. Our model outperforms ImageNet pre-trained CNNs on these transfer tasks and can serve as a universal resource (all models and code are publicly available). 
", "_bibtex": "@inproceedings{\nWijmans2020DD-PPO:,\ntitle={DD-PPO: Learning Near-Perfect PointGoal Navigators from 2.5 Billion Frames},\nauthor={Erik Wijmans and Abhishek Kadian and Ari Morcos and Stefan Lee and Irfan Essa and Devi Parikh and Manolis Savva and Dhruv Batra},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=H1gX8C4YPr}\n}", "authorids": ["etw@gatech.edu", "akadian@fb.com", "arimorcos@gmail.com", "leestef@oregonstate.edu", "irfan@gatech.edu", "parikh@gatech.edu", "msavva@sfu.ca", "dbatra@gatech.edu"], "title": "DD-PPO: Learning Near-Perfect PointGoal Navigators from 2.5 Billion Frames", "authors": ["Erik Wijmans", "Abhishek Kadian", "Ari Morcos", "Stefan Lee", "Irfan Essa", "Devi Parikh", "Manolis Savva", "Dhruv Batra"], "original_pdf": "/attachment/8692bf10f2c51feb9a5ad23c136d396e42b36153.pdf", "pdf": "/pdf/7cd7e4ba1dad47a6766dcc250d99d0a2300f2f85.pdf", "full_presentation_video": ""}, "forum": "H1gX8C4YPr", "id": "H1gX8C4YPr"}, "rygfnn4twS": {"content": {"appendix": "", "TL;DR": "Accurate, Fast and Automated Kernel-Wise Neural Network Quantization with Mixed Precision using Hierarchical Deep Reinforcement Learning", "keywords": ["automl", "cnn", "quantization", "reinforcement learning"], "paperhash": "lou|autoq_automated_kernelwise_neural_network_quantization", "spotlight_video": "", "poster": "", "slides": "", "abstract": "Network quantization is one of the most hardware friendly techniques to enable the deployment of convolutional neural networks (CNNs) on low-power mobile devices. Recent network quantization techniques quantize each weight kernel in a convolutional layer independently for higher inference accuracy, since the weight kernels in a layer exhibit different variances and hence have different amounts of redundancy. The quantization bitwidth or bit number (QBN) directly decides the inference accuracy, latency, energy and hardware overhead. To effectively reduce the redundancy and accelerate CNN inferences, various weight kernels should be quantized with different QBNs. However, prior works use only one QBN to quantize each convolutional layer or the entire CNN, because the design space of searching a QBN for each weight kernel is too large. The hand-crafted heuristic of the kernel-wise QBN search is so sophisticated that domain experts can obtain only sub-optimal results. It is difficult for even deep reinforcement learning (DRL) DDPG-based agents to find a kernel-wise QBN configuration that can achieve reasonable inference accuracy. In this paper, we propose a hierarchical-DRL-based kernel-wise network quantization technique, AutoQ, to automatically search a QBN for each weight kernel, and choose another QBN for each activation layer. 
Compared to the models quantized by the state-of-the-art DRL-based schemes, on average, the same models quantized by AutoQ reduce the inference latency by 54.06%, and decrease the inference energy consumption by 50.69%, while achieving the same inference accuracy.", "_bibtex": "@inproceedings{\nLou2020AutoQ:,\ntitle={AutoQ: Automated Kernel-Wise Neural Network Quantization },\nauthor={Qian Lou and Feng Guo and Minje Kim and Lantao Liu and Lei Jiang.},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=rygfnn4twS}\n}", "authorids": ["louqian@iu.edu", "fengguo@iu.edu", "minje@indiana.edu", "lantao@iu.edu", "jiang60@iu.edu"], "title": "AutoQ: Automated Kernel-Wise Neural Network Quantization ", "authors": ["Qian Lou", "Feng Guo", "Minje Kim", "Lantao Liu", "Lei Jiang."], "original_pdf": "/attachment/d93360a3632d527f1cf25b3a31d9f7da4c24f84d.pdf", "pdf": "/pdf/161683e53131806d4a14c406230d1c65f415c1f9.pdf", "full_presentation_video": ""}, "forum": "rygfnn4twS", "id": "rygfnn4twS"}, "HygsuaNFwr": {"content": {"appendix": "", "TL;DR": "The notion of order learning is proposed and it is applied to regression problems in computer vision", "keywords": ["unsupervised"], "paperhash": "lim|order_learning_and_its_application_to_age_estimation", "code": "https://github.com/changsukim-ku/order-learning", "spotlight_video": "", "authorids": ["kslim@mcl.korea.ac.kr", "nhshin@mcl.korea.ac.kr", "yy77lee@gmail.com", "changsukim@korea.ac.kr"], "poster": "", "slides": "", "authors": ["Kyungsun Lim", "Nyeong-Ho Shin", "Young-Yoon Lee", "Chang-Su Kim"], "_bibtex": "@inproceedings{\nLim2020Order,\ntitle={Order Learning and Its Application to Age Estimation},\nauthor={Kyungsun Lim and Nyeong-Ho Shin and Young-Yoon Lee and Chang-Su Kim},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=HygsuaNFwr}\n}", "original_pdf": "/attachment/930084ca12cdd4e920a96e287f92964bee4803d1.pdf", "title": "Order Learning and Its Application to Age Estimation", "pdf": "/pdf/73028cd236d7d7b460edd0121f9cf497c82c9304.pdf", "abstract": "We propose order learning to determine the order graph of classes, representing ranks or priorities, and classify an object instance into one of the classes. To this end, we design a pairwise comparator to categorize the relationship between two instances into one of three cases: one instance is `greater than,' `similar to,' or `smaller than' the other. Then, by comparing an input instance with reference instances and maximizing the consistency among the comparison results, the class of the input can be estimated reliably. We apply order learning to develop a facial age estimator, which provides the state-of-the-art performance. 
Moreover, the performance is further improved when the order graph is divided into disjoint chains using gender and ethnic group information or even in an unsupervised manner.", "full_presentation_video": ""}, "forum": "HygsuaNFwr", "id": "HygsuaNFwr"}, "HklRwaEKwB": {"content": {"appendix": "", "TL;DR": "We study the structure of ridge regression in a high-dimensional asymptotic framework, and get insights about cross-validation and sketching.", "keywords": ["regression", "regularization"], "paperhash": "liu|ridge_regression_structure_crossvalidation_and_sketching", "code": "https://github.com/liusf15/RidgeRegression", "spotlight_video": "", "authorids": ["sfliu@stanford.edu", "dobribanedgar@gmail.com"], "poster": "", "slides": "", "authors": ["Sifan Liu", "Edgar Dobriban"], "_bibtex": "@inproceedings{\nLiu2020Ridge,\ntitle={Ridge Regression: Structure, Cross-Validation, and Sketching},\nauthor={Sifan Liu and Edgar Dobriban},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=HklRwaEKwB}\n}", "original_pdf": "/attachment/93023a7889d9100ddba9f147df77e93d85b4dc78.pdf", "title": "Ridge Regression: Structure, Cross-Validation, and Sketching", "pdf": "/pdf/455671549ea589bd0c09c5457c52062150923fca.pdf", "abstract": "We study the following three fundamental problems about ridge regression: (1) what is the structure of the estimator? (2) how to correctly use cross-validation to choose the regularization parameter? and (3) how to accelerate computation without losing too much accuracy? We consider the three problems in a unified large-data linear model. We give a precise representation of ridge regression as a covariance matrix-dependent linear combination of the true parameter and the noise. \nWe study the bias of $K$-fold cross-validation for choosing the regularization parameter, and propose a simple bias-correction. We analyze the accuracy of primal and dual sketching for ridge regression, showing they are surprisingly accurate. Our results are illustrated by simulations and by analyzing empirical data.", "full_presentation_video": ""}, "forum": "HklRwaEKwB", "id": "HklRwaEKwB"}, "S1xFl64tDr": {"content": {"appendix": "", "keywords": ["adversarial", "privacy"], "paperhash": "xiang|interpretable_complexvalued_neural_networks_for_privacy_protection", "spotlight_video": "", "poster": "", "slides": "", "abstract": "Previous studies have found that an adversary attacker can often infer unintended input information from intermediate-layer features. We study the possibility of preventing such adversarial inference, yet without too much accuracy degradation. We propose a generic method to revise the neural network to boost the challenge of inferring input attributes from features, while maintaining highly accurate outputs. In particular, the method transforms real-valued features into complex-valued ones, in which the input is hidden in a randomized phase of the transformed features. The knowledge of the phase acts like a key, with which any party can easily recover the output from the processing result, but without which the party can neither recover the output nor distinguish the original input. 
Preliminary experiments on various datasets and network structures have shown that our method significantly diminishes the adversary's ability to infer the input while largely preserving the resulting accuracy.", "_bibtex": "@inproceedings{\nXiang2020Interpretable,\ntitle={Interpretable Complex-Valued Neural Networks for Privacy Protection},\nauthor={Liyao Xiang and Hao Zhang and Haotian Ma and Yifan Zhang and Jie Ren and Quanshi Zhang},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=S1xFl64tDr}\n}", "authorids": ["xiangliyao08@sjtu.edu.cn", "1603023-zh@sjtu.edu.cn", "11612807@mail.sustc.edu.cn", "zhangyf_sjtu@sjtu.edu.cn", "ariesrj@sjtu.edu.cn", "zqs1022@sjtu.edu.cn"], "title": "Interpretable Complex-Valued Neural Networks for Privacy Protection", "authors": ["Liyao Xiang", "Hao Zhang", "Haotian Ma", "Yifan Zhang", "Jie Ren", "Quanshi Zhang"], "original_pdf": "/attachment/accb16f16c3413c8bbf6235964977fa63743849d.pdf", "pdf": "/pdf/de30952f44a977af1588f9a66f607e702d9bffcb.pdf", "full_presentation_video": ""}, "forum": "S1xFl64tDr", "id": "S1xFl64tDr"}, "SkgC6TNFvr": {"content": {"appendix": "", "TL;DR": "Learning a labeling policy with reinforcement learning to reduce labeling effort for the task of semantic segmentation", "keywords": ["active learning", "reinforcement learning", "semantic segmentation"], "paperhash": "casanova|reinforced_active_learning_for_image_segmentation", "spotlight_video": "", "poster": "", "slides": "", "abstract": "Learning-based approaches for semantic segmentation have two inherent challenges. First, acquiring pixel-wise labels is expensive and time-consuming. Second, realistic segmentation datasets are highly unbalanced: some categories are much more abundant than others, biasing the performance to the most represented ones. In this paper, we are interested in focusing human labelling effort on a small subset of a larger pool of data, minimizing this effort while maximizing performance of a segmentation model on a hold-out set. We present a new active learning strategy for semantic segmentation based on deep reinforcement learning (RL). An agent learns a policy to select a subset of small informative image regions -- as opposed to entire images -- to be labeled, from a pool of unlabeled data. The region selection decision is made based on predictions and uncertainties of the segmentation model being trained. Our method proposes a new modification of the deep Q-network (DQN) formulation for active learning, adapting it to the large-scale nature of semantic segmentation problems. We test the proof of concept in CamVid and provide results in the large-scale dataset Cityscapes. On Cityscapes, our deep RL region-based DQN approach requires roughly 30% less additional labeled data than our most competitive baseline to reach the same performance. Moreover, we find that our method asks for more labels of under-represented categories compared to the baselines, improving their performance and helping to mitigate class imbalance.", "_bibtex": "@inproceedings{\nCasanova2020Reinforced,\ntitle={Reinforced active learning for image segmentation},\nauthor={Arantxa Casanova and Pedro O. Pinheiro and Negar Rostamzadeh and Christopher J. 
Pal},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=SkgC6TNFvr}\n}", "authorids": ["arantxa.casanova-paga@polymtl.ca", "pedro@opinheiro.com", "negar@elementai.com", "chris.j.pal@gmail.com"], "title": "Reinforced active learning for image segmentation", "authors": ["Arantxa Casanova", "Pedro O. Pinheiro", "Negar Rostamzadeh", "Christopher J. Pal"], "original_pdf": "/attachment/e8398606e4c8e6b824a527346bc1a22fb89fc3bf.pdf", "pdf": "/pdf/cddf43c162f589c4d79c0991849cca940858fcae.pdf", "full_presentation_video": ""}, "forum": "SkgC6TNFvr", "id": "SkgC6TNFvr"}, "Hkx6hANtwH": {"content": {"appendix": "", "TL;DR": "We have presented LambdaNet, a neural architecture for type inference that combines the strength of explicit program analysis with graph neural networks.", "keywords": ["graph networks", "nlp"], "paperhash": "wei|lambdanet_probabilistic_type_inference_using_graph_neural_networks", "code": "https://github.com/MrVPlusOne/LambdaNet", "spotlight_video": "", "authorids": ["jiayi@cs.utexas.edu", "maruth@utexas.edu", "gdurrett@cs.utexas.edu", "isil@cs.utexas.edu"], "poster": "", "slides": "", "authors": ["Jiayi Wei", "Maruth Goyal", "Greg Durrett", "Isil Dillig"], "_bibtex": "@inproceedings{\nWei2020LambdaNet:,\ntitle={LambdaNet: Probabilistic Type Inference using Graph Neural Networks},\nauthor={Jiayi Wei and Maruth Goyal and Greg Durrett and Isil Dillig},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=Hkx6hANtwH}\n}", "original_pdf": "/attachment/d7b27e1bb1c6ed4881d721d728b2abb4d41488dc.pdf", "title": "LambdaNet: Probabilistic Type Inference using Graph Neural Networks", "pdf": "/pdf/86657ef3180cce2cdbdc9e6f22aeef474f4d60de.pdf", "abstract": "As gradual typing becomes increasingly popular in languages like Python and TypeScript, there is a growing need to infer type annotations automatically. While type annotations help with tasks like code completion and static error catching, these annotations cannot be fully inferred by compilers and are tedious to annotate by hand. This paper proposes a probabilistic type inference scheme for TypeScript based on a graph neural network. Our approach first uses lightweight source code analysis to generate a program abstraction called a type dependency graph, which links type variables with logical constraints as well as name and usage information. Given this program abstraction, we then use a graph neural network to propagate information between related type variables and eventually make type predictions. Our neural architecture can predict both standard types, like number or string, as well as user-defined types that have not been encountered during training. Our experimental results show that our approach outperforms prior work in this space by 14% (absolute) on library types, while having the ability to make type predictions that are out of scope for existing techniques. 
", "full_presentation_video": ""}, "forum": "Hkx6hANtwH", "id": "Hkx6hANtwH"}, "BkxRRkSKwr": {"content": {"appendix": "", "TL;DR": "We propose measurement of phrase importance and algorithms for hierarchical explanation of neural sequence model predictions", "keywords": ["interpretability", "nlp", "transformer"], "paperhash": "jin|towards_hierarchical_importance_attribution_explaining_compositional_semantics_for_neural_sequence_models", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Xisen Jin", "Zhongyu Wei", "Junyi Du", "Xiangyang Xue", "Xiang Ren"], "_bibtex": "@inproceedings{\nJin2020Towards,\ntitle={Towards Hierarchical Importance Attribution: Explaining Compositional Semantics for Neural Sequence Models},\nauthor={Xisen Jin and Zhongyu Wei and Junyi Du and Xiangyang Xue and Xiang Ren},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=BkxRRkSKwr}\n}", "authorids": ["xisenjin@usc.edu", "zywei@fudan.edu.cn", "junyidu@usc.edu", "xyxue@fudan.edu.cn", "xiangren@usc.edu"], "title": "Towards Hierarchical Importance Attribution: Explaining Compositional Semantics for Neural Sequence Models", "original_pdf": "/attachment/94462e25fc4d879f70724dc698945b4c6573873e.pdf", "pdf": "/pdf/cf05abd1e4d4f8871bbcd84fcf80052e5a33a093.pdf", "abstract": "The impressive performance of neural networks on natural language processing tasks attributes to their ability to model complicated word and phrase compositions. To explain how the model handles semantic compositions, we study hierarchical explanation of neural network predictions. We identify non-additivity and context independent importance attributions within hierarchies as two desirable properties for highlighting word and phrase compositions. We show some prior efforts on hierarchical explanations, e.g. contextual decomposition, do not satisfy the desired properties mathematically, leading to inconsistent explanation quality in different models. In this paper, we start by proposing a formal and general way to quantify the importance of each word and phrase. Following the formulation, we propose Sampling and Contextual Decomposition (SCD) algorithm and Sampling and Occlusion (SOC) algorithm. Human and metrics evaluation on both LSTM models and BERT Transformer models on multiple datasets show that our algorithms outperform prior hierarchical explanation algorithms. Our algorithms help to visualize semantic composition captured by models, extract classification rules and improve human trust of models.", "full_presentation_video": ""}, "forum": "BkxRRkSKwr", "id": "BkxRRkSKwr"}, "SygpC6Ntvr": {"content": {"appendix": "", "TL;DR": "We propose an approach to learn sparse high dimensional representations that are fast to search, by incorporating a surrogate of the number of operations directly into the loss function.", "keywords": ["capacity", "distributed", "locality sensitive hashing", "metric learning", "quantization", "regularization", "representation learning"], "paperhash": "paria|minimizing_flops_to_learn_efficient_sparse_representations", "code": "https://github.com/biswajitsc/sparse-embed", "spotlight_video": "", "authorids": ["bparia@cs.cmu.edu", "cjyeh@cs.cmu.edu", "a061105@gmail.com", "ningxu01@gmail.com", "pradeepr@cs.cmu.edu", "bapoczos@cs.cmu.edu"], "poster": "", "slides": "", "authors": ["Biswajit Paria", "Chih-Kuan Yeh", "Ian E.H. 
Yen", "Ning Xu", "Pradeep Ravikumar", "Barnab\u00e1s P\u00f3czos"], "_bibtex": "@inproceedings{\nParia2020Minimizing,\ntitle={Minimizing FLOPs to Learn Efficient Sparse Representations},\nauthor={Biswajit Paria and Chih-Kuan Yeh and Ian E.H. Yen and Ning Xu and Pradeep Ravikumar and Barnab\u00e1s P\u00f3czos},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=SygpC6Ntvr}\n}", "original_pdf": "/attachment/55832982cb98e24d7307fd6f476267ae9bff1045.pdf", "title": "Minimizing FLOPs to Learn Efficient Sparse Representations", "pdf": "/pdf/91215045fcdc274bcbe6cb882ecc9f9c95e39d5b.pdf", "abstract": "Deep representation learning has become one of the most widely adopted approaches for visual search, recommendation, and identification. Retrieval of such representations from a large database is however computationally challenging. Approximate methods based on learning compact representations, have been widely explored for this problem, such as locality sensitive hashing, product quantization, and PCA. In this work, in contrast to learning compact representations, we propose to learn high dimensional and sparse representations that have similar representational capacity as dense embeddings while being more efficient due to sparse matrix multiplication operations which can be much faster than dense multiplication. Following the key insight that the number of operations decreases quadratically with the sparsity of embeddings provided the non-zero entries are distributed uniformly across dimensions, we propose a novel approach to learn such distributed sparse embeddings via the use of a carefully constructed regularization function that directly minimizes a continuous relaxation of the number of floating-point operations (FLOPs) incurred during retrieval. Our experiments show that our approach is competitive to the other baselines and yields a similar or better speed-vs-accuracy tradeoff on practical datasets", "full_presentation_video": ""}, "forum": "SygpC6Ntvr", "id": "SygpC6Ntvr"}, "r1lfF2NYvH": {"content": {"appendix": "", "keywords": ["generalization", "mutual information", "nlp", "representation learning", "unsupervised"], "paperhash": "sun|infograph_unsupervised_and_semisupervised_graphlevel_representation_learning_via_mutual_information_maximization", "code": "https://github.com/fanyun-sun/InfoGraph", "spotlight_video": "", "poster": "", "slides": "", "abstract": "This paper studies learning the representations of whole graphs in both unsupervised and semi-supervised scenarios. Graph-level representations are critical in a variety of real-world applications such as predicting the properties of molecules and community analysis in social networks. Traditional graph kernel based methods are simple, yet effective for obtaining fixed-length representations for graphs but they suffer from poor generalization due to hand-crafted designs. There are also some recent methods based on language models (e.g. graph2vec) but they tend to only consider certain substructures (e.g. subtrees) as graph representatives. Inspired by recent progress of unsupervised representation learning, in this paper we proposed a novel method called InfoGraph for learning graph-level representations. We maximize the mutual information between the graph-level representation and the representations of substructures of different scales (e.g., nodes, edges, triangles). 
By doing so, the graph-level representations encode aspects of the data that are shared across different scales of substructures. Furthermore, we further propose InfoGraph*, an extension of InfoGraph for semisupervised scenarios. InfoGraph* maximizes the mutual information between unsupervised graph representations learned by InfoGraph and the representations learned by existing supervised methods. As a result, the supervised encoder learns from unlabeled data while preserving the latent semantic space favored by the current supervised task. Experimental results on the tasks of graph classification and molecular property prediction show that InfoGraph is superior to state-of-the-art baselines and InfoGraph* can achieve performance competitive with state-of-the-art semi-supervised models.", "_bibtex": "@inproceedings{\nSun2020InfoGraph:,\ntitle={InfoGraph: Unsupervised and Semi-supervised Graph-Level Representation Learning via Mutual Information Maximization},\nauthor={Fan-Yun Sun and Jordan Hoffman and Vikas Verma and Jian Tang},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=r1lfF2NYvH}\n}", "authorids": ["sunfanyun@gmail.com", "jhoffmann@g.harvard.edu", "vikasverma.iitm@gmail.com", "jian.tang@hec.ca"], "title": "InfoGraph: Unsupervised and Semi-supervised Graph-Level Representation Learning via Mutual Information Maximization", "authors": ["Fan-Yun Sun", "Jordan Hoffman", "Vikas Verma", "Jian Tang"], "original_pdf": "/attachment/cea3381f36450012704b34f5a71c87d899d2249f.pdf", "pdf": "/pdf/af171fb8c60fa180c4dcf349ccc51ff006211216.pdf", "full_presentation_video": ""}, "forum": "r1lfF2NYvH", "id": "r1lfF2NYvH"}, "SJlVY04FwH": {"content": {"appendix": "", "TL;DR": "We systematically analyze the convergence of popular gradient algorithms for solving bilinear games, with both simultaneous and alternating updates.", "keywords": ["adversarial", "attention", "gan", "generative models", "optimization"], "paperhash": "zhang|convergence_of_gradient_methods_on_bilinear_zerosum_games", "code": "https://github.com/Gordon-Guojun-Zhang/ICLR-2020", "spotlight_video": "", "authorids": ["guojun.zhang@uwaterloo.ca", "yaoliang.yu@uwaterloo.ca"], "poster": "", "slides": "", "authors": ["Guojun Zhang", "Yaoliang Yu"], "_bibtex": "@inproceedings{\nZhang2020Convergence,\ntitle={Convergence of Gradient Methods on Bilinear Zero-Sum Games},\nauthor={Guojun Zhang and Yaoliang Yu},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=SJlVY04FwH}\n}", "original_pdf": "/attachment/9492cb48fda4996a2b7ebb2bedec9e545aef5cdc.pdf", "title": "Convergence of Gradient Methods on Bilinear Zero-Sum Games", "pdf": "/pdf/c6b27aac7da2d6da7c2054dfe64270f068366b49.pdf", "abstract": "Min-max formulations have attracted great attention in the ML community due to the rise of deep generative models and adversarial methods, while understanding the dynamics of gradient algorithms for solving such formulations has remained a grand challenge. As a first step, we restrict to bilinear zero-sum games and give a systematic analysis of popular gradient updates, for both simultaneous and alternating versions. We provide exact conditions for their convergence and find the optimal parameter setup and convergence rates. 
In particular, our results offer formal evidence that alternating updates converge \"better\" than simultaneous ones.", "full_presentation_video": ""}, "forum": "SJlVY04FwH", "id": "SJlVY04FwH"}, "SJx-j64FDr": {"content": {"appendix": "", "TL;DR": "Formal analysis of Binarized Neural Networks ", "keywords": ["generation", "reasoning", "verification"], "paperhash": "narodytska|in_search_for_a_satfriendly_binarized_neural_network_architecture", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Nina Narodytska", "Hongce Zhang", "Aarti Gupta", "Toby Walsh"], "_bibtex": "@inproceedings{\nNarodytska2020In,\ntitle={In Search for a SAT-friendly Binarized Neural Network Architecture},\nauthor={Nina Narodytska and Hongce Zhang and Aarti Gupta and Toby Walsh},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=SJx-j64FDr}\n}", "authorids": ["n.narodytska@gmail.com", "hongcez@princeton.edu", "aartig@cs.princeton.edu", "toby.walsh@data61.csiro.au"], "title": "In Search for a SAT-friendly Binarized Neural Network Architecture", "original_pdf": "/attachment/3fc53365408e96b237130a7950d50c711d6eb3fc.pdf", "pdf": "/pdf/44fb42f333f0090b500e456a747a7ce8b1a57ffd.pdf", "abstract": "Analyzing the behavior of neural networks is one of the most pressing challenges in deep learning. Binarized Neural Networks are an important class of networks that allow equivalent representation in Boolean logic and can be analyzed formally with logic-based reasoning tools like SAT solvers. Such tools can be used to answer existential and probabilistic queries about the network, perform explanation generation, etc. However, the main bottleneck for all methods is their ability to reason about large BNNs efficiently. In this work, we analyze architectural design choices of BNNs and discuss how they affect the performance of logic-based reasoners. We propose changes to the BNN architecture and the training procedure to get a simpler network for SAT solvers without sacrificing accuracy on the primary task. 
Our experimental results demonstrate that our approach scales to larger deep neural networks compared to existing work for existential and probabilistic queries, leading to significant speed ups on all tested datasets.\n", "full_presentation_video": ""}, "forum": "SJx-j64FDr", "id": "SJx-j64FDr"}, "H1lBj2VFPS": {"content": {"appendix": "", "TL;DR": "We introduce an efficient quantization process that allows for performance acceleration on specialized integer-only neural network accelerator.", "keywords": ["acceleration", "batch normalization", "quantization"], "paperhash": "zhao|linear_symmetric_quantization_of_neural_networks_for_lowprecision_integer_hardware", "code": "https://anonymous.4open.science/r/c05a5b6a-1d0c-4201-926f-e7b52034f7a5/", "spotlight_video": "", "authorids": ["zhaoxiandong@ict.ac.cn", "wangying2009@ict.ac.cn", "caixuyi18s@ict.ac.cn", "liucheng@ict.ac.cn", "zlei@ict.ac.cn"], "poster": "", "slides": "", "authors": ["Xiandong Zhao", "Ying Wang", "Xuyi Cai", "Cheng Liu", "Lei Zhang"], "_bibtex": "@inproceedings{\nZhao2020Linear,\ntitle={Linear Symmetric Quantization of Neural Networks for Low-precision Integer Hardware},\nauthor={Xiandong Zhao and Ying Wang and Xuyi Cai and Cheng Liu and Lei Zhang},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=H1lBj2VFPS}\n}", "original_pdf": "/attachment/019ad49b7d510961532f62f1dd184434d19415e5.pdf", "title": "Linear Symmetric Quantization of Neural Networks for Low-precision Integer Hardware", "pdf": "/pdf/e33b3876da00219dcd4120fa7a53b8654123c0aa.pdf", "abstract": "With the proliferation of specialized neural network processors that operate on low-precision integers, the performance of Deep Neural Network inference becomes increasingly dependent on the result of quantization. Despite plenty of prior work on the quantization of weights or activations for neural networks, there is still a wide gap between the software quantizers and the low-precision accelerator implementation, which degrades either the efficiency of networks or that of the hardware for the lack of software and hardware coordination at design-phase. In this paper, we propose a learned linear symmetric quantizer for integer neural network processors, which not only quantizes neural parameters and activations to low-bit integer but also accelerates hardware inference by using batch normalization fusion and low-precision accumulators (e.g., 16-bit) and multipliers (e.g., 4-bit). We use a unified way to quantize weights and activations, and the results outperform many previous approaches for various networks such as AlexNet, ResNet, and lightweight models like MobileNet while keeping friendly to the accelerator architecture. Additional, we also apply the method to object detection models and witness high performance and accuracy in YOLO-v2. Finally, we deploy the quantized models on our specialized integer-arithmetic-only DNN accelerator to show the effectiveness of the proposed quantizer. We show that even with linear symmetric quantization, the results can be better than asymmetric or non-linear methods in 4-bit networks. 
In evaluation, the proposed quantizer induces less than 0.4\\% accuracy drop in ResNet18, ResNet34, and AlexNet when quantizing the whole network as required by the integer processors.", "full_presentation_video": ""}, "forum": "H1lBj2VFPS", "id": "H1lBj2VFPS"}, "SJgVHkrYDH": {"content": {"appendix": "", "TL;DR": "Graph-based recurrent retriever that learns to retrieve reasoning paths over Wikipedia Graph outperforms the most recent state of the art on HotpotQA by more than 14 points.", "keywords": ["question answering", "reasoning", "robustness"], "paperhash": "asai|learning_to_retrieve_reasoning_paths_over_wikipedia_graph_for_question_answering", "code": "https://github.com/AkariAsai/learning_to_retrieve_reasoning_paths", "spotlight_video": "", "authorids": ["akari@cs.washington.edu", "k.hashimoto@salesforce.com", "hannaneh@washington.edu", "richard@socher.org", "cxiong@salesforce.com"], "poster": "", "slides": "", "authors": ["Akari Asai", "Kazuma Hashimoto", "Hannaneh Hajishirzi", "Richard Socher", "Caiming Xiong"], "_bibtex": "@inproceedings{\nAsai2020Learning,\ntitle={Learning to Retrieve Reasoning Paths over Wikipedia Graph for Question Answering},\nauthor={Akari Asai and Kazuma Hashimoto and Hannaneh Hajishirzi and Richard Socher and Caiming Xiong},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=SJgVHkrYDH}\n}", "original_pdf": "/attachment/2f2f9627af27190013bf543bde66e5a891dfbf12.pdf", "title": "Learning to Retrieve Reasoning Paths over Wikipedia Graph for Question Answering", "pdf": "/pdf/98c90d0278593da54e83f1f5273782c6534da212.pdf", "abstract": "Answering questions that require multi-hop reasoning at web-scale necessitates retrieving multiple evidence documents, one of which often has little lexical or semantic relationship to the question. This paper introduces a new graph-based recurrent retrieval approach that learns to retrieve reasoning paths over the Wikipedia graph to answer multi-hop open-domain questions. Our retriever model trains a recurrent neural network that learns to sequentially retrieve evidence paragraphs in the reasoning path by conditioning on the previously retrieved documents. \nOur reader model ranks the reasoning paths and extracts the answer span included in the best reasoning path.\nExperimental results show state-of-the-art results in three open-domain QA datasets, showcasing the effectiveness and robustness of our method. 
Notably, our method achieves significant improvement in HotpotQA, outperforming the previous best model by more than 14 points.", "full_presentation_video": ""}, "forum": "SJgVHkrYDH", "id": "SJgVHkrYDH"}, "HkxjqxBYDB": {"content": {"appendix": "", "keywords": ["associative memory", "episodic memory", "memory", "navigation", "reasoning", "reinforcement learning", "sample efficiency"], "paperhash": "zhu|episodic_reinforcement_learning_with_associative_memory", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Guangxiang Zhu*", "Zichuan Lin*", "Guangwen Yang", "Chongjie Zhang"], "_bibtex": "@inproceedings{\nZhu*2020Episodic,\ntitle={Episodic Reinforcement Learning with Associative Memory},\nauthor={Guangxiang Zhu* and Zichuan Lin* and Guangwen Yang and Chongjie Zhang},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=HkxjqxBYDB}\n}", "authorids": ["guangxiangzhu@outlook.com", "linzc16@mails.tsinghua.edu.cn", "ygw@tsinghua.edu.cn", "chongjie@tsinghua.edu.cn"], "title": "Episodic Reinforcement Learning with Associative Memory", "original_pdf": "/attachment/b3963353655f4b6edf2061ed8928ec1fa5d38c93.pdf", "pdf": "/pdf/1512938bac7377947bfce4dab3d34b8330ea3ec7.pdf", "abstract": "Sample efficiency has been one of the major challenges for deep reinforcement learning. Non-parametric episodic control has been proposed to speed up parametric reinforcement learning by rapidly latching on previously successful policies. However, previous work on episodic reinforcement learning neglects the relationship between states and only stored the experiences as unrelated items. To improve sample efficiency of reinforcement learning, we propose a novel framework, called Episodic Reinforcement Learning with Associative Memory (ERLAM), which associates related experience trajectories to enable reasoning effective strategies. We build a graph on top of states in memory based on state transitions and develop a reverse-trajectory propagation strategy to allow rapid value propagation through the graph. We use the non-parametric associative memory as early guidance for a parametric reinforcement learning model. Results on navigation domain and Atari games show our framework achieves significantly higher sample efficiency than state-of-the-art episodic reinforcement learning models.", "full_presentation_video": ""}, "forum": "HkxjqxBYDB", "id": "HkxjqxBYDB"}, "BJliakStvH": {"content": {"appendix": "", "TL;DR": "Our method infers constraints on task execution by leveraging the principle of maximum entropy to quantify how demonstrations differ from expected, un-constrained behavior.", "keywords": ["imitation learning", "inverse reinforcement learning", "reinforcement learning"], "paperhash": "scobee|maximum_likelihood_constraint_inference_for_inverse_reinforcement_learning", "code": "https://drive.google.com/drive/folders/1h2J7o4w4J0_dpldTRpFu_jWQR8CkBbXw", "spotlight_video": "", "authorids": ["dscobee@eecs.berkeley.edu", "sastry@eecs.berkeley.edu"], "poster": "", "slides": "", "authors": ["Dexter R.R. Scobee", "S. Shankar Sastry"], "_bibtex": "@inproceedings{\nScobee2020Maximum,\ntitle={Maximum Likelihood Constraint Inference for Inverse Reinforcement Learning},\nauthor={Dexter R.R. Scobee and S. 
Shankar Sastry},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=BJliakStvH}\n}", "original_pdf": "/attachment/40b7ca2d04b1bb62b9a7cea7903d1b7f0d7239dc.pdf", "title": "Maximum Likelihood Constraint Inference for Inverse Reinforcement Learning", "pdf": "/pdf/f0b5f65775099dec7c004e553ad1b8d364e6a226.pdf", "abstract": "While most approaches to the problem of Inverse Reinforcement Learning (IRL) focus on estimating a reward function that best explains an expert agent\u2019s policy or demonstrated behavior on a control task, it is often the case that such behavior is more succinctly represented by a simple reward combined with a set of hard constraints. In this setting, the agent is attempting to maximize cumulative rewards subject to these given constraints on their behavior. We reformulate the problem of IRL on Markov Decision Processes (MDPs) such that, given a nominal model of the environment and a nominal reward function, we seek to estimate state, action, and feature constraints in the environment that motivate an agent\u2019s behavior. Our approach is based on the Maximum Entropy IRL framework, which allows us to reason about the likelihood of an expert agent\u2019s demonstrations given our knowledge of an MDP. Using our method, we can infer which constraints can be added to the MDP to most increase the likelihood of observing these demonstrations. We present an algorithm which iteratively infers the Maximum Likelihood Constraint to best explain observed behavior, and we evaluate its efficacy using both simulated behavior and recorded data of humans navigating around an obstacle.", "full_presentation_video": ""}, "forum": "BJliakStvH", "id": "BJliakStvH"}, "SJxIm0VtwH": {"content": {"appendix": "", "TL;DR": "This paper provides novel analysis of adaptive gradient algorithms for solving non-convex non-concave min-max problems as GANs, and explains the reason why adaptive gradient methods outperform its non-adaptive counterparts by empirical studies.", "keywords": ["adversarial", "optimization"], "paperhash": "liu|towards_better_understanding_of_adaptive_gradient_algorithms_in_generative_adversarial_nets", "spotlight_video": "", "poster": "", "slides": "", "abstract": "Adaptive gradient algorithms perform gradient-based updates using the history of gradients and are ubiquitous in training deep neural networks. While adaptive gradient methods theory is well understood for minimization problems, the underlying factors driving their empirical success in min-max problems such as GANs remain unclear. In this paper, we aim at bridging this gap from both theoretical and empirical perspectives. First, we analyze a variant of Optimistic Stochastic Gradient (OSG) proposed in~\\citep{daskalakis2017training} for solving a class of non-convex non-concave min-max problem and establish $O(\\epsilon^{-4})$ complexity for finding $\\epsilon$-first-order stationary point, in which the algorithm only requires invoking one stochastic first-order oracle while enjoying state-of-the-art iteration complexity achieved by stochastic extragradient method by~\\citep{iusem2017extragradient}. 
Then we propose an adaptive variant of OSG named Optimistic Adagrad (OAdagrad) and reveal an \\emph{improved} adaptive complexity $\\widetilde{O}\\left(\\epsilon^{-\\frac{2}{1-\\alpha}}\\right)$~\\footnote{Here $\\widetilde{O}(\\cdot)$ compresses a logarithmic factor of $\\epsilon$.}, where $\\alpha$ characterizes the growth rate of the cumulative stochastic gradient and $0\\leq \\alpha\\leq 1/2$. To the best of our knowledge, this is the first work for establishing adaptive complexity in non-convex non-concave min-max optimization. Empirically, our experiments show that indeed adaptive gradient algorithms outperform their non-adaptive counterparts in GAN training. Moreover, this observation can be explained by the slow growth rate of the cumulative stochastic gradient, as observed empirically.", "_bibtex": "@inproceedings{\nliu2020towards,\ntitle={Towards Better Understanding of Adaptive Gradient Algorithms in Generative Adversarial Nets},\nauthor={Mingrui Liu and Youssef Mroueh and Jerret Ross and Wei Zhang and Xiaodong Cui and Payel Das and Tianbao Yang},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=SJxIm0VtwH}\n}", "authorids": ["mingrui-liu@uiowa.edu", "mroueh@us.ibm.com", "rossja@us.ibm.com", "weiz@us.ibm.com", "cuix@us.ibm.com", "daspa@us.ibm.com", "tianbao-yang@uiowa.edu"], "title": "Towards Better Understanding of Adaptive Gradient Algorithms in Generative Adversarial Nets", "authors": ["Mingrui Liu", "Youssef Mroueh", "Jerret Ross", "Wei Zhang", "Xiaodong Cui", "Payel Das", "Tianbao Yang"], "original_pdf": "/attachment/37fc9fc009ff2d927a743d72c2a80e5b91c5c0d5.pdf", "pdf": "/pdf/0b7e14a04cf246e137c1d50f65a5cb3155c95602.pdf", "full_presentation_video": ""}, "forum": "SJxIm0VtwH", "id": "SJxIm0VtwH"}, "HJxNAnVtDS": {"content": {"appendix": "", "keywords": ["federated learning", "gradient descent", "learning rate", "optimization"], "paperhash": "li|on_the_convergence_of_fedavg_on_noniid_data", "code": "https://github.com/lx10077/fedavgpy", "spotlight_video": "", "poster": "", "slides": "", "abstract": "Federated learning enables a large amount of edge computing devices to jointly learn a model without data sharing. As a leading algorithm in this setting, Federated Averaging (\\texttt{FedAvg}) runs Stochastic Gradient Descent (SGD) in parallel on a small subset of the total devices and averages the sequences only once in a while. Despite its simplicity, it lacks theoretical guarantees under realistic settings. In this paper, we analyze the convergence of \\texttt{FedAvg} on non-iid data and establish a convergence rate of $\\mathcal{O}(\\frac{1}{T})$ for strongly convex and smooth problems, where $T$ is the number of SGDs. Importantly, our bound demonstrates a trade-off between communication-efficiency and convergence rate. As user devices may be disconnected from the server, we relax the assumption of full device participation to partial device participation and study different averaging schemes; low device participation rate can be achieved without severely slowing down the learning. Our results indicate that heterogeneity of data slows down the convergence, which matches empirical observations. 
Furthermore, we provide a necessary condition for \\texttt{FedAvg} on non-iid data: the learning rate $\\eta$ must decay, even if full-gradient is used; otherwise, the solution will be $\\Omega (\\eta)$ away from the optimal.", "_bibtex": "@inproceedings{\nLi2020On,\ntitle={On the Convergence of FedAvg on Non-IID Data},\nauthor={Xiang Li and Kaixuan Huang and Wenhao Yang and Shusen Wang and Zhihua Zhang},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=HJxNAnVtDS}\n}", "authorids": ["smslixiang@pku.edu.cn", "hackyhuang@pku.edu.cn", "yangwhsms@gmail.com", "shusen.wang@stevens.edu", "zhzhang@math.pku.edu.cn"], "title": "On the Convergence of FedAvg on Non-IID Data", "authors": ["Xiang Li", "Kaixuan Huang", "Wenhao Yang", "Shusen Wang", "Zhihua Zhang"], "original_pdf": "/attachment/7ebf4085003e4d555a0668dcc5ad1661e7ef13ba.pdf", "pdf": "/pdf/dcd68da80bc99678b1254ec0b4d49dee872c7898.pdf", "full_presentation_video": ""}, "forum": "HJxNAnVtDS", "id": "HJxNAnVtDS"}, "HJloElBYvB": {"content": {"appendix": "", "TL;DR": "We give a theoretical analysis of the Information Bottleneck objective to understand and predict observed phase transitions in the prediction vs. compression tradeoff.", "keywords": ["compression", "information bottleneck", "information theory", "loss landscape", "representation learning"], "paperhash": "wu|phase_transitions_for_the_information_bottleneck_in_representation_learning", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Tailin Wu", "Ian Fischer"], "_bibtex": "@inproceedings{\nWu2020Phase,\ntitle={Phase Transitions for the Information Bottleneck in Representation Learning},\nauthor={Tailin Wu and Ian Fischer},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=HJloElBYvB}\n}", "authorids": ["tailin@mit.edu", "iansf@google.com"], "title": "Phase Transitions for the Information Bottleneck in Representation Learning", "original_pdf": "/attachment/865210d98688510472e5da7dfe7f00e71f0de6ad.pdf", "pdf": "/pdf/cd060172f7b627cf51d4832079d5117557d67981.pdf", "abstract": "In the Information Bottleneck (IB), when tuning the relative strength between compression and prediction terms, how do the two terms behave, and what's their relationship with the dataset and the learned representation? In this paper, we set out to answer these questions by studying multiple phase transitions in the IB objective: IB_\u03b2[p(z|x)] = I(X; Z) \u2212 \u03b2I(Y; Z) defined on the encoding distribution p(z|x) for input X, target Y and representation Z, where sudden jumps of dI(Y; Z)/d\u03b2 and prediction accuracy are observed with increasing \u03b2. We introduce a definition for IB phase transitions as a qualitative change of the IB loss landscape, and show that the transitions correspond to the onset of learning new classes. Using second-order calculus of variations, we derive a formula that provides a practical condition for IB phase transitions, and draw its connection with the Fisher information matrix for parameterized models. We provide two perspectives to understand the formula, revealing that each IB phase transition is finding a component of maximum (nonlinear) correlation between X and Y orthogonal to the learned representation, in close analogy with canonical-correlation analysis (CCA) in linear settings. Based on the theory, we present an algorithm for discovering phase transition points. 
Finally, we verify that our theory and algorithm accurately predict phase transitions in categorical datasets, predict the onset of learning new classes and class difficulty in MNIST, and predict prominent phase transitions in CIFAR10.\n", "full_presentation_video": ""}, "forum": "HJloElBYvB", "id": "HJloElBYvB"}, "S1e_9xrFvS": {"content": {"appendix": "", "TL;DR": "Energy-based models trained on crystallized protein structures predict native side chain configurations and automatically discover molecular energy features.", "keywords": ["transformer"], "paperhash": "du|energybased_models_for_atomicresolution_protein_conformations", "code": "https://github.com/facebookresearch/protein-ebm", "spotlight_video": "", "authorids": ["yilundu@mit.edu", "jmeier@fb.com", "maj@fb.com", "robfergus@fb.com", "arives@cs.nyu.edu"], "poster": "", "slides": "", "authors": ["Yilun Du", "Joshua Meier", "Jerry Ma", "Rob Fergus", "Alexander Rives"], "_bibtex": "@inproceedings{\nDu2020Energy-based,\ntitle={Energy-based models for atomic-resolution protein conformations},\nauthor={Yilun Du and Joshua Meier and Jerry Ma and Rob Fergus and Alexander Rives},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=S1e_9xrFvS}\n}", "original_pdf": "/attachment/dfaa5450a616e2c3d5a08f6ff36c3616b582b117.pdf", "title": "Energy-based models for atomic-resolution protein conformations", "pdf": "/pdf/18cca0a2ddd3d7d3db696683694911d0715c5b8a.pdf", "abstract": "We propose an energy-based model (EBM) of protein conformations that operates at atomic scale. The model is trained solely on crystallized protein data. By contrast, existing approaches for scoring conformations use energy functions that incorporate knowledge of physical principles and features that are the complex product of several decades of research and tuning. To evaluate the model, we benchmark on the rotamer recovery task, the problem of predicting the conformation of a side chain from its context within a protein structure, which has been used to evaluate energy functions for protein design. The model achieves performance close to that of the Rosetta energy function, a state-of-the-art method widely used in protein structure prediction and design. An investigation of the model\u2019s outputs and hidden representations finds that it captures physicochemical properties relevant to protein energy.", "full_presentation_video": ""}, "forum": "S1e_9xrFvS", "id": "S1e_9xrFvS"}, "SJetQpEYvB": {"content": {"appendix": "", "keywords": ["graph embedding", "graph networks", "memory", "transfer learning"], "paperhash": "shi|learning_execution_through_neural_code_fusion", "code": "https://www.dropbox.com/s/yrjhx8ifowdktwh/ncf_code.zip?dl=0", "spotlight_video": "", "poster": "", "slides": "", "abstract": "As the performance of computer systems stagnates due to the end of Moore\u2019s Law,\nthere is a need for new models that can understand and optimize the execution\nof general purpose code. While there is a growing body of work on using Graph\nNeural Networks (GNNs) to learn static representations of source code, these\nrepresentations do not understand how code executes at runtime. In this work, we\npropose a new approach using GNNs to learn fused representations of general\nsource code and its execution. 
Our approach defines a multi-task GNN over\nlow-level representations of source code and program state (i.e., assembly code\nand dynamic memory states), converting complex source code constructs and data\nstructures into a simpler, more uniform format. We show that this leads to improved\nperformance over similar methods that do not use execution and it opens the door\nto applying GNN models to new tasks that would not be feasible from static code\nalone. As an illustration of this, we apply the new model to challenging dynamic\ntasks (branch prediction and prefetching) from the SPEC CPU benchmark suite,\noutperforming the state-of-the-art by 26% and 45% respectively. Moreover, we\nuse the learned fused graph embeddings to demonstrate transfer learning with high\nperformance on an indirectly related algorithm classification task.", "_bibtex": "@inproceedings{\nShi2020LEARNING,\ntitle={LEARNING EXECUTION THROUGH NEURAL CODE FUSION},\nauthor={Zhan Shi and Kevin Swersky and Daniel Tarlow and Parthasarathy Ranganathan and Milad Hashemi},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=SJetQpEYvB}\n}", "authorids": ["zshi17@cs.utexas.edu", "kswersky@google.com", "dtarlow@google.com", "parthas@google.com", "miladh@google.com"], "title": "LEARNING EXECUTION THROUGH NEURAL CODE FUSION", "authors": ["Zhan Shi", "Kevin Swersky", "Daniel Tarlow", "Parthasarathy Ranganathan", "Milad Hashemi"], "original_pdf": "/attachment/3d2c49efa4de6c2495d67de08cf42b77e1c33147.pdf", "pdf": "/pdf/607dc220d4f786c1a3b440746c261155e31143d5.pdf", "full_presentation_video": ""}, "forum": "SJetQpEYvB", "id": "SJetQpEYvB"}, "rylwJxrYDS": {"content": {"appendix": "", "TL;DR": "Learn how to quantize speech signal and apply algorithms requiring discrete inputs to audio data such as BERT.", "keywords": ["clustering", "representation learning", "self supervised learning"], "paperhash": "baevski|vqwav2vec_selfsupervised_learning_of_discrete_speech_representations", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Alexei Baevski", "Steffen Schneider", "Michael Auli"], "_bibtex": "@inproceedings{\nBaevski2020vq-wav2vec:,\ntitle={vq-wav2vec: Self-Supervised Learning of Discrete Speech Representations},\nauthor={Alexei Baevski and Steffen Schneider and Michael Auli},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=rylwJxrYDS}\n}", "authorids": ["alexei.b@gmail.com", "stes@fb.com", "michael.auli@gmail.com"], "title": "vq-wav2vec: Self-Supervised Learning of Discrete Speech Representations", "original_pdf": "/attachment/410980b52776c4a8c8a87309e55b0b0da2c36caa.pdf", "pdf": "/pdf/0a5ac6d85b01d047385eff9fc4507ef6fc067b1d.pdf", "abstract": "We propose vq-wav2vec to learn discrete representations of audio segments through a wav2vec-style self-supervised context prediction task. The algorithm uses either a gumbel softmax or online k-means clustering to quantize the dense representations. Discretization enables the direct application of algorithms from the NLP community which require discrete inputs. 
Experiments show that BERT pre-training achieves a new state of the art on TIMIT phoneme classification and WSJ speech recognition.", "full_presentation_video": ""}, "forum": "rylwJxrYDS", "id": "rylwJxrYDS"}, "BkluqlSFDS": {"content": {"appendix": "", "TL;DR": "Communication efficient federated learning with layer-wise matching", "keywords": ["cnn", "federated learning"], "paperhash": "wang|federated_learning_with_matched_averaging", "code": "https://github.com/IBM/FedMA", "spotlight_video": "", "authorids": ["hongyiwang@cs.wisc.edu", "mikhail.yurochkin@ibm.com", "yuekai@umich.edu", "dimitris@papail.io", "yasaman.khazaeni@us.ibm.com"], "poster": "", "slides": "", "authors": ["Hongyi Wang", "Mikhail Yurochkin", "Yuekai Sun", "Dimitris Papailiopoulos", "Yasaman Khazaeni"], "_bibtex": "@inproceedings{\nWang2020Federated,\ntitle={Federated Learning with Matched Averaging},\nauthor={Hongyi Wang and Mikhail Yurochkin and Yuekai Sun and Dimitris Papailiopoulos and Yasaman Khazaeni},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=BkluqlSFDS}\n}", "original_pdf": "/attachment/91e4a38abd8596c8f85df8727ad9befa1d5198e0.pdf", "title": "Federated Learning with Matched Averaging", "pdf": "/pdf/6b9ef72b07bb3390dcf6145f41df02ceffbb916e.pdf", "abstract": "Federated learning allows edge devices to collaboratively learn a shared model while keeping the training data on device, decoupling the ability to do model training from the need to store the data in the cloud. We propose Federated matched averaging (FedMA) algorithm designed for federated learning of modern neural network architectures e.g. convolutional neural networks (CNNs) and LSTMs. FedMA constructs the shared global model in a layer-wise manner by matching and averaging hidden elements (i.e. channels for convolution layers; hidden states for LSTM; neurons for fully connected layers) with similar feature extraction signatures. Our experiments indicate that FedMA not only outperforms popular state-of-the-art federated learning algorithms on deep CNN and LSTM architectures trained on real world datasets, but also reduces the overall communication burden.", "full_presentation_video": ""}, "forum": "BkluqlSFDS", "id": "BkluqlSFDS"}, "ByxaUgrFvH": {"content": {"appendix": "", "keywords": ["information bottleneck", "mutual information", "representation learning", "unsupervised"], "paperhash": "wen|mutual_information_gradient_estimation_for_representation_learning", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Liangjian Wen", "Yiji Zhou", "Lirong He", "Mingyuan Zhou", "Zenglin Xu"], "_bibtex": "@inproceedings{\nWen2020Mutual,\ntitle={Mutual Information Gradient Estimation for Representation Learning},\nauthor={Liangjian Wen and Yiji Zhou and Lirong He and Mingyuan Zhou and Zenglin Xu},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=ByxaUgrFvH}\n}", "authorids": ["wlj6816@gmail.com", "zhouyiji@outlook.com", "ronghe1217@gmail.com", "mingyuan.zhou@mccombs.utexas.edu", "zenglin@gmail.com"], "title": "Mutual Information Gradient Estimation for Representation Learning", "original_pdf": "/attachment/39912406879177b92545e396c5e43caf4353279f.pdf", "pdf": "/pdf/ea84a2f99f1dc41367d0c5c08aff02a82d9a1936.pdf", "abstract": "Mutual Information (MI) plays an important role in representation learning. However, MI is unfortunately intractable in continuous and high-dimensional settings. 
Recent advances establish tractable and scalable MI estimators to discover useful representation. However, most of the existing methods are not capable of providing an accurate estimation of MI with low-variance when the MI is large. We argue that directly estimating the gradients of MI is more appealing for representation learning than estimating MI in itself. To this end, we propose the Mutual Information Gradient Estimator (MIGE) for representation learning based on the score estimation of implicit distributions. MIGE exhibits a tight and smooth gradient estimation of MI in the high-dimensional and large-MI settings. We expand the applications of MIGE in both unsupervised learning of deep representations based on InfoMax and the Information Bottleneck method. Experimental results have indicated significant performance improvement in learning useful representation.", "full_presentation_video": ""}, "forum": "ByxaUgrFvH", "id": "ByxaUgrFvH"}, "H1ezFREtwH": {"content": {"appendix": "", "TL;DR": "We propose a novel reinforcement learning-based skill transfer and composition method that takes the agent's primitive policies to solve unseen tasks.", "keywords": ["planning", "reinforcement learning", "transfer learning"], "paperhash": "qureshi|composing_taskagnostic_policies_with_deep_reinforcement_learning", "code": "https://drive.google.com/file/d/1pbF9vMy5E3NLdOE5Id5zqzKlUesgStym/view?usp=sharing", "spotlight_video": "", "authorids": ["a1qureshi@ucsd.edu", "jjj025@eng.ucsd.edu", "y1qin@eng.ucsd.edu", "tjwest@ucsd.edu", "bboots@cs.washington.edu", "yip@ucsd.edu"], "poster": "", "slides": "", "authors": ["Ahmed H. Qureshi", "Jacob J. Johnson", "Yuzhe Qin", "Taylor Henderson", "Byron Boots", "Michael C. Yip"], "_bibtex": "@inproceedings{\nQureshi2020Composing,\ntitle={Composing Task-Agnostic Policies with Deep Reinforcement Learning},\nauthor={Ahmed H. Qureshi and Jacob J. Johnson and Yuzhe Qin and Taylor Henderson and Byron Boots and Michael C. Yip},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=H1ezFREtwH}\n}", "original_pdf": "/attachment/43e3fef7dfc61219e4b6f1f8beb9d17426bd8962.pdf", "title": "Composing Task-Agnostic Policies with Deep Reinforcement Learning", "pdf": "/pdf/d6449c3d0afd1851b14d77c73b095a010e2d83bf.pdf", "abstract": "The composition of elementary behaviors to solve challenging transfer learning problems is one of the key elements in building intelligent machines. To date, there has been plenty of work on learning task-specific policies or skills but almost no focus on composing necessary, task-agnostic skills to find a solution to new problems. In this paper, we propose a novel deep reinforcement learning-based skill transfer and composition method that takes the agent's primitive policies to solve unseen tasks. We evaluate our method in difficult cases where training policy through standard reinforcement learning (RL) or even hierarchical RL is either not feasible or exhibits high sample complexity. 
We show that our method not only transfers skills to new problem settings but also solves the challenging environments requiring both task planning and motion control with high data efficiency.", "full_presentation_video": ""}, "forum": "H1ezFREtwH", "id": "H1ezFREtwH"}, "SJleNCNtDH": {"content": {"appendix": "", "TL;DR": "We propose a formulation of intrinsic motivation that is suitable as an exploration bias in synergistic multi-agent tasks, by encouraging agents to affect the world in ways that would not be achieved if they were acting individually.", "keywords": ["intrinsic motivation", "reinforcement learning"], "paperhash": "chitnis|intrinsic_motivation_for_encouraging_synergistic_behavior", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Rohan Chitnis", "Shubham Tulsiani", "Saurabh Gupta", "Abhinav Gupta"], "_bibtex": "@inproceedings{\nChitnis2020Intrinsic,\ntitle={Intrinsic Motivation for Encouraging Synergistic Behavior},\nauthor={Rohan Chitnis and Shubham Tulsiani and Saurabh Gupta and Abhinav Gupta},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=SJleNCNtDH}\n}", "authorids": ["ronuchit@mit.edu", "shubhtuls@fb.com", "saurabhg@illinois.edu", "abhinavg@cs.cmu.edu"], "title": "Intrinsic Motivation for Encouraging Synergistic Behavior", "original_pdf": "/attachment/9d3528001556ea5327894733dd7e2ca4b8ab2359.pdf", "pdf": "/pdf/1c76c8b597ee076527d77a8b9befd563bdeb5733.pdf", "abstract": "We study the role of intrinsic motivation as an exploration bias for reinforcement learning in sparse-reward synergistic tasks, which are tasks where multiple agents must work together to achieve a goal they could not individually. Our key idea is that a good guiding principle for intrinsic motivation in synergistic tasks is to take actions which affect the world in ways that would not be achieved if the agents were acting on their own. Thus, we propose to incentivize agents to take (joint) actions whose effects cannot be predicted via a composition of the predicted effect for each individual agent. We study two instantiations of this idea, one based on the true states encountered, and another based on a dynamics model trained concurrently with the policy. While the former is simpler, the latter has the benefit of being analytically differentiable with respect to the action taken. We validate our approach in robotic bimanual manipulation and multi-agent locomotion tasks with sparse rewards; we find that our approach yields more efficient learning than both 1) training with only the sparse reward and 2) using the typical surprise-based formulation of intrinsic motivation, which does not bias toward synergistic behavior. 
Videos are available on the project webpage: https://sites.google.com/view/iclr2020-synergistic.", "full_presentation_video": ""}, "forum": "SJleNCNtDH", "id": "SJleNCNtDH"}, "BkeoaeHKDS": {"content": {"appendix": "", "TL;DR": "Given a pre-trained model, we explored the per-sample gradients of the model parameters relative to a task-specific loss, and constructed a linear model that combines gradients of model parameters and the activation of the model.", "keywords": ["representation learning"], "paperhash": "mu|gradients_as_features_for_deep_representation_learning", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Fangzhou Mu", "Yingyu Liang", "Yin Li"], "_bibtex": "@inproceedings{\nMu2020Gradients,\ntitle={Gradients as Features for Deep Representation Learning},\nauthor={Fangzhou Mu and Yingyu Liang and Yin Li},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=BkeoaeHKDS}\n}", "authorids": ["fmu@cs.wisc.edu", "yliang@cs.wisc.edu", "yin.li@wisc.edu"], "title": "Gradients as Features for Deep Representation Learning", "original_pdf": "/attachment/660c0d48cd42509386092f1e7484ee347a678a68.pdf", "pdf": "/pdf/39fae24443fad5d9c9daf948e80ad4a03be21b9e.pdf", "abstract": "We address the challenging problem of deep representation learning -- the efficient adaption of a pre-trained deep network to different tasks. Specifically, we propose to explore gradient-based features. These features are gradients of the model parameters with respect to a task-specific loss given an input sample. Our key innovation is the design of a linear model that incorporates both gradient and activation of the pre-trained network. We demonstrate that our model provides a local linear approximation to an underlying deep model, and discuss important theoretical insights. Moreover, we present an efficient algorithm for the training and inference of our model without computing the actual gradients. Our method is evaluated across a number of representation-learning tasks on several datasets and using different network architectures. 
Strong results are obtained in all settings, and are well-aligned with our theoretical insights.", "full_presentation_video": ""}, "forum": "BkeoaeHKDS", "id": "BkeoaeHKDS"}, "SJeLIgBKPS": {"content": {"appendix": "", "TL;DR": "We study the implicit bias of gradient descent and prove under a minimal set of assumptions that the parameter direction of homogeneous models converges to KKT points of a natural margin maximization problem.", "keywords": ["cnn", "gradient descent", "optimization", "regression", "regularization", "robustness"], "paperhash": "lyu|gradient_descent_maximizes_the_margin_of_homogeneous_neural_networks", "code": "https://github.com/vfleaking/max-margin", "spotlight_video": "", "authorids": ["vfleaking@gmail.com", "lijian83@mail.tsinghua.edu.cn"], "poster": "", "slides": "", "authors": ["Kaifeng Lyu", "Jian Li"], "_bibtex": "@inproceedings{\nLyu2020Gradient,\ntitle={Gradient Descent Maximizes the Margin of Homogeneous Neural Networks},\nauthor={Kaifeng Lyu and Jian Li},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=SJeLIgBKPS}\n}", "original_pdf": "/attachment/d1a24a7d706bfb9a441f0228882c153ae05937d9.pdf", "title": "Gradient Descent Maximizes the Margin of Homogeneous Neural Networks", "pdf": "/pdf/1961d0a01f9b41951a88793dcc0e818a80d108fe.pdf", "abstract": "In this paper, we study the implicit regularization of the gradient descent algorithm in homogeneous neural networks, including fully-connected and convolutional neural networks with ReLU or LeakyReLU activations. In particular, we study the gradient descent or gradient flow (i.e., gradient descent with infinitesimal step size) optimizing the logistic loss or cross-entropy loss of any homogeneous model (possibly non-smooth), and show that if the training loss decreases below a certain threshold, then we can define a smoothed version of the normalized margin which increases over time. We also formulate a natural constrained optimization problem related to margin maximization, and prove that both the normalized margin and its smoothed version converge to the objective value at a KKT point of the optimization problem. Our results generalize the previous results for logistic regression with one-layer or multi-layer linear networks, and provide more quantitative convergence results with weaker assumptions than previous results for homogeneous smooth neural networks. We conduct several experiments to justify our theoretical finding on MNIST and CIFAR-10 datasets. 
Finally, as margin is closely related to robustness, we discuss potential benefits of training longer for improving the robustness of the model.", "full_presentation_video": ""}, "forum": "SJeLIgBKPS", "id": "SJeLIgBKPS"}, "ryl3ygHYDB": {"content": {"appendix": "", "TL;DR": "We study a multi-layer generalization of the magnitude-based pruning.", "keywords": ["optimization", "pruning"], "paperhash": "park|lookahead_a_farsighted_alternative_of_magnitudebased_pruning", "code": "https://github.com/alinlab/lookahead_pruning", "spotlight_video": "", "authorids": ["sejun.park@kaist.ac.kr", "jaeho-lee@kaist.ac.kr", "swmo@kaist.ac.kr", "jinwoos@kaist.ac.kr"], "poster": "", "slides": "", "authors": ["Sejun Park*", "Jaeho Lee*", "Sangwoo Mo", "Jinwoo Shin"], "_bibtex": "@inproceedings{\nPark*2020Lookahead:,\ntitle={Lookahead: A Far-sighted Alternative of Magnitude-based Pruning},\nauthor={Sejun Park* and Jaeho Lee* and Sangwoo Mo and Jinwoo Shin},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=ryl3ygHYDB}\n}", "original_pdf": "/attachment/3352afbcb09c1cbfb3a0ff2ba99195bb04306f7f.pdf", "title": "Lookahead: A Far-sighted Alternative of Magnitude-based Pruning", "pdf": "/pdf/bf14b43babc55de31af665a6582516f8fe660919.pdf", "abstract": "Magnitude-based pruning is one of the simplest methods for pruning neural networks. Despite its simplicity, magnitude-based pruning and its variants demonstrated remarkable performances for pruning modern architectures. Based on the observation that magnitude-based pruning indeed minimizes the Frobenius distortion of a linear operator corresponding to a single layer, we develop a simple pruning method, coined lookahead pruning, by extending the single layer optimization to a multi-layer optimization. Our experimental results demonstrate that the proposed method consistently outperforms magnitude-based pruning on various networks, including VGG and ResNet, particularly in the high-sparsity regime. 
See https://github.com/alinlab/lookahead_pruning for codes.", "full_presentation_video": ""}, "forum": "ryl3ygHYDB", "id": "ryl3ygHYDB"}, "ryeG924twB": {"content": {"appendix": "", "TL;DR": "We propose an event-based policy gradient to train the leader and an action abstraction policy gradient to train the followers in leader-follower Markov game.", "keywords": ["attention", "multi agent reinforcement learning", "navigation", "policy gradient", "reinforcement learning"], "paperhash": "shi|learning_expensive_coordination_an_eventbased_deep_rl_approach", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Zhenyu Shi*", "Runsheng Yu*", "Xinrun Wang*", "Rundong Wang", "Youzhi Zhang", "Hanjiang Lai", "Bo An"], "_bibtex": "@inproceedings{\nShi*2020Learning,\ntitle={Learning Expensive Coordination: An Event-Based Deep RL Approach},\nauthor={Zhenyu Shi* and Runsheng Yu* and Xinrun Wang* and Rundong Wang and Youzhi Zhang and Hanjiang Lai and Bo An},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=ryeG924twB}\n}", "authorids": ["shizhy6@mail2.sysu.edu.cn", "runsheng.yu@ntu.edu.sg", "xwang033@e.ntu.edu.sg", "rundong001@e.ntu.edu.sg", "yzhang137@e.ntu.edu.sg", "laihanj3@mail.sysu.edu.cn", "boan@ntu.edu.sg"], "title": "Learning Expensive Coordination: An Event-Based Deep RL Approach", "original_pdf": "/attachment/6ebb0851fa77baf80099fb432618d469fefde670.pdf", "pdf": "/pdf/a0af93829f387e544131364f643d911eb03803ae.pdf", "abstract": "Existing works in deep Multi-Agent Reinforcement Learning (MARL) mainly focus on coordinating cooperative agents to complete certain tasks jointly. However, in many cases of the real world, agents are self-interested such as employees in a company and clubs in a league. Therefore, the leader, i.e., the manager of the company or the league, needs to provide bonuses to followers for efficient coordination, which we call expensive coordination. The main difficulties of expensive coordination are that i) the leader has to consider the long-term effect and predict the followers' behaviors when assigning bonuses and ii) the complex interactions between followers make the training process hard to converge, especially when the leader's policy changes with time. In this work, we address this problem through an event-based deep RL approach. Our main contributions are threefold. (1) We model the leader's decision-making process as a semi-Markov Decision Process and propose a novel multi-agent event-based policy gradient to learn the leader's long-term policy. (2) We exploit the leader-follower consistency scheme to design a follower-aware module and a follower-specific attention module to predict the followers' behaviors and make accurate response to their behaviors. (3) We propose an action abstraction-based policy gradient algorithm to reduce the followers' decision space and thus accelerate the training process of followers. 
Experiments in resource collections, navigation, and the predator-prey game reveal that our approach outperforms the state-of-the-art methods dramatically.", "full_presentation_video": ""}, "forum": "ryeG924twB", "id": "ryeG924twB"}, "Sklgs0NFvr": {"content": {"appendix": "", "TL;DR": "Humans in the loop revise documents to accord with counterfactual labels, resulting resource helps to reduce reliance on spurious associations.", "keywords": ["causality", "natural language inference", "nlp", "sentiment analysis", "text classification"], "paperhash": "kaushik|learning_the_difference_that_makes_a_difference_with_counterfactuallyaugmented_data", "code": "https://github.com/dkaushik96/counterfactually-augmented-data", "spotlight_video": "", "authorids": ["dkaushik@cs.cmu.edu", "hovy@cmu.edu", "zlipton@cmu.edu"], "poster": "", "slides": "", "authors": ["Divyansh Kaushik", "Eduard Hovy", "Zachary Lipton"], "_bibtex": "@inproceedings{\nKaushik2020Learning,\ntitle={Learning The Difference That Makes A Difference With Counterfactually-Augmented Data},\nauthor={Divyansh Kaushik and Eduard Hovy and Zachary Lipton},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=Sklgs0NFvr}\n}", "original_pdf": "/attachment/929e49b9a0b543a84a7b28dbd46a5b6281659412.pdf", "title": "Learning The Difference That Makes A Difference With Counterfactually-Augmented Data", "pdf": "/pdf/6267805abf4a2ab7b3f971c759b8c669e6060774.pdf", "abstract": "Despite alarm over the reliance of machine learning systems on so-called spurious patterns, the term lacks coherent meaning in standard statistical frameworks. However, the language of causality offers clarity: spurious associations are due to confounding (e.g., a common cause), but not direct or indirect causal effects. In this paper, we focus on natural language processing, introducing methods and resources for training models less sensitive to spurious patterns. Given documents and their initial labels, we task humans with revising each document so that it (i) accords with a counterfactual target label; (ii) retains internal coherence; and (iii) avoids unnecessary changes. Interestingly, on sentiment analysis and natural language inference tasks, classifiers trained on original data fail on their counterfactually-revised counterparts and vice versa. Classifiers trained on combined datasets perform remarkably well, just shy of those specialized to either domain. While classifiers trained on either original or manipulated data alone are sensitive to spurious features (e.g., mentions of genre), models trained on the combined data are less sensitive to this signal. Both datasets are publicly available.", "full_presentation_video": ""}, "forum": "Sklgs0NFvr", "id": "Sklgs0NFvr"}, "Hyg-JC4FDr": {"content": {"appendix": "", "keywords": ["adversarial", "imitation learning", "optimization", "reinforcement learning", "sample efficiency"], "paperhash": "kostrikov|imitation_learning_via_offpolicy_distribution_matching", "spotlight_video": "", "poster": "", "slides": "", "abstract": "When performing imitation learning from expert demonstrations, distribution matching is a popular approach, in which one alternates between estimating distribution ratios and then using these ratios as rewards in a standard reinforcement learning (RL) algorithm. 
Traditionally, estimation of the distribution ratio requires on-policy data, which has caused previous work to either be exorbitantly data-inefficient or alter the original objective in a manner that can drastically change its optimum. In this work, we show how the original distribution ratio estimation objective may be transformed in a principled manner to yield a completely off-policy objective. In addition to the data-efficiency that this provides, we are able to show that this objective also renders the use of a separate RL optimization unnecessary. Rather, an imitation policy may be learned directly from this objective without the use of explicit rewards. We call the resulting algorithm ValueDICE and evaluate it on a suite of popular imitation learning benchmarks, finding that it can achieve state-of-the-art sample efficiency and performance.", "_bibtex": "@inproceedings{\nKostrikov2020Imitation,\ntitle={Imitation Learning via Off-Policy Distribution Matching},\nauthor={Ilya Kostrikov and Ofir Nachum and Jonathan Tompson},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=Hyg-JC4FDr}\n}", "authorids": ["kostrikov@cs.nyu.edu", "ofirnachum@google.com", "tompson@google.com"], "title": "Imitation Learning via Off-Policy Distribution Matching", "authors": ["Ilya Kostrikov", "Ofir Nachum", "Jonathan Tompson"], "original_pdf": "/attachment/bfcf0e6479dc168f8a2782fc3a0a60488769425c.pdf", "pdf": "/pdf/e760b8c2e400ca1d33a3bbac52125055587fe455.pdf", "full_presentation_video": ""}, "forum": "Hyg-JC4FDr", "id": "Hyg-JC4FDr"}, "BygPO2VKPH": {"content": {"appendix": "", "TL;DR": "We propose gated mechanisms to enhance learned ISTA for sparse coding, with theoretical guarantees on the superiority of the method. ", "keywords": ["sparse coding"], "paperhash": "wu|sparse_coding_with_gated_learned_ista", "code": "https://github.com/wukailun/GLISTA", "spotlight_video": "", "authorids": ["wukl14@mails.tsinghua.edu.cn", "guoyiwen.ai@bytedance.com", "liza19@mails.tsinghua.edu.cn", "zcs@mail.tsinghua.edu.cn"], "poster": "", "slides": "", "authors": ["Kailun Wu", "Yiwen Guo", "Ziang Li", "Changshui Zhang"], "_bibtex": "@inproceedings{\nWu2020Sparse,\ntitle={Sparse Coding with Gated Learned ISTA},\nauthor={Kailun Wu and Yiwen Guo and Ziang Li and Changshui Zhang},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=BygPO2VKPH}\n}", "original_pdf": "/attachment/b31f8944c8517047c8d24c103d06c6cfad90e42b.pdf", "title": "Sparse Coding with Gated Learned ISTA", "pdf": "/pdf/3af900efdd403a600321a3e02a64f2514b34039f.pdf", "abstract": "In this paper, we study the learned iterative shrinkage thresholding algorithm (LISTA) for solving sparse coding problems. Following assumptions made by prior works, we first discover that the code components in its estimations may be lower than expected, i.e., require gains, and to address this problem, a gated mechanism amenable to theoretical analysis is then introduced. Specific design of the gates is inspired by convergence analyses of the mechanism and hence its effectiveness can be formally guaranteed. In addition to the gain gates, we further introduce overshoot gates for compensating insufficient step size in LISTA. 
Extensive empirical results confirm our theoretical findings and verify the effectiveness of our method.", "full_presentation_video": ""}, "forum": "BygPO2VKPH", "id": "BygPO2VKPH"}, "rkxs0yHFPH": {"content": {"appendix": "", "TL;DR": "An implementation of the backpropagation algorithm using spiking neurons for forward and backward propagation.", "keywords": [], "paperhash": "thiele|spikegrad_an_annequivalent_computation_model_for_implementing_backpropagation_with_spikes", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Johannes C. Thiele", "Olivier Bichler", "Antoine Dupret"], "_bibtex": "@inproceedings{\nThiele2020SpikeGrad:,\ntitle={SpikeGrad: An ANN-equivalent Computation Model for Implementing Backpropagation with Spikes},\nauthor={Johannes C. Thiele and Olivier Bichler and Antoine Dupret},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=rkxs0yHFPH}\n}", "authorids": ["johannes.thiele@cea.fr", "olivier.bichler@cea.fr", "antoine.dupret@cea.fr"], "title": "SpikeGrad: An ANN-equivalent Computation Model for Implementing Backpropagation with Spikes", "original_pdf": "/attachment/961c40bce9452e3b2392e7f945f07008264e324e.pdf", "pdf": "/pdf/f6708d62ff26dc70ee82019e32d3c6c59f6989d0.pdf", "abstract": "Event-based neuromorphic systems promise to reduce the energy consumption of deep neural networks by replacing expensive floating point operations on dense matrices by low energy, sparse operations on spike events. While these systems can be trained increasingly well using approximations of the backpropagation algorithm, this usually requires high precision errors and is therefore incompatible with the typical communication infrastructure of neuromorphic circuits. In this work, we analyze how the gradient can be discretized into spike events when training a spiking neural network. To accelerate our simulation, we show that using a special implementation of the integrate-and-fire neuron allows us to describe the accumulated activations and errors of the spiking neural network in terms of an equivalent artificial neural network, allowing us to largely speed up training compared to an explicit simulation of all spike events. This way we are able to demonstrate that even for deep networks, the gradients can be discretized sufficiently well with spikes if the gradient is properly rescaled. This form of spike-based backpropagation enables us to achieve equivalent or better accuracies on the MNIST and CIFAR10 datasets than comparable state-of-the-art spiking neural networks trained with full precision gradients. 
The algorithm, which we call SpikeGrad, is based on only accumulation and comparison operations and can naturally exploit sparsity in the gradient computation, which makes it an interesting choice for spiking neuromorphic systems with on-chip learning capacities.", "full_presentation_video": ""}, "forum": "rkxs0yHFPH", "id": "rkxs0yHFPH"}, "HkgsUJrtDB": {"content": {"appendix": "", "keywords": ["adversarial", "clustering", "fairness"], "paperhash": "baharlouei|r\u00e9nyi_fair_inference", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Sina Baharlouei", "Maher Nouiehed", "Ahmad Beirami", "Meisam Razaviyayn"], "_bibtex": "@inproceedings{\nBaharlouei2020R\u00e9nyi,\ntitle={R\u00e9nyi Fair Inference},\nauthor={Sina Baharlouei and Maher Nouiehed and Ahmad Beirami and Meisam Razaviyayn},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=HkgsUJrtDB}\n}", "authorids": ["baharlou@usc.edu", "nouiehed@usc.edu", "beirami@mit.edu", "razaviya@usc.edu"], "title": "R\u00e9nyi Fair Inference", "original_pdf": "/attachment/f4afd3916c3ee59f9893c35f965fce7946f83200.pdf", "pdf": "/pdf/ed9857cae96b1b46a1fee63490fc0c839f449ffd.pdf", "abstract": "Machine learning algorithms have been increasingly deployed in critical automated decision-making systems that directly affect human lives. When these algorithms are solely trained to minimize the training/test error, they could suffer from systematic discrimination against individuals based on their sensitive attributes, such as gender or race. Recently, there has been a surge in machine learning society to develop algorithms for fair machine learning. \nIn particular, several adversarial learning procedures have been proposed to impose fairness. Unfortunately, these algorithms either can only impose fairness up to linear dependence between the variables, or they lack computational convergence guarantees. In this paper, we use R\u00e9nyi correlation as a measure of fairness of machine learning models and develop a general training framework to impose fairness. In particular, we propose a min-max formulation which balances the accuracy and fairness when solved to optimality. For the case of discrete sensitive attributes, we suggest an iterative algorithm with theoretical convergence guarantee for solving the proposed min-max problem. Our algorithm and analysis are then specialized to fair classification and fair clustering problems. To demonstrate the performance of the proposed R\u00e9nyi fair inference framework in practice, we compare it with well-known existing methods on several benchmark datasets. 
Experiments indicate that the proposed method has favorable empirical performance against state-of-the-art approaches.", "full_presentation_video": ""}, "forum": "HkgsUJrtDB", "id": "HkgsUJrtDB"}, "rylnK6VtDH": {"content": {"appendix": "", "TL;DR": "We explore the role of multiplicative interaction as a unifying framework to describe a range of classical and modern neural network architectural motifs, such as gating, attention layers, hypernetworks, and dynamic convolutions amongst others.", "keywords": ["attention", "hypernetworks", "inductive bias"], "paperhash": "jayakumar|multiplicative_interactions_and_where_to_find_them", "spotlight_video": "", "poster": "", "slides": "", "abstract": "We explore the role of multiplicative interaction as a unifying framework to describe a range of classical and modern neural network architectural motifs, such as gating, attention layers, hypernetworks, and dynamic convolutions amongst others.\nMultiplicative interaction layers as primitive operations have a long-established presence in the literature, though this is often not emphasized and thus under-appreciated. We begin by showing that such layers strictly enrich the representable function classes of neural networks. We conjecture that multiplicative interactions offer a particularly powerful inductive bias when fusing multiple streams of information or when conditional computation is required. We therefore argue that they should be considered in many situations where multiple compute or information paths need to be combined, in place of the simple and oft-used concatenation operation. Finally, we back up our claims and demonstrate the potential of multiplicative interactions by applying them in large-scale complex RL and sequence modelling tasks, where their use allows us to deliver state-of-the-art results, and thereby provides new evidence in support of multiplicative interactions playing a more prominent role when designing new neural network architectures.", "_bibtex": "@inproceedings{\nJayakumar2020Multiplicative,\ntitle={Multiplicative Interactions and Where to Find Them},\nauthor={Siddhant M. Jayakumar and Wojciech M. Czarnecki and Jacob Menick and Jonathan Schwarz and Jack Rae and Simon Osindero and Yee Whye Teh and Tim Harley and Razvan Pascanu},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=rylnK6VtDH}\n}", "authorids": ["sidmj@google.com", "lejlot@google.com", "jmenick@google.com", "schwarzjn@google.com", "jwrae@google.com", "osindero@google.com", "ywteh@google.com", "tharley@google.com", "razp@google.com"], "title": "Multiplicative Interactions and Where to Find Them", "authors": ["Siddhant M. Jayakumar", "Wojciech M. 
Czarnecki", "Jacob Menick", "Jonathan Schwarz", "Jack Rae", "Simon Osindero", "Yee Whye Teh", "Tim Harley", "Razvan Pascanu"], "original_pdf": "/attachment/c0237ed572c00dddacfa91e7acb8f26e08248ec9.pdf", "pdf": "/pdf/25d97c4a79fac39e47afae9943ee47ffbd93b248.pdf", "full_presentation_video": ""}, "forum": "rylnK6VtDH", "id": "rylnK6VtDH"}, "Bkl7bREtDr": {"content": {"appendix": "", "TL;DR": "In Deep RL, order-invariant functions can be used in conjunction with standard memory modules to improve gradient decay and resilience to noise.", "keywords": ["memory", "reinforcement learning", "sample efficiency"], "paperhash": "beck|amrl_aggregated_memory_for_reinforcement_learning", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Jacob Beck", "Kamil Ciosek", "Sam Devlin", "Sebastian Tschiatschek", "Cheng Zhang", "Katja Hofmann"], "_bibtex": "@inproceedings{\nBeck2020AMRL:,\ntitle={AMRL: Aggregated Memory For Reinforcement Learning},\nauthor={Jacob Beck and Kamil Ciosek and Sam Devlin and Sebastian Tschiatschek and Cheng Zhang and Katja Hofmann},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=Bkl7bREtDr}\n}", "authorids": ["jacob_beck@alumni.brown.edu", "kamil.ciosek@microsoft.com", "sam.devlin@microsoft.com", "sebastian.tschiatschek@microsoft.com", "cheng.zhang@microsoft.com", "katja.hofmann@microsoft.com"], "title": "AMRL: Aggregated Memory For Reinforcement Learning", "original_pdf": "/attachment/f25c81db5fd58a8c8d3f4322f62059eaffe39971.pdf", "pdf": "/pdf/de1a9c01e9b3fac27abfcf1fc8d89b3da4d61570.pdf", "abstract": "In many partially observable scenarios, Reinforcement Learning (RL) agents must rely on long-term memory in order to learn an optimal policy. We demonstrate that using techniques from NLP and supervised learning fails at RL tasks due to stochasticity from the environment and from exploration. Utilizing our insights on the limitations of traditional memory methods in RL, we propose AMRL, a class of models that can learn better policies with greater sample efficiency and are resilient to noisy inputs. Specifically, our models use a standard memory module to summarize short-term context, and then aggregate all prior states from the standard model without respect to order. We show that this provides advantages both in terms of gradient decay and signal-to-noise ratio over time. 
Evaluating in Minecraft and maze environments that test long-term memory, we find that our model improves average return by 19% over a baseline that has the same number of parameters and by 9% over a stronger baseline that has far more parameters.", "full_presentation_video": ""}, "forum": "Bkl7bREtDr", "id": "Bkl7bREtDr"}, "SJx9ngStPH": {"content": {"appendix": "", "keywords": ["computer vision", "neural architecture search"], "paperhash": "zela|nasbench1shot1_benchmarking_and_dissecting_oneshot_neural_architecture_search", "code": "https://github.com/automl/nasbench-1shot1", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Arber Zela", "Julien Siems", "Frank Hutter"], "_bibtex": "@inproceedings{\nZela2020NAS-Bench-1Shot1:,\ntitle={NAS-Bench-1Shot1: Benchmarking and Dissecting One-shot Neural Architecture Search},\nauthor={Arber Zela and Julien Siems and Frank Hutter},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=SJx9ngStPH}\n}", "authorids": ["zelaa@cs.uni-freiburg.de", "siemsj@cs.uni-freiburg.de", "fh@cs.uni-freiburg.de"], "title": "NAS-Bench-1Shot1: Benchmarking and Dissecting One-shot Neural Architecture Search", "original_pdf": "/attachment/f33f4aa80d5c9e7e2758b5e94f9ebcbce1954a56.pdf", "pdf": "/pdf/629de63a76318d4dba0710a49082db53188ddd27.pdf", "abstract": "One-shot neural architecture search (NAS) has played a crucial role in making\nNAS methods computationally feasible in practice. Nevertheless, there is still a\nlack of understanding on how these weight-sharing algorithms exactly work due\nto the many factors controlling the dynamics of the process. In order to allow\na scientific study of these components, we introduce a general framework for\none-shot NAS that can be instantiated to many recently-introduced variants and\nintroduce a general benchmarking framework that draws on the recent large-scale\ntabular benchmark NAS-Bench-101 for cheap anytime evaluations of one-shot\nNAS methods. To showcase the framework, we compare several state-of-the-art\none-shot NAS methods, examine how sensitive they are to their hyperparameters\nand how they can be improved by tuning their hyperparameters, and compare their\nperformance to that of blackbox optimizers for NAS-Bench-101.", "full_presentation_video": ""}, "forum": "SJx9ngStPH", "id": "SJx9ngStPH"}, "SJxUjlBtwB": {"content": {"appendix": "", "TL;DR": "We propose a deep generative model of volumes for 3D cryo-EM reconstruction from unlabelled 2D images and show that it can learn continuous deformations in protein structure.", "keywords": ["3d reconstruction", "clustering", "ensembles", "generative models", "variational inference"], "paperhash": "zhong|reconstructing_continuous_distributions_of_3d_protein_structure_from_cryoem_images", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Ellen D. Zhong", "Tristan Bepler", "Joseph H. Davis", "Bonnie Berger"], "_bibtex": "@inproceedings{\nZhong2020Reconstructing,\ntitle={Reconstructing continuous distributions of 3D protein structure from cryo-EM images},\nauthor={Ellen D. Zhong and Tristan Bepler and Joseph H. 
Davis and Bonnie Berger},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=SJxUjlBtwB}\n}", "authorids": ["zhonge@mit.edu", "tbepler@mit.edu", "jhdavis@mit.edu", "bab@mit.edu"], "title": "Reconstructing continuous distributions of 3D protein structure from cryo-EM images", "original_pdf": "/attachment/a50f0f2b4ed63383cc5280edf15e1023d2c58531.pdf", "pdf": "/pdf/4b8f5fc7ed172b71ebc5488a94aea5027661b843.pdf", "abstract": "Cryo-electron microscopy (cryo-EM) is a powerful technique for determining the structure of proteins and other macromolecular complexes at near-atomic resolution. In single particle cryo-EM, the central problem is to reconstruct the 3D structure of a macromolecule from $10^{4-7}$ noisy and randomly oriented 2D projection images. However, the imaged protein complexes may exhibit structural variability, which complicates reconstruction and is typically addressed using discrete clustering approaches that fail to capture the full range of protein dynamics. Here, we introduce a novel method for cryo-EM reconstruction that extends naturally to modeling continuous generative factors of structural heterogeneity. This method encodes structures in Fourier space using coordinate-based deep neural networks, and trains these networks from unlabeled 2D cryo-EM images by combining exact inference over image orientation with variational inference for structural heterogeneity. We demonstrate that the proposed method, termed cryoDRGN, can perform ab-initio reconstruction of 3D protein complexes from simulated and real 2D cryo-EM image data. To our knowledge, cryoDRGN is the first neural network-based approach for cryo-EM reconstruction and the first end-to-end method for directly reconstructing continuous ensembles of protein structures from cryo-EM images.", "full_presentation_video": ""}, "forum": "SJxUjlBtwB", "id": "SJxUjlBtwB"}, "rklp93EtwH": {"content": {"appendix": "", "TL;DR": "Addressing task heterogeneity problem in meta-learning by introducing meta-knowledge graph", "keywords": ["interpretability", "meta learning", "regression"], "paperhash": "yao|automated_relational_metalearning", "code": "https://github.com/huaxiuyao/ARML", "spotlight_video": "", "authorids": ["huaxiuyao@psu.edu", "xwu9@nd.edu", "zqtao@ece.neu.edu", "yaliangl.ub@gmail.com", "bolin.ding@alibaba-inc.com", "rrli@cs.ucla.edu", "jessieli@ist.psu.edu"], "poster": "", "slides": "", "authors": ["Huaxiu Yao", "Xian Wu", "Zhiqiang Tao", "Yaliang Li", "Bolin Ding", "Ruirui Li", "Zhenhui Li"], "_bibtex": "@inproceedings{\nYao2020Automated,\ntitle={Automated Relational Meta-learning},\nauthor={Huaxiu Yao and Xian Wu and Zhiqiang Tao and Yaliang Li and Bolin Ding and Ruirui Li and Zhenhui Li},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=rklp93EtwH}\n}", "original_pdf": "/attachment/d7a9d5e65df2e55cd8dccf3f8c53972a5857c845.pdf", "title": "Automated Relational Meta-learning", "pdf": "/pdf/9dfa155739c85d043dd873f0543b2643ec480195.pdf", "abstract": "In order to efficiently learn with small amount of data on new tasks, meta-learning transfers knowledge learned from previous tasks to the new ones. However, a critical challenge in meta-learning is the task heterogeneity which cannot be well handled by traditional globally shared meta-learning methods. 
In addition, current task-specific meta-learning methods may either suffer from hand-crafted structure design or lack the capability to capture complex relations between tasks. In this paper, motivated by the way of knowledge organization in knowledge bases, we propose an automated relational meta-learning (ARML) framework that automatically extracts the cross-task relations and constructs the meta-knowledge graph. When a new task arrives, it can quickly find the most relevant structure and tailor the learned structure knowledge to the meta-learner. As a result, the proposed framework not only addresses the challenge of task heterogeneity by a learned meta-knowledge graph, but also increases the model interpretability. We conduct extensive experiments on 2D toy regression and few-shot image classification and the results demonstrate the superiority of ARML over state-of-the-art baselines.", "full_presentation_video": ""}, "forum": "rklp93EtwH", "id": "rklp93EtwH"}, "HJxMYANtPH": {"content": {"appendix": "", "keywords": ["clustering", "gradient descent", "neural tangent kernel"], "paperhash": "he|the_local_elasticity_of_neural_networks", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Hangfeng He", "Weijie Su"], "_bibtex": "@inproceedings{\nHe2020The,\ntitle={The Local Elasticity of Neural Networks},\nauthor={Hangfeng He and Weijie Su},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=HJxMYANtPH}\n}", "authorids": ["hangfeng@seas.upenn.edu", "suw@wharton.upenn.edu"], "title": "The Local Elasticity of Neural Networks", "original_pdf": "/attachment/4aadc7d35c76ab6e8d418804757f46b17aaac6f7.pdf", "pdf": "/pdf/28d5679dc56726f471eb847aa741005dd33f4e0a.pdf", "abstract": "This paper presents a phenomenon in neural networks that we refer to as local elasticity. Roughly speaking, a classifier is said to be locally elastic if its prediction at a feature vector x' is not significantly perturbed, after the classifier is updated via stochastic gradient descent at a (labeled) feature vector x that is dissimilar to x' in a certain sense. This phenomenon is shown to persist for neural networks with nonlinear activation functions through extensive simulations on real-life and synthetic datasets, whereas this is not observed in linear classifiers. In addition, we offer a geometric interpretation of local elasticity using the neural tangent kernel (Jacot et al., 2018). Building on top of local elasticity, we obtain pairwise similarity measures between feature vectors, which can be used for clustering in conjunction with K-means. The effectiveness of the clustering algorithm on the MNIST and CIFAR-10 datasets in turn corroborates the hypothesis of local elasticity of neural networks on real-life data. 
Finally, we discuss some implications of local elasticity to shed light on several intriguing aspects of deep neural networks.", "full_presentation_video": ""}, "forum": "HJxMYANtPH", "id": "HJxMYANtPH"}, "H1gmHaEKwB": {"content": {"appendix": "", "TL;DR": "We propose an efficient, provable and data independent method for network compression via neural pruning using coresets of neurons -- a novel construction proposed in this paper.", "keywords": ["adversarial", "compression", "memory", "model compression", "network compression", "pruning"], "paperhash": "mussay|dataindependent_neural_pruning_via_coresets", "spotlight_video": "", "poster": "", "slides": "", "abstract": "Previous work showed empirically that large neural networks can be significantly reduced in size while preserving their accuracy. Model compression became a central research topic, as it is crucial for deployment of neural networks on devices with limited computational and memory resources. The majority of the compression methods are based on heuristics and offer no worst-case guarantees on the trade-off between the compression rate and the approximation error for an arbitrarily new sample.\n\nWe propose the first efficient, data-independent neural pruning algorithm with a provable trade-off between its compression rate and the approximation error for any future test sample. Our method is based on the coreset framework, which finds a small weighted subset of points that provably approximates the original inputs. Specifically, we approximate the output of a layer of neurons by a coreset of neurons in the previous layer and discard the rest. We apply this framework in a layer-by-layer fashion from the top to the bottom. Unlike previous works, our coreset is data independent, meaning that it provably guarantees the accuracy of the function for any input $x\\in \\mathbb{R}^d$, including an adversarial one. We demonstrate the effectiveness of our method on popular network architectures. 
In particular, our coresets yield 90% compression of the LeNet-300-100 architecture on MNIST while improving the accuracy.", "_bibtex": "@inproceedings{\nMussay2020Data-Independent,\ntitle={Data-Independent Neural Pruning via Coresets},\nauthor={Ben Mussay and Margarita Osadchy and Vladimir Braverman and Samson Zhou and Dan Feldman},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=H1gmHaEKwB}\n}", "authorids": ["bengordoncshaifa@gmail.com", "rita@cs.haifa.ac.il", "vova@cs.jhu.edu", "samsonzhou@gmail.com", "dannyf.post@gmail.co"], "title": "Data-Independent Neural Pruning via Coresets", "authors": ["Ben Mussay", "Margarita Osadchy", "Vladimir Braverman", "Samson Zhou", "Dan Feldman"], "original_pdf": "/attachment/44b75218f2f612e6648e7de9101adb3e5d66ad7c.pdf", "pdf": "/pdf/5120c7006cf551983e41c4ccd53877c232cc2c21.pdf", "full_presentation_video": ""}, "forum": "H1gmHaEKwB", "id": "H1gmHaEKwB"}, "rJxtgJBKDr": {"content": {"appendix": "", "TL;DR": "We propose SNOW, an efficient way of transfer and lifelong learning by subscribing knowledge of a source model for new tasks through a novel channel pooling block.", "keywords": ["cnn", "lifelong learning", "multi task", "transfer learning"], "paperhash": "yoo|snow_subscribing_to_knowledge_via_channel_pooling_for_transfer_lifelong_learning_of_convolutional_neural_networks", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Chungkuk Yoo", "Bumsoo Kang", "Minsik Cho"], "_bibtex": "@inproceedings{\nYoo2020SNOW:,\ntitle={SNOW: Subscribing to Knowledge via Channel Pooling for Transfer & Lifelong Learning of Convolutional Neural Networks},\nauthor={Chungkuk Yoo and Bumsoo Kang and Minsik Cho},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=rJxtgJBKDr}\n}", "authorids": ["ckyoo@ibm.com", "steve.kang@kaist.ac.kr", "thyeros@gmail.com"], "title": "SNOW: Subscribing to Knowledge via Channel Pooling for Transfer & Lifelong Learning of Convolutional Neural Networks", "original_pdf": "/attachment/45a5d24185d1279891be92dc8c7d709d2899a0f0.pdf", "pdf": "/pdf/f9060b44e8657b6d708c6c180451273066356b70.pdf", "abstract": "SNOW is an efficient learning method to improve training/serving throughput as well as accuracy for transfer and lifelong learning of convolutional neural networks based on knowledge subscription. SNOW selects the top-K useful intermediate\nfeature maps for a target task from a pre-trained and frozen source model through a novel channel pooling scheme, and utilizes them in the task-specific delta model. The source model is responsible for generating a large number of generic feature maps. Meanwhile, the delta model selectively subscribes to those feature maps and fuses them with its local ones to deliver high accuracy for the target task. Since a source model takes part in both training and serving of all target tasks\nin an inference-only mode, one source model can serve multiple delta models, enabling significant computation sharing. 
The sizes of such delta models are fractional of the source model, thus SNOW also provides model-size efficiency.\nOur experimental results show that SNOW offers a superior balance between accuracy and training/inference speed for various image classification tasks to the existing transfer and lifelong learning practices.", "full_presentation_video": ""}, "forum": "rJxtgJBKDr", "id": "rJxtgJBKDr"}, "ryxQuANKPB": {"content": {"appendix": "", "keywords": ["generation", "interpretability", "planning"], "paperhash": "zhou|augmenting_noncollaborative_dialog_systems_with_explicit_semantic_and_strategic_dialog_history", "spotlight_video": "", "poster": "", "slides": "", "abstract": "We study non-collaborative dialogs, where two agents have a conflict of interest but must strategically communicate to reach an agreement (e.g., negotiation). This setting poses new challenges for modeling dialog history because the dialog's outcome relies not only on the semantic intent, but also on tactics that convey the intent. We propose to model both semantic and tactic history using finite state transducers (FSTs). Unlike RNN, FSTs can explicitly represent dialog history through all the states traversed, facilitating interpretability of dialog structure. We train FSTs on a set of strategies and tactics used in negotiation dialogs. The trained FSTs show plausible tactic structure and can be generalized to other non-collaborative domains (e.g., persuasion). We evaluate the FSTs by incorporating them in an automated negotiating system that attempts to sell products and a persuasion system that persuades people to donate to a charity. Experiments show that explicitly modeling both semantic and tactic history is an effective way to improve both dialog policy planning and generation performance. 
", "_bibtex": "@inproceedings{\nZhou2020Augmenting,\ntitle={Augmenting Non-Collaborative Dialog Systems with Explicit Semantic and Strategic Dialog History},\nauthor={Yiheng Zhou and Yulia Tsvetkov and Alan W Black and Zhou Yu},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=ryxQuANKPB}\n}", "authorids": ["yihengz1@cs.cmu.edu", "ytsvetko@cs.cmu.edu", "awb@cs.cmu.edu", "joyu@ucdavis.edu"], "title": "Augmenting Non-Collaborative Dialog Systems with Explicit Semantic and Strategic Dialog History", "authors": ["Yiheng Zhou", "Yulia Tsvetkov", "Alan W Black", "Zhou Yu"], "original_pdf": "/attachment/d0b49c9cb12f076b54547b8ea11c3b8fe74907ac.pdf", "pdf": "/pdf/1c05a5de3d3ec368ca9a42f20cfd7d389ec184e2.pdf", "full_presentation_video": ""}, "forum": "ryxQuANKPB", "id": "ryxQuANKPB"}, "BJlS634tPr": {"content": {"appendix": "", "TL;DR": "Allowing partial channel connection in super-networks to regularize and accelerate differentiable architecture search", "keywords": ["imagenet", "memory", "neural architecture search", "normalization", "regularization", "stability", "uncertainty"], "paperhash": "xu|pcdarts_partial_channel_connections_for_memoryefficient_architecture_search", "code": "https://www.dropbox.com/sh/on9lg3rpx1r6dkf/AABG5mt0sMHjnEJyoRnLEYW4a?dl=0", "spotlight_video": "", "authorids": ["yuhuixu@sjtu.edu.cn", "198808xc@gmail.com", "zxphistory@gmail.com", "1410452@tongji.edu.cn", "guojunq@gmail.com", "tian.qi1@huawei.com", "xionghongkai@sjtu.edu.cn"], "poster": "", "slides": "", "authors": ["Yuhui Xu", "Lingxi Xie", "Xiaopeng Zhang", "Xin Chen", "Guo-Jun Qi", "Qi Tian", "Hongkai Xiong"], "_bibtex": "@inproceedings{\nXu2020PC-DARTS:,\ntitle={PC-DARTS: Partial Channel Connections for Memory-Efficient Architecture Search},\nauthor={Yuhui Xu and Lingxi Xie and Xiaopeng Zhang and Xin Chen and Guo-Jun Qi and Qi Tian and Hongkai Xiong},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=BJlS634tPr}\n}", "original_pdf": "/attachment/59b2d6807d091d9821adac5ad7f5eb1d966e45f8.pdf", "title": "PC-DARTS: Partial Channel Connections for Memory-Efficient Architecture Search", "pdf": "/pdf/a0247293765eea46b869860d2bea72d7cb6be4e6.pdf", "abstract": "Differentiable architecture search (DARTS) provided a fast solution in finding effective network architectures, but suffered from large memory and computing overheads in jointly training a super-net and searching for an optimal architecture. In this paper, we present a novel approach, namely Partially-Connected DARTS, by sampling a small part of super-net to reduce the redundancy in exploring the network space, thereby performing a more efficient search without comprising the performance. In particular, we perform operation search in a subset of channels while bypassing the held out part in a shortcut. This strategy may suffer from an undesired inconsistency on selecting the edges of super-net caused by sampling different channels. We solve it by introducing edge normalization, which adds a new set of edge-level hyper-parameters to reduce uncertainty in search. Thanks to the reduced memory cost, PC-DARTS can be trained with a larger batch size and, consequently, enjoy both faster speed and higher training stability. Experiment results demonstrate the effectiveness of the proposed method. 
Specifically, we achieve an error rate of 2.57% on CIFAR10 within merely 0.1 GPU-days for architecture search, and a state-of-the-art top-1 error rate of 24.2% on ImageNet (under the mobile setting) within 3.8 GPU-days for search. Our code has been made available at https://www.dropbox.com/sh/on9lg3rpx1r6dkf/AABG5mt0sMHjnEJyoRnLEYW4a?dl=0.", "full_presentation_video": ""}, "forum": "BJlS634tPr", "id": "BJlS634tPr"}, "B1g5sA4twr": {"content": {"appendix": "", "TL;DR": "We demonstrate, and characterize, realistic settings where bigger models are worse, and more data hurts.", "keywords": ["gradient descent", "optimization"], "paperhash": "nakkiran|deep_double_descent_where_bigger_models_and_more_data_hurt", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Preetum Nakkiran", "Gal Kaplun", "Yamini Bansal", "Tristan Yang", "Boaz Barak", "Ilya Sutskever"], "_bibtex": "@inproceedings{\nNakkiran2020Deep,\ntitle={Deep Double Descent: Where Bigger Models and More Data Hurt},\nauthor={Preetum Nakkiran and Gal Kaplun and Yamini Bansal and Tristan Yang and Boaz Barak and Ilya Sutskever},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=B1g5sA4twr}\n}", "authorids": ["preetum@cs.harvard.edu", "galkaplun@g.harvard.edu", "ybansal@g.harvard.edu", "tristanyang@college.harvard.edu", "b@boazbarak.org", "ilyasu@openai.com"], "title": "Deep Double Descent: Where Bigger Models and More Data Hurt", "original_pdf": "/attachment/51a9c89ab14e0b8ab44670eaabb35678344db624.pdf", "pdf": "/pdf/2313f8e4c1bbcb174b1e34904fbc5f638c589efa.pdf", "abstract": "We show that a variety of modern deep learning tasks exhibit a \"double-descent\" phenomenon where, as we increase model size, performance first gets worse and then gets better. Moreover, we show that double descent occurs not just as a function of model size, but also as a function of the number of training epochs. We unify the above phenomena by defining a new complexity measure we call the effective model complexity, and conjecture a generalized double descent with respect to this measure. Furthermore, our notion of model complexity allows us to identify certain regimes where increasing (even quadrupling) the number of train samples actually hurts test performance.", "full_presentation_video": ""}, "forum": "B1g5sA4twr", "id": "B1g5sA4twr"}, "r1xMH1BtvB": {"content": {"appendix": "", "TL;DR": "A text encoder trained to distinguish real input tokens from plausible fakes efficiently learns effective language representations.", "keywords": ["language modeling", "nlp", "representation learning"], "paperhash": "clark|electra_pretraining_text_encoders_as_discriminators_rather_than_generators", "code": "https://github.com/google-research/electra", "spotlight_video": "", "authorids": ["kevclark@cs.stanford.edu", "thangluong@google.com", "qvl@google.com", "manning@cs.stanford.edu"], "poster": "", "slides": "", "authors": ["Kevin Clark", "Minh-Thang Luong", "Quoc V. Le", "Christopher D. Manning"], "_bibtex": "@inproceedings{\nClark2020ELECTRA:,\ntitle={ELECTRA: Pre-training Text Encoders as Discriminators Rather Than Generators},\nauthor={Kevin Clark and Minh-Thang Luong and Quoc V. Le and Christopher D. 
Manning},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=r1xMH1BtvB}\n}", "original_pdf": "/attachment/2d0c8ee9ae9ca35a655f0b061044bb790347b3f2.pdf", "title": "ELECTRA: Pre-training Text Encoders as Discriminators Rather Than Generators", "pdf": "/pdf/81eb9548d84e4498b2dae9e4355551a1764e8cfe.pdf", "abstract": "Masked language modeling (MLM) pre-training methods such as BERT corrupt the input by replacing some tokens with [MASK] and then train a model to reconstruct the original tokens. While they produce good results when transferred to downstream NLP tasks, they generally require large amounts of compute to be effective. As an alternative, we propose a more sample-efficient pre-training task called replaced token detection. Instead of masking the input, our approach corrupts it by replacing some tokens with plausible alternatives sampled from a small generator network. Then, instead of training a model that predicts the original identities of the corrupted tokens, we train a discriminative model that predicts whether each token in the corrupted input was replaced by a generator sample or not. Thorough experiments demonstrate this new pre-training task is more efficient than MLM because the task is defined over all input tokens rather than just the small subset that was masked out. As a result, the contextual representations learned by our approach substantially outperform the ones learned by BERT given the same model size, data, and compute. The gains are particularly strong for small models; for example, we train a model on one GPU for 4 days that outperforms GPT (trained using 30x more compute) on the GLUE natural language understanding benchmark. Our approach also works well at scale, where it performs comparably to RoBERTa and XLNet while using less than 1/4 of their compute and outperforms them when using the same amount of compute.\n", "full_presentation_video": ""}, "forum": "r1xMH1BtvB", "id": "r1xMH1BtvB"}, "r1lZgyBYwS": {"content": {"appendix": "", "TL;DR": "We scale up lossless compression with latent variables, achieving state of the art on full-size ImageNet images.", "keywords": ["compression", "imagenet", "variational inference"], "paperhash": "townsend|hilloc_lossless_image_compression_with_hierarchical_latent_variable_models", "code": "https://github.com/hilloc-submission/hilloc", "spotlight_video": "", "authorids": ["james.townsend@cs.ucl.ac.uk", "thomas.bird@cs.ucl.ac.uk", "julius.kunze@cs.ucl.ac.uk", "david.barber@ucl.ac.uk"], "poster": "", "slides": "", "authors": ["James Townsend", "Thomas Bird", "Julius Kunze", "David Barber"], "_bibtex": "@inproceedings{\nTownsend2020HiLLoC:,\ntitle={HiLLoC: lossless image compression with hierarchical latent variable models},\nauthor={James Townsend and Thomas Bird and Julius Kunze and David Barber},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=r1lZgyBYwS}\n}", "original_pdf": "/attachment/37b2463721c41e5390ce12ed6a24d81bc1a567c1.pdf", "title": "HiLLoC: lossless image compression with hierarchical latent variable models", "pdf": "/pdf/34119517f309200cc11db1cbe6d3f127b858fbcf.pdf", "abstract": "We make the following striking observation: fully convolutional VAE models trained on 32x32 ImageNet can generalize well, not just to 64x64 but also to far larger photographs, with no changes to the model. 
We use this property, applying fully convolutional models to lossless compression, demonstrating a method to scale the VAE-based 'Bits-Back with ANS' algorithm for lossless compression to large color photographs, and achieving state of the art for compression of full size ImageNet images. We release Craystack, an open source library for convenient prototyping of lossless compression using probabilistic models, along with full implementations of all of our compression results.", "full_presentation_video": ""}, "forum": "r1lZgyBYwS", "id": "r1lZgyBYwS"}, "Bkxe2AVtPS": {"content": {"appendix": "", "TL;DR": "We propose a novel 8-bit format that eliminates the need for loss scaling, stochastic rounding, and other low precision techniques", "keywords": ["fine tuning", "memory", "quantization", "transformer"], "paperhash": "cambier|shifted_and_squeezed_8bit_floating_point_format_for_lowprecision_training_of_deep_neural_networks", "spotlight_video": "", "poster": "", "slides": "", "abstract": "Training with larger number of parameters while keeping fast iterations is an increasingly\nadopted strategy and trend for developing better performing Deep Neural\nNetwork (DNN) models. This necessitates increased memory footprint and\ncomputational requirements for training. Here we introduce a novel methodology\nfor training deep neural networks using 8-bit floating point (FP8) numbers.\nReduced bit precision allows for a larger effective memory and increased computational\nspeed. We name this method Shifted and Squeezed FP8 (S2FP8). We\nshow that, unlike previous 8-bit precision training methods, the proposed method\nworks out of the box for representative models: ResNet50, Transformer and NCF.\nThe method can maintain model accuracy without requiring fine-tuning loss scaling\nparameters or keeping certain layers in single precision. We introduce two\nlearnable statistics of the DNN tensors - shifted and squeezed factors that are used\nto optimally adjust the range of the tensors in 8-bits, thus minimizing the loss in\ninformation due to quantization.", "_bibtex": "@inproceedings{\nCambier2020Shifted,\ntitle={Shifted and Squeezed 8-bit Floating Point format for Low-Precision Training of Deep Neural Networks},\nauthor={Leopold Cambier and Anahita Bhiwandiwalla and Ting Gong and Oguz H. Elibol and Mehran Nekuii and Hanlin Tang},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=Bkxe2AVtPS}\n}", "authorids": ["lcambier@stanford.edu", "anahita.bhiwandiwalla@intel.com", "ting.gong@intel.com", "oguz.h.elibol@intel.com", "mehran.nekuii@intel.com", "hanlin.tang@intel.com"], "title": "Shifted and Squeezed 8-bit Floating Point format for Low-Precision Training of Deep Neural Networks", "authors": ["Leopold Cambier", "Anahita Bhiwandiwalla", "Ting Gong", "Oguz H. 
Elibol", "Mehran Nekuii", "Hanlin Tang"], "original_pdf": "/attachment/2be0cad4eec64e32c61f44462fe495cc1c971d44.pdf", "pdf": "/pdf/94b825d79e3c9748e18b7e535e5ec86d18a625c9.pdf", "full_presentation_video": ""}, "forum": "Bkxe2AVtPS", "id": "Bkxe2AVtPS"}, "SJxstlHFPH": {"content": {"appendix": "", "TL;DR": "Differentiable multi-hop access to a textual knowledge base of indexed contextual representations", "keywords": ["multi hop qa", "nlp", "question answering", "reasoning"], "paperhash": "dhingra|differentiable_reasoning_over_a_virtual_knowledge_base", "code": "http://www.cs.cmu.edu/~bdhingra/pages/drkit.html", "spotlight_video": "", "authorids": ["bdhingra@andrew.cmu.edu", "manzilzaheer@google.com", "vbalacha@andrew.cmu.edu", "gneubig@cs.cmu.edu", "rsalakhu@cs.cmu.edu", "wcohen@google.com"], "poster": "", "slides": "", "authors": ["Bhuwan Dhingra", "Manzil Zaheer", "Vidhisha Balachandran", "Graham Neubig", "Ruslan Salakhutdinov", "William W. Cohen"], "_bibtex": "@inproceedings{\nDhingra2020Differentiable,\ntitle={Differentiable Reasoning over a Virtual Knowledge Base},\nauthor={Bhuwan Dhingra and Manzil Zaheer and Vidhisha Balachandran and Graham Neubig and Ruslan Salakhutdinov and William W. Cohen},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=SJxstlHFPH}\n}", "original_pdf": "/attachment/4bfb75495946426c81b66cf83f73c8be4ef6c97d.pdf", "title": "Differentiable Reasoning over a Virtual Knowledge Base", "pdf": "/pdf/71ad1c4dd7040b1e293243632ab89138be6be4d0.pdf", "abstract": "We consider the task of answering complex multi-hop questions using a corpus as a virtual knowledge base (KB). In particular, we describe a neural module, DrKIT, that traverses textual data like a KB, softly following paths of relations between mentions of entities in the corpus. At each step the module uses a combination of sparse-matrix TFIDF indices and a maximum inner product search (MIPS) on a special index of contextual representations of the mentions. This module is differentiable, so the full system can be trained end-to-end using gradient based methods, starting from natural language inputs. We also describe a pretraining scheme for the contextual representation encoder by generating hard negative examples using existing knowledge bases. We show that DrKIT improves accuracy by 9 points on 3-hop questions in the MetaQA dataset, cutting the gap between text-based and KB-based state-of-the-art by 70%. On HotpotQA, DrKIT leads to a 10% improvement over a BERT-based re-ranking approach to retrieving the relevant passages required to answer a question. 
DrKIT is also very efficient, processing up to 10-100x more queries per second than existing multi-hop systems.", "full_presentation_video": ""}, "forum": "SJxstlHFPH", "id": "SJxstlHFPH"}, "rJxGLlBtwH": {"content": {"appendix": "", "keywords": ["nlp", "sample efficiency"], "paperhash": "lowe|on_the_interaction_between_supervision_and_selfplay_in_emergent_communication", "code": "https://github.com/backpropper/s2p", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Ryan Lowe*", "Abhinav Gupta*", "Jakob Foerster", "Douwe Kiela", "Joelle Pineau"], "_bibtex": "@inproceedings{\nLowe*2020On,\ntitle={On the interaction between supervision and self-play in emergent communication},\nauthor={Ryan Lowe* and Abhinav Gupta* and Jakob Foerster and Douwe Kiela and Joelle Pineau},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=rJxGLlBtwH}\n}", "authorids": ["rlowe1@cs.mcgill.ca", "abhinav.gupta@umontreal.ca", "jakobfoerster@gmail.com", "dkiela@fb.com", "jpineau@cs.mcgill.ca"], "title": "On the interaction between supervision and self-play in emergent communication", "original_pdf": "/attachment/2ac50412acb3744ef0ec33f3d0dcc778401ee434.pdf", "pdf": "/pdf/2d5f96a3d23721ee0d9ff70b5f2103ef49ad2ad9.pdf", "abstract": "A promising approach for teaching artificial agents to use natural language involves using human-in-the-loop training. However, recent work suggests that current machine learning methods are too data inefficient to be trained in this way from scratch. In this paper, we investigate the relationship between two categories of learning signals with the ultimate goal of improving sample efficiency: imitating human language data via supervised learning, and maximizing reward in a simulated multi-agent environment via self-play (as done in emergent communication), and introduce the term supervised self-play (S2P) for algorithms using both of these signals. We find that first training agents via supervised learning on human data followed by self-play outperforms the converse, suggesting that it is not beneficial to emerge languages from scratch. We then empirically investigate various S2P schedules that begin with supervised learning in two environments: a Lewis signaling game with symbolic inputs, and an image-based referential game with natural language descriptions. 
Lastly, we introduce population based approaches to S2P, which further improves the performance over single-agent methods.", "full_presentation_video": ""}, "forum": "rJxGLlBtwH", "id": "rJxGLlBtwH"}, "SylzhkBtDB": {"content": {"appendix": "", "TL;DR": "A Theoretical Study of Multi-Task Learning with Practical Implications for Improving Multi-Task Training and Transfer Learning", "keywords": ["multi task learning", "robustness", "transfer learning"], "paperhash": "wu|understanding_and_improving_information_transfer_in_multitask_learning", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Sen Wu", "Hongyang Zhang", "Christopher R\u00e9"], "_bibtex": "@inproceedings{\nWu2020Understanding,\ntitle={Understanding and Improving Information Transfer in Multi-Task Learning},\nauthor={Sen Wu and Hongyang Zhang and Christopher R\u00e9},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=SylzhkBtDB}\n}", "authorids": ["senwu@cs.stanford.edu", "hongyang@cs.stanford.edu", "chrismre@stanford.edu"], "title": "Understanding and Improving Information Transfer in Multi-Task Learning", "original_pdf": "/attachment/3f3aa09455a2751180827cdca5fb7ab3998b6393.pdf", "pdf": "/pdf/11da0a955a55bf4fd3731d8bad0984ac216801cf.pdf", "abstract": "We investigate multi-task learning approaches which use a shared feature representation for all tasks. To better understand the transfer of task information, we study an architecture with a shared module for all tasks and a separate output module for each task. We study the theory of this setting on linear and ReLU-activated models. Our key observation is that whether or not tasks' data are well-aligned can significantly affect the performance of multi-task learning. We show that misalignment between task data can cause negative transfer (or hurt performance) and provide sufficient conditions for positive transfer. Inspired by the theoretical insights, we show that aligning tasks' embedding layers leads to performance gains for multi-task training and transfer learning on the GLUE benchmark and sentiment analysis tasks; for example, we obtained a 2.35% GLUE score average improvement on 5 GLUE tasks over BERT LARGE using our alignment method. We also design an SVD-based task re-weighting scheme and show that it improves the robustness of multi-task training on a multi-label image dataset.", "full_presentation_video": ""}, "forum": "SylzhkBtDB", "id": "SylzhkBtDB"}, "Sye57xStvB": {"content": {"appendix": "", "TL;DR": "We propose a reinforcement learning agent to solve hard exploration games by learning a range of directed exploratory policies. ", "keywords": ["distributed", "episodic memory", "intrinsic motivation", "memory", "reinforcement learning"], "paperhash": "badia|never_give_up_learning_directed_exploration_strategies", "spotlight_video": "", "poster": "", "slides": "", "abstract": "We propose a reinforcement learning agent to solve hard exploration games by learning a range of directed exploratory policies. We construct an episodic memory-based intrinsic reward using k-nearest neighbors over the agent's recent experience to train the directed exploratory policies, thereby encouraging the agent to repeatedly revisit all states in its environment. A self-supervised inverse dynamics model is used to train the embeddings of the nearest neighbour lookup, biasing the novelty signal towards what the agent can control. 
We employ the framework of Universal Value Function Approximators to simultaneously learn many directed exploration policies with the same neural network, with different trade-offs between exploration and exploitation. By using the same neural network for different degrees of exploration/exploitation, transfer is demonstrated from predominantly exploratory policies yielding effective exploitative policies. The proposed method can be incorporated to run with modern distributed RL agents that collect large amounts of experience from many actors running in parallel on separate environment instances. Our method doubles the performance of the base agent in all hard exploration in the Atari-57 suite while maintaining a very high score across the remaining games, obtaining a median human normalised score of 1344.0%. Notably, the proposed method is the first algorithm to achieve non-zero rewards (with a mean score of 8,400) in the game of Pitfall! without using demonstrations or hand-crafted features.", "_bibtex": "@inproceedings{\nBadia2020Never,\ntitle={Never Give Up: Learning Directed Exploration Strategies},\nauthor={Adri\u00e0 Puigdom\u00e8nech Badia and Pablo Sprechmann and Alex Vitvitskyi and Daniel Guo and Bilal Piot and Steven Kapturowski and Olivier Tieleman and Martin Arjovsky and Alexander Pritzel and Andrew Bolt and Charles Blundell},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=Sye57xStvB}\n}", "authorids": ["adriap@google.com", "psprechmann@google.com", "avlife@google.com", "danielguo@google.com", "piot@google.com", "skapturowski@google.com", "tieleman@google.com", "martinarjovsky@gmail.com", "apritzel@google.com", "abolt@google.com", "cblundell@google.com"], "title": "Never Give Up: Learning Directed Exploration Strategies", "authors": ["Adri\u00e0 Puigdom\u00e8nech Badia", "Pablo Sprechmann", "Alex Vitvitskyi", "Daniel Guo", "Bilal Piot", "Steven Kapturowski", "Olivier Tieleman", "Martin Arjovsky", "Alexander Pritzel", "Andrew Bolt", "Charles Blundell"], "original_pdf": "/attachment/3584b6d48b35b86df046a85543fa07ed417f3065.pdf", "pdf": "/pdf/ae4c36f07c0395c529c8c3c1a8001f9230d923ae.pdf", "full_presentation_video": ""}, "forum": "Sye57xStvB", "id": "Sye57xStvB"}, "rkgMkCEtPB": {"content": {"appendix": "", "TL;DR": "The success of MAML relies on feature reuse from the meta-initialization, which also yields a natural simplification of the algorithm, with the inner loop removed for the network body, as well as other insights on the head and body.", "keywords": ["fewshot learning", "meta learning", "optimization", "representation learning"], "paperhash": "raghu|rapid_learning_or_feature_reuse_towards_understanding_the_effectiveness_of_maml", "spotlight_video": "", "poster": "", "slides": "", "abstract": "An important research direction in machine learning has centered around developing meta-learning algorithms to tackle few-shot learning. An especially successful algorithm has been Model Agnostic Meta-Learning (MAML), a method that consists of two optimization loops, with the outer loop finding a meta-initialization, from which the inner loop can efficiently learn new tasks. Despite MAML's popularity, a fundamental open question remains -- is the effectiveness of MAML due to the meta-initialization being primed for rapid learning (large, efficient changes in the representations) or due to feature reuse, with the meta initialization already containing high quality features? 
We investigate this question, via ablation studies and analysis of the latent representations, finding that feature reuse is the dominant factor. This leads to the ANIL (Almost No Inner Loop) algorithm, a simplification of MAML where we remove the inner loop for all but the (task-specific) head of the underlying neural network. ANIL matches MAML's performance on benchmark few-shot image classification and RL and offers computational improvements over MAML. We further study the precise contributions of the head and body of the network, showing that performance on the test tasks is entirely determined by the quality of the learned features, and we can remove even the head of the network (the NIL algorithm). We conclude with a discussion of the rapid learning vs feature reuse question for meta-learning algorithms more broadly.", "_bibtex": "@inproceedings{\nRaghu2020Rapid,\ntitle={Rapid Learning or Feature Reuse? Towards Understanding the Effectiveness of MAML},\nauthor={Aniruddh Raghu and Maithra Raghu and Samy Bengio and Oriol Vinyals},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=rkgMkCEtPB}\n}", "authorids": ["aniruddhraghu@gmail.com", "maithrar@gmail.com", "bengio@google.com", "vinyals@google.com"], "title": "Rapid Learning or Feature Reuse? Towards Understanding the Effectiveness of MAML", "authors": ["Aniruddh Raghu", "Maithra Raghu", "Samy Bengio", "Oriol Vinyals"], "original_pdf": "/attachment/36be6c8e943e59a47af8e489b2970b162c59f1f4.pdf", "pdf": "/pdf/f0530e2cf88af3b74bf61bc8591b7a5a1339c49e.pdf", "full_presentation_video": ""}, "forum": "rkgMkCEtPB", "id": "rkgMkCEtPB"}, "HklUCCVKDB": {"content": {"appendix": "", "TL;DR": "A regularization-based approach for continual learning using Bayesian neural networks to predict parameters' importance", "keywords": ["capacity", "catastrophic forgetting", "continual learning", "learning rate", "pruning", "regularization", "uncertainty"], "paperhash": "ebrahimi|uncertaintyguided_continual_learning_with_bayesian_neural_networks", "code": "https://github.com/SaynaEbrahimi/UCB", "spotlight_video": "", "authorids": ["sayna@berkeley.edu", "mohamed.elhoseiny@gmail.com", "trevor@eecs.berkeley.edu", "maroffm@gmail.com"], "poster": "", "slides": "", "authors": ["Sayna Ebrahimi", "Mohamed Elhoseiny", "Trevor Darrell", "Marcus Rohrbach"], "_bibtex": "@inproceedings{\nEbrahimi2020Uncertainty-guided,\ntitle={Uncertainty-guided Continual Learning with Bayesian Neural Networks},\nauthor={Sayna Ebrahimi and Mohamed Elhoseiny and Trevor Darrell and Marcus Rohrbach},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=HklUCCVKDB}\n}", "original_pdf": "/attachment/979b981c5cb54fdde94cd97ba44dbeff4c318af2.pdf", "title": "Uncertainty-guided Continual Learning with Bayesian Neural Networks", "pdf": "/pdf/8ccdcfeb98a76b5cd86b24aac86dad75b579af1a.pdf", "abstract": "Continual learning aims to learn new tasks without forgetting previously learned ones. This is especially challenging when one cannot access data from previous tasks and when the model has a fixed capacity. Current regularization-based continual learning algorithms need an external representation and extra computation to measure the parameters' \\textit{importance}. 
In contrast, we propose Uncertainty-guided Continual Bayesian Neural Networks (UCB), where the learning rate adapts according to the uncertainty defined in the probability distribution of the weights in networks. Uncertainty is a natural way to identify \\textit{what to remember} and \\textit{what to change} as we continually learn, and thus mitigate catastrophic forgetting. We also show a variant of our model, which uses uncertainty for weight pruning \nand retains task performance after pruning by saving binary masks per tasks. We evaluate our UCB approach extensively on diverse object classification datasets with short and long sequences of tasks and report superior or on-par performance compared to existing approaches. Additionally, we show that our model does not necessarily need task information at test time, i.e. it does not presume knowledge of which task a sample belongs to.", "full_presentation_video": ""}, "forum": "HklUCCVKDB", "id": "HklUCCVKDB"}, "Hke0V1rKPS": {"content": {"appendix": "", "TL;DR": "We show that training classifiers to produce salient input Jacobian matrices with a GAN-like regularization can boost adversarial robustness.", "keywords": ["adversarial", "perturbation", "robustness"], "paperhash": "chan|jacobian_adversarially_regularized_networks_for_robustness", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Alvin Chan", "Yi Tay", "Yew Soon Ong", "Jie Fu"], "_bibtex": "@inproceedings{\nChan2020Jacobian,\ntitle={Jacobian Adversarially Regularized Networks for Robustness},\nauthor={Alvin Chan and Yi Tay and Yew Soon Ong and Jie Fu},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=Hke0V1rKPS}\n}", "authorids": ["guoweial001@e.ntu.edu.sg", "ytay017@e.ntu.edu.sg", "asysong@ntu.edu.sg", "jie.fu@polymtl.ca"], "title": "Jacobian Adversarially Regularized Networks for Robustness", "original_pdf": "/attachment/d4be400372717ad63adc592e4323630726cefade.pdf", "pdf": "/pdf/b71a044605b9fb104c10c4aeff0af76763e44138.pdf", "abstract": "Adversarial examples are crafted with imperceptible perturbations with the intent to fool neural networks. Against such attacks, adversarial training and its variants stand as the strongest defense to date. Previous studies have pointed out that robust models that have undergone adversarial training tend to produce more salient and interpretable Jacobian matrices than their non-robust counterparts. A natural question is whether a model trained with an objective to produce salient Jacobian can result in better robustness. This paper answers this question with affirmative empirical results. We propose Jacobian Adversarially Regularized Networks (JARN) as a method to optimize the saliency of a classifier's Jacobian by adversarially regularizing the model's Jacobian to resemble natural training images. Image classifiers trained with JARN show improved robust accuracy compared to standard models on the MNIST, SVHN and CIFAR-10 datasets, uncovering a new angle to boost robustness without using adversarial training.", "full_presentation_video": ""}, "forum": "Hke0V1rKPS", "id": "Hke0V1rKPS"}, "HJgCF0VFwr": {"content": {"appendix": "", "keywords": ["compression", "pruning"], "paperhash": "xing|probabilistic_connection_importance_inference_and_lossless_compression_of_deep_neural_networks", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Xin Xing", "Long Sha", "Pengyu Hong", "Zuofeng Shang", "Jun S. 
Liu"], "_bibtex": "@inproceedings{\nXing2020Probabilistic,\ntitle={Probabilistic Connection Importance Inference and Lossless Compression of Deep Neural Networks},\nauthor={Xin Xing and Long Sha and Pengyu Hong and Zuofeng Shang and Jun S. Liu},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=HJgCF0VFwr}\n}", "authorids": ["xin_xing@fas.harvard.edu", "longsha@brandeis.edu", "hongpeng@brandeis.edu", "zuofeng.shang@njit.edu", "jliu@stat.harvard.edu"], "title": "Probabilistic Connection Importance Inference and Lossless Compression of Deep Neural Networks", "original_pdf": "/attachment/1fa5e83278ec3558d7208ba74708754f8756610b.pdf", "pdf": "/pdf/0b4d4acde853a70c2593060c34e9c3d57e83700f.pdf", "abstract": "Deep neural networks (DNNs) can be huge in size, requiring a considerable amount of energy and computational resources to operate, which limits their applications in numerous scenarios. It is thus of interest to compress DNNs while maintaining their performance levels. We here propose a probabilistic importance inference approach for pruning DNNs. Specifically, we test the significance of the relevance of a connection in a DNN to the DNN\u2019s outputs using a nonparametric scoring test and keep only those significant ones. Experimental results show that the proposed approach achieves better lossless compression rates than existing techniques.", "full_presentation_video": ""}, "forum": "HJgCF0VFwr", "id": "HJgCF0VFwr"}, "ryeFY0EFwS": {"content": {"appendix": "", "TL;DR": "We propose a hypothesis for why gradient descent generalizes based on how per-example gradients interact with each other.", "keywords": ["generalization", "gradient descent", "optimization", "overfitting"], "paperhash": "chatterjee|coherent_gradients_an_approach_to_understanding_generalization_in_gradient_descentbased_optimization", "spotlight_video": "", "poster": "", "slides": "", "abstract": "An open question in the Deep Learning community is why neural networks trained with Gradient Descent generalize well on real datasets even though they are capable of fitting random data. We propose an approach to answering this question based on a hypothesis about the dynamics of gradient descent that we call Coherent Gradients: Gradients from similar examples are similar and so the overall gradient is stronger in certain directions where these reinforce each other. Thus changes to the network parameters during training are biased towards those that (locally) simultaneously benefit many examples when such similarity exists. We support this hypothesis with heuristic arguments and perturbative experiments and outline how this can explain several common empirical observations about Deep Learning. Furthermore, our analysis is not just descriptive, but prescriptive. 
It suggests a natural modification to gradient descent that can greatly reduce overfitting.", "_bibtex": "@inproceedings{\nChatterjee2020Coherent,\ntitle={Coherent Gradients: An Approach to Understanding Generalization in Gradient Descent-based Optimization},\nauthor={Satrajit Chatterjee},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=ryeFY0EFwS}\n}", "authorids": ["satrajit@gmail.com"], "title": "Coherent Gradients: An Approach to Understanding Generalization in Gradient Descent-based Optimization", "authors": ["Satrajit Chatterjee"], "original_pdf": "/attachment/8d30a4d2b9cceb4df96357e398486aae26d667f3.pdf", "pdf": "/pdf/2484d08967b6883dd5824cccfb67216ee65d38b6.pdf", "full_presentation_video": ""}, "forum": "ryeFY0EFwS", "id": "ryeFY0EFwS"}, "ByglLlHFDS": {"content": {"appendix": "", "TL;DR": "A novel, non-adversarial, approach to learn latent variable models in general and mixture models in particular by computing the I-Projection solely based on samples.", "keywords": ["density estimation", "optimization"], "paperhash": "becker|expected_information_maximization_using_the_iprojection_for_mixture_density_estimation", "code": "https://github.com/pbecker93/ExpectedInformationMaximization", "spotlight_video": "", "authorids": ["philippbecker93@googlemail.com", "oleg@robot-learning.de", "geri@robot-learning.de"], "poster": "", "slides": "", "authors": ["Philipp Becker", "Oleg Arenz", "Gerhard Neumann"], "_bibtex": "@inproceedings{\nBecker2020Expected,\ntitle={Expected Information Maximization: Using the I-Projection for Mixture Density Estimation},\nauthor={Philipp Becker and Oleg Arenz and Gerhard Neumann},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=ByglLlHFDS}\n}", "original_pdf": "/attachment/68aaf6390c5d75bd5aa233ff08abfa71c61eeb7c.pdf", "title": "Expected Information Maximization: Using the I-Projection for Mixture Density Estimation", "pdf": "/pdf/11a3505e1ab90cc3bf2a00810d8210a8eebc12c9.pdf", "abstract": "Modelling highly multi-modal data is a challenging problem in machine learning. Most algorithms are based on maximizing the likelihood, which corresponds to the M(oment)-projection of the data distribution to the model distribution.\nThe M-projection forces the model to average over modes it cannot represent. In contrast, the I(nformation)-projection ignores such modes in the data and concentrates on the modes the model can represent. Such behavior is appealing whenever we deal with highly multi-modal data where modelling single modes correctly is more important than covering all the modes. Despite this advantage, the I-projection is rarely used in practice due to the lack of algorithms that can efficiently optimize it based on data. In this work, we present a new algorithm called Expected Information Maximization (EIM) for computing the I-projection solely based on samples for general latent variable models, where we focus on Gaussian mixtures models and Gaussian mixtures of experts. Our approach applies a variational upper bound to the I-projection objective which decomposes the original objective into single objectives for each mixture component as well as for the coefficients, allowing an efficient optimization. Similar to GANs, our approach employs discriminators but uses a more stable optimization procedure, using a tight upper bound. 
We show that our algorithm is much more effective in computing the I-projection than recent GAN approaches and we illustrate the effectiveness of our approach for modelling multi-modal behavior on two pedestrian and traffic prediction datasets. ", "full_presentation_video": ""}, "forum": "ByglLlHFDS", "id": "ByglLlHFDS"}, "ryxC6kSYPr": {"content": {"appendix": "", "keywords": ["imitation learning", "optimization"], "paperhash": "east|infinitehorizon_differentiable_model_predictive_control", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Sebastian East", "Marco Gallieri", "Jonathan Masci", "Jan Koutnik", "Mark Cannon"], "_bibtex": "@inproceedings{\nEast2020Infinite-Horizon,\ntitle={Infinite-Horizon Differentiable Model Predictive Control},\nauthor={Sebastian East and Marco Gallieri and Jonathan Masci and Jan Koutnik and Mark Cannon},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=ryxC6kSYPr}\n}", "authorids": ["sebastian.east@bath.edu", "marco@nnaisense.com", "jonathan@nnaisense.com", "jan@nnaisense.com", "mark.cannon@eng.ox.ac.uk"], "title": "Infinite-Horizon Differentiable Model Predictive Control", "original_pdf": "/attachment/3098bd9c0f939fb0bbdebf1b1404575ea38682e3.pdf", "pdf": "/pdf/2143d86b80f501eb62418b2b12ec31c9893aec77.pdf", "abstract": "This paper proposes a differentiable linear quadratic Model Predictive Control (MPC) framework for safe imitation learning. The infinite-horizon cost is enforced using a terminal cost function obtained from the discrete-time algebraic Riccati equation (DARE), so that the learned controller can be proven to be stabilizing in closed-loop. A central contribution is the derivation of the analytical derivative of the solution of the DARE, thereby allowing the use of differentiation-based learning methods. A further contribution is the structure of the MPC optimization problem: an augmented Lagrangian method ensures that the MPC optimization is feasible throughout training whilst enforcing hard constraints on state and input, and a pre-stabilizing controller ensures that the MPC solution and derivatives are accurate at each iteration. The learning capabilities of the framework are demonstrated in a set of numerical studies. ", "full_presentation_video": ""}, "forum": "ryxC6kSYPr", "id": "ryxC6kSYPr"}, "rkgU1gHtvr": {"content": {"appendix": "", "TL;DR": "A new partially policy-agnostic method for infinite-horizon off-policy policy evalution with multiple known or unknown behavior policies.", "keywords": ["importance sampling", "off policy policy evaluation", "variance reduction"], "paperhash": "chen|infinitehorizon_offpolicy_policy_evaluation_with_multiple_behavior_policies", "spotlight_video": "", "poster": "", "slides": "", "abstract": "We consider off-policy policy evaluation when the trajectory data are generated by multiple behavior policies. Recent work has shown the key role played by the state or state-action stationary distribution corrections in the infinite horizon context for off-policy policy evaluation. We propose estimated mixture policy (EMP), a novel class of partially policy-agnostic methods to accurately estimate those quantities. With careful analysis, we show that EMP gives rise to estimates with reduced variance for estimating the state stationary distribution correction while it also offers a useful induction bias for estimating the state-action stationary distribution correction. 
In extensive experiments with both continuous and discrete environments, we demonstrate that our algorithm offers significantly improved accuracy compared to the state-of-the-art methods.", "_bibtex": "@inproceedings{\nChen2020Infinite-horizon,\ntitle={Infinite-horizon Off-Policy Policy Evaluation with Multiple Behavior Policies},\nauthor={Xinyun Chen and Lu Wang and Yizhe Hang and Heng Ge and Hongyuan Zha},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=rkgU1gHtvr}\n}", "authorids": ["chenxinyun@cuhk.edu.cn", "luwang@stu.ecnu.edu.cn", "hangyhan@mail.ustc.edu.cn", "hengge@mail.sdu.edu.cn", "zhahy@cuhk.edu.cn"], "title": "Infinite-horizon Off-Policy Policy Evaluation with Multiple Behavior Policies", "authors": ["Xinyun Chen", "Lu Wang", "Yizhe Hang", "Heng Ge", "Hongyuan Zha"], "original_pdf": "/attachment/6ed285eef23f2ccb5571eaa9b0e2d7fec46706aa.pdf", "pdf": "/pdf/7efb165a914c892ca47ef91f33ab01dc593aa66a.pdf", "full_presentation_video": ""}, "forum": "rkgU1gHtvr", "id": "rkgU1gHtvr"}, "r1evOhEKvH": {"content": {"appendix": "", "TL;DR": " We propose a novel graph inference learning framework by building structure relations to infer unknown node labels from those labeled nodes in an end-to-end way.", "keywords": ["optimization"], "paperhash": "xu|graph_inference_learning_for_semisupervised_classification", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Chunyan Xu", "Zhen Cui", "Xiaobin Hong", "Tong Zhang", "Jian Yang", "Wei Liu"], "_bibtex": "@inproceedings{\nXu2020Graph,\ntitle={Graph inference learning for semi-supervised classification},\nauthor={Chunyan Xu and Zhen Cui and Xiaobin Hong and Tong Zhang and Jian Yang and Wei Liu},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=r1evOhEKvH}\n}", "authorids": ["cyx@njust.edu.cn", "zhen.cui@njust.edu.cn", "xbhong@njust.edu.cn", "tong.zhang@njust.edu.cn", "csjyang@njust.edu.cn", "wl2223@columbia.edu"], "title": "Graph inference learning for semi-supervised classification", "original_pdf": "/attachment/244c7f8d27fdee11a99bc8c7817ab80db42492a2.pdf", "pdf": "/pdf/03d35e4c92edd9cb8ba89a592f7fe50ec7b372eb.pdf", "abstract": "In this work, we address the semi-supervised classification of graph data, where the categories of those unlabeled nodes are inferred from labeled nodes as well as graph structures. Recent works often solve this problem with the advanced graph convolution in a conventional supervised manner, but the performance could be heavily affected when labeled data is scarce. Here we propose a Graph Inference Learning (GIL) framework to boost the performance of node classification by learning the inference of node labels on graph topology. To bridge the connection of two nodes, we formally define a structure relation by encapsulating node attributes, between-node paths and local topological structures together, which can make inference conveniently deduced from one node to another node. For learning the inference process, we further introduce meta-optimization on structure relations from training nodes to validation nodes, such that the learnt graph inference capability can be better self-adapted into test nodes. 
Comprehensive evaluations on four benchmark datasets (including Cora, Citeseer, Pubmed and NELL) demonstrate the superiority of our GIL when compared with other state-of-the-art methods in the semi-supervised node classification task.", "full_presentation_video": ""}, "forum": "r1evOhEKvH", "id": "r1evOhEKvH"}, "S1xitgHtvS": {"content": {"appendix": "", "TL;DR": "Popular algorithms that cast \"RL as Inference\" ignore the role of uncertainty and exploration. We highlight the importance of these issues and present a coherent framework for RL and inference that handles them gracefully.", "keywords": ["bayesian inference", "reinforcement learning"], "paperhash": "odonoghue|making_sense_of_reinforcement_learning_and_probabilistic_inference", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Brendan O'Donoghue", "Ian Osband", "Catalin Ionescu"], "_bibtex": "@inproceedings{\nO'Donoghue2020Making,\ntitle={Making Sense of Reinforcement Learning and Probabilistic Inference},\nauthor={Brendan O'Donoghue and Ian Osband and Catalin Ionescu},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=S1xitgHtvS}\n}", "authorids": ["bodonoghue85@gmail.com", "iosband@google.com", "cdi@google.com"], "title": "Making Sense of Reinforcement Learning and Probabilistic Inference", "original_pdf": "/attachment/63cd6eb8ab473633219b0b33e08da14b208c7462.pdf", "pdf": "/pdf/2f653b1fba689d249ca2d622778d13d1840816eb.pdf", "abstract": "Reinforcement learning (RL) combines a control problem with statistical estimation: The system dynamics are not known to the agent, but can be learned through experience. A recent line of research casts \u2018RL as inference\u2019 and suggests a particular framework to generalize the RL problem as probabilistic inference. Our paper surfaces a key shortcoming in that approach, and clarifies the sense in which RL can be coherently cast as an inference problem. In particular, an RL agent must consider the effects of its actions upon future rewards and observations: The exploration-exploitation tradeoff. In all but the most simple settings, the resulting inference is computationally intractable so that practical RL algorithms must resort to approximation. We demonstrate that the popular \u2018RL as inference\u2019 approximation can perform poorly in even very basic problems. 
However, we show that with a small modification the framework does yield algorithms that can provably perform well, and we show that the resulting algorithm is equivalent to the recently proposed K-learning, which we further connect with Thompson sampling.\n", "full_presentation_video": ""}, "forum": "S1xitgHtvS", "id": "S1xitgHtvS"}, "S1efxTVYDr": {"content": {"appendix": "", "TL;DR": "We introduce an extra data-dependent Gaussian prior objective to augment the current MLE training, which is designed to capture the prior knowledge in the ground-truth data.", "keywords": ["generation", "machine translation", "nlp", "regularization", "unsupervised"], "paperhash": "li|datadependent_gaussian_prior_objective_for_language_generation", "code": "https://drive.google.com/file/d/1q8PqhF9eOLOHOcOCGVKXtA_OlP6qq2mn", "spotlight_video": "", "authorids": ["charlee@sjtu.edu.cn", "wangrui@nict.go.jp", "khchen@nict.go.jp", "mutiyama@nict.go.jp", "eiichiro.sumita@nict.go.jp", "zhangzs@sjtu.edu.cn", "zhaohai@cs.sjtu.edu.cn"], "poster": "", "slides": "", "authors": ["Zuchao Li", "Rui Wang", "Kehai Chen", "Masso Utiyama", "Eiichiro Sumita", "Zhuosheng Zhang", "Hai Zhao"], "_bibtex": "@inproceedings{\nLi2020Data-dependent,\ntitle={Data-dependent Gaussian Prior Objective for Language Generation},\nauthor={Zuchao Li and Rui Wang and Kehai Chen and Masso Utiyama and Eiichiro Sumita and Zhuosheng Zhang and Hai Zhao},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=S1efxTVYDr}\n}", "original_pdf": "/attachment/ad3ca48556cade5cadff617fc28b40d75e508748.pdf", "title": "Data-dependent Gaussian Prior Objective for Language Generation", "pdf": "/pdf/415754fab241d91cf9f51f235afc8ac851f4bed8.pdf", "abstract": "For typical sequence prediction problems such as language generation, maximum likelihood estimation (MLE) has commonly been adopted as it encourages the predicted sequence most consistent with the ground-truth sequence to have the highest probability of occurring. However, MLE focuses on once-to-all matching between the predicted sequence and gold-standard, consequently treating all incorrect predictions as being equally incorrect. We refer to this drawback as {\it negative diversity ignorance} in this paper. Treating all incorrect predictions as equal unfairly downplays the nuance of these sequences' detailed token-wise structure. To counteract this, we augment the MLE loss by introducing an extra Kullback--Leibler divergence term derived by comparing a data-dependent Gaussian prior and the detailed training prediction. The proposed data-dependent Gaussian prior objective (D2GPo) is defined over a prior topological order of tokens and is poles apart from the data-independent Gaussian prior (L2 regularization) commonly adopted in smoothing the training of MLE. Experimental results show that the proposed method makes effective use of a more detailed prior in the data and has improved performance in typical language generation tasks, including supervised and unsupervised machine translation, text summarization, storytelling, and image captioning.\n", "full_presentation_video": ""}, "forum": "S1efxTVYDr", "id": "S1efxTVYDr"}, "rkgvXlrKwH": {"content": {"appendix": "", "TL;DR": "SEED RL, a scalable and efficient deep reinforcement learning agent with accelerated central inference. State of the art results, reduces cost and can process millions of frames per second. 
", "keywords": ["distributed", "policy gradient", "reinforcement learning", "scalability"], "paperhash": "espeholt|seed_rl_scalable_and_efficient_deeprl_with_accelerated_central_inference", "code": "https://drive.google.com/file/d/144yp7PQf486dmctE2oS2md_qmNBTFbez/view?usp=sharing", "spotlight_video": "", "authorids": ["lespeholt@google.com", "raphaelm@google.com", "stanczyk@google.com", "kewa@google.com", "michalski@google.com"], "poster": "", "slides": "", "authors": ["Lasse Espeholt", "Rapha\u00ebl Marinier", "Piotr Stanczyk", "Ke Wang", "Marcin Michalski\u200e"], "_bibtex": "@inproceedings{\nEspeholt2020SEED,\ntitle={SEED RL: Scalable and Efficient Deep-RL with Accelerated Central Inference},\nauthor={Lasse Espeholt and Rapha\u00ebl Marinier and Piotr Stanczyk and Ke Wang and Marcin Michalski\u200e},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=rkgvXlrKwH}\n}", "original_pdf": "/attachment/c27c18b2100faeacaaa4f2e4a6cf5e2cec421815.pdf", "title": "SEED RL: Scalable and Efficient Deep-RL with Accelerated Central Inference", "pdf": "/pdf/bce334f7e39344120d107d0806e64d13b8c8f874.pdf", "abstract": "We present a modern scalable reinforcement learning agent called SEED (Scalable, Efficient Deep-RL). By effectively utilizing modern accelerators, we show that it is not only possible to train on millions of frames per second but also to lower the cost of experiments compared to current methods. We achieve this with a simple architecture that features centralized inference and an optimized communication layer. SEED adopts two state-of-the-art distributed algorithms, IMPALA/V-trace (policy gradients) and R2D2 (Q-learning), and is evaluated on Atari-57, DeepMind Lab and Google Research Football. We improve the state of the art on Football and are able to reach state of the art on Atari-57 twice as fast in wall-time. For the scenarios we consider, a 40% to 80% cost reduction for running experiments is achieved. The implementation along with experiments is open-sourced so results can be reproduced and novel ideas tried out.", "full_presentation_video": ""}, "forum": "rkgvXlrKwH", "id": "rkgvXlrKwH"}, "H1lmyRNFvr": {"content": {"appendix": "", "TL;DR": "Tackling inverse design via genetic algorithms augmented with deep neural networks. ", "keywords": ["generative models", "interpretability", "optimization"], "paperhash": "nigam|augmenting_genetic_algorithms_with_deep_neural_networks_for_exploring_the_chemical_space", "code": "https://github.com/aspuru-guzik-group/GA", "spotlight_video": "", "authorids": ["akshat.nigam@mail.utoronto.ca", "pascal.friederich@utoronto.ca", "mario.krenn@utoronto.ca", "alan@aspuru.com"], "poster": "", "slides": "", "authors": ["AkshatKumar Nigam", "Pascal Friederich", "Mario Krenn", "Alan Aspuru-Guzik"], "_bibtex": "@inproceedings{\nNigam2020Augmenting,\ntitle={Augmenting Genetic Algorithms with Deep Neural Networks for Exploring the Chemical Space},\nauthor={AkshatKumar Nigam and Pascal Friederich and Mario Krenn and Alan Aspuru-Guzik},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=H1lmyRNFvr}\n}", "original_pdf": "/attachment/ad95d7dc82482e35e1f88d155475cceadfb36506.pdf", "title": "Augmenting Genetic Algorithms with Deep Neural Networks for Exploring the Chemical Space", "pdf": "/pdf/cc77b7d7b507fb64261f05a9db656bb3121d7aad.pdf", "abstract": "Challenges in natural sciences can often be phrased as optimization problems. 
Machine learning techniques have recently been applied to solve such problems. One example in chemistry is the design of tailor-made organic materials and molecules, which requires efficient methods to explore the chemical space. We present a genetic algorithm (GA) that is enhanced with a neural network (DNN) based discriminator model to improve the diversity of generated molecules and at the same time steer the GA. We show that our algorithm outperforms other generative models in optimization tasks. We furthermore present a way to increase interpretability of genetic algorithms, which helped us to derive design principles", "full_presentation_video": ""}, "forum": "H1lmyRNFvr", "id": "H1lmyRNFvr"}, "rJe4_xSFDB": {"content": {"appendix": "", "TL;DR": "LP-based upper bounds on the Lipschitz constant of Neural Networks", "keywords": ["optimization"], "paperhash": "latorre|lipschitz_constant_estimation_of_neural_networks_via_sparse_polynomial_optimization", "code": "https://drive.google.com/drive/folders/1bkj0H6Thgd9sjRloyq9NBP0uO0v704E9?usp=sharing", "spotlight_video": "", "authorids": ["fabian.latorre@epfl.ch", "paul.rolland@epfl.ch", "volkan.cevher@epfl.ch"], "poster": "", "slides": "", "authors": ["Fabian Latorre", "Paul Rolland", "Volkan Cevher"], "_bibtex": "@inproceedings{\nLatorre2020Lipschitz,\ntitle={Lipschitz constant estimation of Neural Networks via sparse polynomial optimization},\nauthor={Fabian Latorre and Paul Rolland and Volkan Cevher},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=rJe4_xSFDB}\n}", "original_pdf": "/attachment/c075a98a8e5b44c6419ec464560a934415fe6866.pdf", "title": "Lipschitz constant estimation of Neural Networks via sparse polynomial optimization", "pdf": "/pdf/1955c3585c78c18fe3ce133f94802e9b47f80307.pdf", "abstract": "We introduce LiPopt, a polynomial optimization framework for computing increasingly tighter upper bound on the Lipschitz constant of neural networks. The underlying optimization problems boil down to either linear (LP) or semidefinite (SDP) programming. We show how to use the sparse connectivity of a network, to significantly reduce the complexity of computation. This is specially useful for convolutional as well as pruned neural networks. We conduct experiments on networks with random weights as well as networks trained on MNIST, showing that in the particular case of the $\\ell_\\infty$-Lipschitz constant, our approach yields superior estimates as compared to other baselines available in the literature.\n", "full_presentation_video": ""}, "forum": "rJe4_xSFDB", "id": "rJe4_xSFDB"}, "rkeIq2VYPr": {"content": {"appendix": "", "TL;DR": "We proposed a specific back-propagation method via proper spectral sub-gradient to integrate determinantal point process to deep learning framework.", "keywords": ["computer vision", "optimization", "stability"], "paperhash": "yu|deep_learning_of_determinantal_point_processes_via_proper_spectral_subgradient", "spotlight_video": "", "poster": "", "slides": "", "abstract": "Determinantal point processes (DPPs) is an effective tool to deliver diversity on multiple machine learning and computer vision tasks. Under deep learning framework, DPP is typically optimized via approximation, which is not straightforward and has some conflict with diversity requirement. We note, however, there has been no deep learning paradigms to optimize DPP directly since it involves matrix inversion which may result in highly computational instability. 
This fact greatly hinders the wide use of DPP on some specific objectives where DPP serves as a term to measure the feature diversity. In this paper, we devise a simple but effective algorithm to address this issue to optimize DPP term directly expressed with L-ensemble in spectral domain over gram matrix, which is more flexible than learning on parametric kernels. By further taking into account some geometric constraints, our algorithm seeks to generate valid sub-gradients of DPP term in case when the DPP gram matrix is not invertible (no gradients exist in this case). In this sense, our algorithm can be easily incorporated with multiple deep learning tasks. Experiments show the effectiveness of our algorithm, indicating promising performance for practical learning problems. ", "_bibtex": "@inproceedings{\nYu2020Deep,\ntitle={Deep Learning of Determinantal Point Processes via Proper Spectral Sub-gradient},\nauthor={Tianshu Yu and Yikang Li and Baoxin Li},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=rkeIq2VYPr}\n}", "authorids": ["tianshuy@asu.edu", "yikang.li@asu.edu", "baoxin.li@asu.edu"], "title": "Deep Learning of Determinantal Point Processes via Proper Spectral Sub-gradient", "authors": ["Tianshu Yu", "Yikang Li", "Baoxin Li"], "original_pdf": "/attachment/507602b4dcf616cd605321c5a9b72b9c66cff880.pdf", "pdf": "/pdf/1fea5d1a4d56edfc0b9b0c8a5b5a15d9f7aa7f43.pdf", "full_presentation_video": ""}, "forum": "rkeIq2VYPr", "id": "rkeIq2VYPr"}, "HyeJf1HKvS": {"content": {"appendix": "", "TL;DR": "We develop a deep graph matching architecture which refines initial correspondences in order to reach neighborhood consensus.", "keywords": ["computer vision", "graph networks", "knowledge graphs"], "paperhash": "fey|deep_graph_matching_consensus", "code": "https://github.com/rusty1s/deep-graph-matching-consensus", "spotlight_video": "", "authorids": ["matthias.fey@tu-dortmund.de", "janeric.lenssen@udo.edu", "christopher.morris@tu-dortmund.de", "jonathan@nnaisense.com", "nils.kriege@tu-dortmund.de"], "poster": "", "slides": "", "authors": ["Matthias Fey", "Jan E. Lenssen", "Christopher Morris", "Jonathan Masci", "Nils M. Kriege"], "_bibtex": "@inproceedings{\nFey2020Deep,\ntitle={Deep Graph Matching Consensus},\nauthor={Matthias Fey and Jan E. Lenssen and Christopher Morris and Jonathan Masci and Nils M. Kriege},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=HyeJf1HKvS}\n}", "original_pdf": "/attachment/88621ce0dea2530c489083c1d2935c8d277a0f8c.pdf", "title": "Deep Graph Matching Consensus", "pdf": "/pdf/b1adda2ead3cb872e2efeba22d8c766c02eebe8d.pdf", "abstract": "This work presents a two-stage neural architecture for learning and refining structural correspondences between graphs. First, we use localized node embeddings computed by a graph neural network to obtain an initial ranking of soft correspondences between nodes. Secondly, we employ synchronous message passing networks to iteratively re-rank the soft correspondences to reach a matching consensus in local neighborhoods between graphs. We show, theoretically and empirically, that our message passing scheme computes a well-founded measure of consensus for corresponding neighborhoods, which is then used to guide the iterative re-ranking process. Our purely local and sparsity-aware architecture scales well to large, real-world inputs while still being able to recover global correspondences consistently. 
We demonstrate the practical effectiveness of our method on real-world tasks from the fields of computer vision and entity alignment between knowledge graphs, on which we improve upon the current state-of-the-art.", "full_presentation_video": ""}, "forum": "HyeJf1HKvS", "id": "HyeJf1HKvS"}, "rJgUfTEYvH": {"content": {"appendix": "", "TL;DR": "We demonstrate that flow-based generative models offer a viable and competitive approach to generative modeling of video.", "keywords": ["autoregressive models", "generation", "generative models", "optimization", "video generation", "video prediction"], "paperhash": "kumar|videoflow_a_conditional_flowbased_model_for_stochastic_video_generation", "code": "https://storage.googleapis.com/iclr_code/videoflow_code.zip", "spotlight_video": "", "authorids": ["manojkumarsivaraj334@gmail.com", "mb2@uiuc.edu", "dumitru@google.com", "cbfinn@eecs.berkeley.edu", "slevine@google.com", "laurentdinh@google.com", "d.p.kingma@uva.nl"], "poster": "", "slides": "", "authors": ["Manoj Kumar", "Mohammad Babaeizadeh", "Dumitru Erhan", "Chelsea Finn", "Sergey Levine", "Laurent Dinh", "Durk Kingma"], "_bibtex": "@inproceedings{\nKumar2020VideoFlow:,\ntitle={VideoFlow: A Conditional Flow-Based Model for Stochastic Video Generation},\nauthor={Manoj Kumar and Mohammad Babaeizadeh and Dumitru Erhan and Chelsea Finn and Sergey Levine and Laurent Dinh and Durk Kingma},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=rJgUfTEYvH}\n}", "original_pdf": "/attachment/be409170e023d9741b593fcfe6327ceb4005a18b.pdf", "title": "VideoFlow: A Conditional Flow-Based Model for Stochastic Video Generation", "pdf": "/pdf/f0048f33cb4788f547460b93e23a9fa5253a59ac.pdf", "abstract": "Generative models that can model and predict sequences of future events can, in principle, learn to capture complex real-world phenomena, such as physical interactions. However, a central challenge in video prediction is that the future is highly uncertain: a sequence of past observations of events can imply many possible futures. Although a number of recent works have studied probabilistic models that can represent uncertain futures, such models are either extremely expensive computationally as in the case of pixel-level autoregressive models, or do not directly optimize the likelihood of the data. To our knowledge, our work is the first to propose multi-frame video prediction with normalizing flows, which allows for direct optimization of the data likelihood, and produces high-quality stochastic predictions. 
We describe an approach for modeling the latent space dynamics, and demonstrate that flow-based generative models offer a viable and competitive approach to generative modeling of video.", "full_presentation_video": ""}, "forum": "rJgUfTEYvH", "id": "rJgUfTEYvH"}, "r1g87C4KwB": {"content": {"appendix": "", "TL;DR": "In the early phase of training of deep neural networks there exists a \"break-even point\" which determines properties of the entire optimization trajectory.", "keywords": ["batch normalization", "generalization", "gradient descent", "learning rate", "loss surface", "optimization"], "paperhash": "jastrzebski|the_breakeven_point_on_optimization_trajectories_of_deep_neural_networks", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Stanislaw Jastrzebski", "Maciej Szymczak", "Stanislav Fort", "Devansh Arpit", "Jacek Tabor", "Kyunghyun Cho*", "Krzysztof Geras*"], "_bibtex": "@inproceedings{\nJastrzebski2020The,\ntitle={The Break-Even Point on Optimization Trajectories of Deep Neural Networks},\nauthor={Stanislaw Jastrzebski and Maciej Szymczak and Stanislav Fort and Devansh Arpit and Jacek Tabor and Kyunghyun Cho* and Krzysztof Geras*},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=r1g87C4KwB}\n}", "authorids": ["staszek.jastrzebski@gmail.com", "msz93@o2.pl", "stanislav.fort@gmail.com", "devansharpit@gmail.com", "jcktbr@gmail.com", "kyunghyun.cho@nyu.edu", "k.j.geras@nyu.edu"], "title": "The Break-Even Point on Optimization Trajectories of Deep Neural Networks", "original_pdf": "/attachment/61f4b1dbcc5542757e57b73e97b43dc69de1d9ac.pdf", "pdf": "/pdf/614bb41ca81c9195f40d67ce8f058d3a88ee87e2.pdf", "abstract": "The early phase of training of deep neural networks is critical for their final performance. In this work, we study how the hyperparameters of stochastic gradient descent (SGD) used in the early phase of training affect the rest of the optimization trajectory. We argue for the existence of the \"``break-even\" point on this trajectory, beyond which the curvature of the loss surface and noise in the gradient are implicitly regularized by SGD. In particular, we demonstrate on multiple classification tasks that using a large learning rate in the initial phase of training reduces the variance of the gradient, and improves the conditioning of the covariance of gradients. These effects are beneficial from the optimization perspective and become visible after the break-even point. Complementing prior work, we also show that using a low learning rate results in bad conditioning of the loss surface even for a neural network with batch normalization layers. In short, our work shows that key properties of the loss surface are strongly influenced by SGD in the early phase of training. We argue that studying the impact of the identified effects on generalization is a promising future direction.", "full_presentation_video": ""}, "forum": "r1g87C4KwB", "id": "r1g87C4KwB"}, "B1xMEerYvB": {"content": {"appendix": "", "TL;DR": "We introduce a class of n-player games suited to gradient-based methods.", "keywords": ["adversarial", "game theory", "gradient descent", "optimization"], "paperhash": "balduzzi|smooth_markets_a_basic_mechanism_for_organizing_gradientbased_learners", "spotlight_video": "", "poster": "", "slides": "", "authors": ["David Balduzzi", "Wojciech M. 
Czarnecki", "Tom Anthony", "Ian Gemp", "Edward Hughes", "Joel Leibo", "Georgios Piliouras", "Thore Graepel"], "_bibtex": "@inproceedings{\nBalduzzi2020Smooth,\ntitle={Smooth markets: A basic mechanism for organizing gradient-based learners},\nauthor={David Balduzzi and Wojciech M. Czarnecki and Tom Anthony and Ian Gemp and Edward Hughes and Joel Leibo and Georgios Piliouras and Thore Graepel},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=B1xMEerYvB}\n}", "authorids": ["dbalduzzi@google.com", "lejlot@google.com", "edwardhughes@google.com", "jzl@google.com", "imgemp@google.com", "twa@google.com", "georgios.piliouras@gmail.com", "thore@google.com"], "title": "Smooth markets: A basic mechanism for organizing gradient-based learners", "original_pdf": "/attachment/42288e49980f7b30c43de7003ed1e9a6bd1fdc59.pdf", "pdf": "/pdf/ab64e52dccd7ce6a4df0e4bebb7b3d342e651c6f.pdf", "abstract": "With the success of modern machine learning, it is becoming increasingly important to understand and control how learning algorithms interact. Unfortunately, negative results from game theory show there is little hope of understanding or controlling general n-player games. We therefore introduce smooth markets (SM-games), a class of n-player games with pairwise zero sum interactions. SM-games codify a common design pattern in machine learning that includes some GANs, adversarial training, and other recent algorithms. We show that SM-games are amenable to analysis and optimization using first-order methods.", "full_presentation_video": ""}, "forum": "B1xMEerYvB", "id": "B1xMEerYvB"}, "r1xCMyBtPS": {"content": {"appendix": "", "TL;DR": "We propose procedures for evaluating and strengthening contextual embedding alignment and show that they both improve multilingual BERT's zero-shot XNLI transfer and provide useful insights into the model.", "keywords": ["nlp", "transfer learning", "transformer", "word embedding", "word embeddings"], "paperhash": "cao|multilingual_alignment_of_contextual_word_representations", "spotlight_video": "", "poster": "", "slides": "", "abstract": "We propose procedures for evaluating and strengthening contextual embedding alignment and show that they are useful in analyzing and improving multilingual BERT. In particular, after our proposed alignment procedure, BERT exhibits significantly improved zero-shot performance on XNLI compared to the base model, remarkably matching pseudo-fully-supervised translate-train models for Bulgarian and Greek. Further, to measure the degree of alignment, we introduce a contextual version of word retrieval and show that it correlates well with downstream zero-shot transfer. Using this word retrieval task, we also analyze BERT and find that it exhibits systematic deficiencies, e.g. worse alignment for open-class parts-of-speech and word pairs written in different scripts, that are corrected by the alignment procedure. 
These results support contextual alignment as a useful concept for understanding large multilingual pre-trained models.", "_bibtex": "@inproceedings{\nCao2020Multilingual,\ntitle={Multilingual Alignment of Contextual Word Representations},\nauthor={Steven Cao and Nikita Kitaev and Dan Klein},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=r1xCMyBtPS}\n}", "authorids": ["stevencao@berkeley.edu", "kitaev@berkeley.edu", "klein@berkeley.edu"], "title": "Multilingual Alignment of Contextual Word Representations", "authors": ["Steven Cao", "Nikita Kitaev", "Dan Klein"], "original_pdf": "/attachment/6f4a7ca32c03ad01f89b9728bfd30ca8c5cd2ffa.pdf", "pdf": "/pdf/db0d538420ea1471c16cc4c3e0251ab3e34bfb22.pdf", "full_presentation_video": ""}, "forum": "r1xCMyBtPS", "id": "r1xCMyBtPS"}, "SJlKrkSFPH": {"content": {"appendix": "", "TL;DR": "Develop a general framework to establish certified robustness of ML models against various classes of adversarial perturbations", "keywords": ["adversarial", "perturbation", "randomized smoothing", "robustness", "verification"], "paperhash": "dvijotham|a_framework_for_robustness_certification_of_smoothed_classifiers_using_fdivergences", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Krishnamurthy (Dj) Dvijotham", "Jamie Hayes", "Borja Balle", "Zico Kolter", "Chongli Qin", "Andras Gyorgy", "Kai Xiao", "Sven Gowal", "Pushmeet Kohli"], "_bibtex": "@inproceedings{\nDvijotham2020A,\ntitle={A FRAMEWORK FOR ROBUSTNESS CERTIFICATION OF SMOOTHED CLASSIFIERS USING F-DIVERGENCES},\nauthor={Krishnamurthy (Dj) Dvijotham and Jamie Hayes and Borja Balle and Zico Kolter and Chongli Qin and Andras Gyorgy and Kai Xiao and Sven Gowal and Pushmeet Kohli},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=SJlKrkSFPH}\n}", "authorids": ["dvij@google.com", "j.hayes@cs.ucl.ac.uk", "bballe@google.com", "zkolter@cs.cmu.edu", "chongliqin@google.com", "agyorgy@google.com", "kaix@mit.edu", "sgowal@google.com", "pushmeet@google.com"], "title": "A FRAMEWORK FOR ROBUSTNESS CERTIFICATION OF SMOOTHED CLASSIFIERS USING F-DIVERGENCES", "original_pdf": "/attachment/4a7b75373f2ba1374547cb1ff42c0df98f6be1ce.pdf", "pdf": "/pdf/5c744888eae162e715f7635b283ff4545aee828d.pdf", "abstract": "Formal verification techniques that compute provable guarantees on properties of machine learning models, like robustness to norm-bounded adversarial perturbations, have yielded impressive results. Although most techniques developed so far require knowledge of the architecture of the machine learning model and remain hard to scale to complex prediction pipelines, the method of randomized smoothing has been shown to overcome many of these obstacles. By requiring only black-box access to the underlying model, randomized smoothing scales to large architectures and is agnostic to the internals of the network. However, past work on randomized smoothing has focused on restricted classes of smoothing measures or perturbations (like Gaussian or discrete) and has only been able to prove robustness with respect to simple norm bounds. In this paper we introduce a general framework for proving robustness properties of smoothed machine learning models in the black-box setting. Specifically, we extend randomized smoothing procedures to handle arbitrary smoothing measures and prove robustness of the smoothed classifier by using f-divergences. 
Our methodology improves upon the state of the art in terms of computation time or certified robustness on several image classification tasks and an audio classification task, with respect to several classes of adversarial perturbations. ", "full_presentation_video": ""}, "forum": "SJlKrkSFPH", "id": "SJlKrkSFPH"}, "Bke89JBtvB": {"content": {"appendix": "", "TL;DR": "A method that trains large capacity neural networks with significantly improved accuracy and lower dynamic computational cost", "keywords": ["capacity", "image classification", "imagenet", "semantic segmentation"], "paperhash": "bejnordi|batchshaping_for_learning_conditional_channel_gated_networks", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Babak Ehteshami Bejnordi", "Tijmen Blankevoort", "Max Welling"], "_bibtex": "@inproceedings{\nBejnordi2020Batch-shaping,\ntitle={Batch-shaping for learning conditional channel gated networks},\nauthor={Babak Ehteshami Bejnordi and Tijmen Blankevoort and Max Welling},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=Bke89JBtvB}\n}", "authorids": ["behtesha@qti.qualcomm.com", "tijmen@qti.qualcomm.com", "mwelling@qti.qualcomm.com"], "title": "Batch-shaping for learning conditional channel gated networks", "original_pdf": "/attachment/8da3b203200ced58e1c92dae81f92e8ce66d7cf8.pdf", "pdf": "/pdf/9420f58cf55133d527b356ab5bd899bcb5c1d763.pdf", "abstract": "We present a method that trains large capacity neural networks with significantly improved accuracy and lower dynamic computational cost. This is achieved by gating the deep-learning architecture on a fine-grained-level. Individual convolutional maps are turned on/off conditionally on features in the network. To achieve this, we introduce a new residual block architecture that gates convolutional channels in a fine-grained manner. We also introduce a generally applicable tool batch-shaping that matches the marginal aggregate posteriors of features in a neural network to a pre-specified prior distribution. We use this novel technique to force gates to be more conditional on the data. We present results on CIFAR-10 and ImageNet datasets for image classification, and Cityscapes for semantic segmentation. Our results show that our method can slim down large architectures conditionally, such that the average computational cost on the data is on par with a smaller architecture, but with higher accuracy. In particular, on ImageNet, our ResNet50 and ResNet34 gated networks obtain 74.60% and 72.55% top-1 accuracy compared to the 69.76% accuracy of the baseline ResNet18 model, for similar complexity. We also show that the resulting networks automatically learn to use more features for difficult examples and fewer features for simple examples.", "full_presentation_video": ""}, "forum": "Bke89JBtvB", "id": "Bke89JBtvB"}, "SklTQCNtvS": {"content": {"appendix": "", "keywords": ["adversarial", "imagenet", "optimization", "perturbation", "robustness"], "paperhash": "cheng|signopt_a_queryefficient_hardlabel_adversarial_attack", "code": "https://github.com/cmhcbb/attackbox", "spotlight_video": "", "poster": "", "slides": "", "abstract": "We study the most practical problem setup for evaluating adversarial robustness of a machine learning system with limited access: the hard-label black-box attack setting for generating adversarial examples, where limited model queries are allowed and only the decision is provided to a queried data input. 
Several algorithms have been proposed for this problem but they typically require huge amount (>20,000) of queries for attacking one example. Among them, one of the state-of-the-art approaches (Cheng et al., 2019) showed that hard-label attack can be modeled as an optimization problem where the objective function can be evaluated by binary search with additional model queries, thereby a zeroth order optimization algorithm can be applied. In this paper, we adopt the same optimization formulation but propose to directly estimate the sign of gradient at any direction instead of the gradient itself, which enjoys the benefit of single query. \nUsing this single query oracle for retrieving sign of directional derivative, we develop a novel query-efficient Sign-OPT approach for hard-label black-box attack. We provide a convergence analysis of the new algorithm and conduct experiments on several models on MNIST, CIFAR-10 and ImageNet. \nWe find that Sign-OPT attack consistently requires 5X to 10X fewer queries when compared to the current state-of-the-art approaches, and usually converges to an adversarial example with smaller perturbation. ", "_bibtex": "@inproceedings{\nCheng2020Sign-OPT:,\ntitle={Sign-OPT: A Query-Efficient Hard-label Adversarial Attack},\nauthor={Minhao Cheng and Simranjit Singh and Patrick H. Chen and Pin-Yu Chen and Sijia Liu and Cho-Jui Hsieh},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=SklTQCNtvS}\n}", "authorids": ["mhcheng@ucla.edu", "simranjit@cs.ucla.edu", "patrickchen@ucla.edu", "pin-yu.chen@ibm.com", "sijia.liu@ibm.com", "chohsieh@cs.ucla.edu"], "title": "Sign-OPT: A Query-Efficient Hard-label Adversarial Attack", "authors": ["Minhao Cheng", "Simranjit Singh", "Patrick H. Chen", "Pin-Yu Chen", "Sijia Liu", "Cho-Jui Hsieh"], "original_pdf": "/attachment/b1b4a06f0cafab6c9cc329939c34987d9acd9796.pdf", "pdf": "/pdf/b0e100de9d582c690cc356f1475cfd56649b84a4.pdf", "full_presentation_video": ""}, "forum": "SklTQCNtvS", "id": "SklTQCNtvS"}, "H1gfFaEYDS": {"content": {"appendix": "", "TL;DR": "We propose a method for computing adversarially robust representations in an entirely unsupervised way.", "keywords": ["adversarial", "autoencoder", "regularization", "unsupervised", "variational autoencoders", "variational inference"], "paperhash": "cemgil|adversarially_robust_representations_with_smooth_encoders", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Taylan Cemgil", "Sumedh Ghaisas", "Krishnamurthy (Dj) Dvijotham", "Pushmeet Kohli"], "_bibtex": "@inproceedings{\nCemgil2020Adversarially,\ntitle={Adversarially Robust Representations with Smooth Encoders},\nauthor={Taylan Cemgil and Sumedh Ghaisas and Krishnamurthy (Dj) Dvijotham and Pushmeet Kohli},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=H1gfFaEYDS}\n}", "authorids": ["taylancemgil@google.com", "sumedhg@google.com", "dvij@google.com", "pushmeet@google.com"], "title": "Adversarially Robust Representations with Smooth Encoders", "original_pdf": "/attachment/64ab4e9f100d17ad503cde1b67d98e06a590ee1f.pdf", "pdf": "/pdf/028b6b32416c54e7f696c014e41d55866a6752a6.pdf", "abstract": "This paper studies the undesired phenomena of over-sensitivity of representations learned by deep networks to semantically-irrelevant changes in data. We identify a cause for this shortcoming in the classical Variational Auto-encoder (VAE) objective, the evidence lower bound (ELBO). 
We show that the ELBO fails to control the behaviour of the encoder out of the support of the empirical data distribution and this behaviour of the VAE can lead to extreme errors in the learned representation. This is a key hurdle in the effective use of representations for data-efficient learning and transfer. To address this problem, we propose to augment the data with specifications that enforce insensitivity of the representation with respect to families of transformations. To incorporate these specifications, we propose a regularization method that is based on a selection mechanism that creates a fictive data point by explicitly perturbing an observed true data point. For certain choices of parameters, our formulation naturally leads to the minimization of the entropy regularized Wasserstein distance between representations. We illustrate our approach on standard datasets and experimentally show that significant improvements in the downstream adversarial accuracy can be achieved by learning robust representations completely in an unsupervised manner, without a reference to a particular downstream task and without a costly supervised adversarial training procedure. \n", "full_presentation_video": ""}, "forum": "H1gfFaEYDS", "id": "H1gfFaEYDS"}, "Byl5NREFDr": {"content": {"appendix": "", "TL;DR": "Outputs of modern NLP APIs on nonsensical text provide strong signals about model internals, allowing adversaries to steal the APIs.", "keywords": ["nlp", "question answering", "security", "transfer learning", "transformer"], "paperhash": "krishna|thieves_on_sesame_street_model_extraction_of_bertbased_apis", "code": "https://github.com/google-research/language/tree/master/language/bert_extraction", "spotlight_video": "", "authorids": ["kalpesh@cs.umass.edu", "gtomar@google.com", "aparikh@google.com", "papernot@google.com", "miyyer@cs.umass.edu"], "poster": "", "slides": "", "authors": ["Kalpesh Krishna", "Gaurav Singh Tomar", "Ankur P. Parikh", "Nicolas Papernot", "Mohit Iyyer"], "_bibtex": "@inproceedings{\nKrishna2020Thieves,\ntitle={Thieves on Sesame Street! Model Extraction of BERT-based APIs},\nauthor={Kalpesh Krishna and Gaurav Singh Tomar and Ankur P. Parikh and Nicolas Papernot and Mohit Iyyer},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=Byl5NREFDr}\n}", "original_pdf": "/attachment/e6b9a22fab94d189f972720cf7b1b164082652cb.pdf", "title": "Thieves on Sesame Street! Model Extraction of BERT-based APIs", "pdf": "/pdf/0072594e40e42b1d603a9ab8dca04f79de577fcb.pdf", "abstract": "We study the problem of model extraction in natural language processing, in which an adversary with only query access to a victim model attempts to reconstruct a local copy of that model. Assuming that both the adversary and victim model fine-tune a large pretrained language model such as BERT (Devlin et al., 2019), we show that the adversary does not need any real training data to successfully mount the attack. In fact, the attacker need not even use grammatical or semantically meaningful queries: we show that random sequences of words coupled with task-specific heuristics form effective queries for model extraction on a diverse set of NLP tasks, including natural language inference and question answering. 
Our work thus highlights an exploit only made feasible by the shift towards transfer learning methods within the NLP community: for a query budget of a few hundred dollars, an attacker can extract a model that performs only slightly worse than the victim model. Finally, we study two defense strategies against model extraction\u2014membership classification and API watermarking\u2014which while successful against some adversaries can also be circumvented by more clever ones.", "full_presentation_video": ""}, "forum": "Byl5NREFDr", "id": "Byl5NREFDr"}, "B1e9Y2NYvS": {"content": {"appendix": "", "keywords": ["adversarial", "attention", "cnn", "optimization", "perturbation", "robustness"], "paperhash": "yan|on_robustness_of_neural_ordinary_differential_equations", "spotlight_video": "", "poster": "", "slides": "", "abstract": " Neural ordinary differential equations (ODEs) have been attracting increasing attention in various research domains recently. There have been some works studying optimization issues and approximation capabilities of neural ODEs, but their robustness is still yet unclear. In this work, we fill this important gap by exploring robustness properties of neural ODEs both empirically and theoretically. We first present an empirical study on the robustness of the neural ODE-based networks (ODENets) by exposing them to inputs with various types of perturbations and subsequently investigating the changes of the corresponding outputs. In contrast to conventional convolutional neural networks (CNNs), we find that the ODENets are more robust against both random Gaussian perturbations and adversarial attack examples. We then provide an insightful understanding of this phenomenon by exploiting a certain desirable property of the flow of a continuous-time ODE, namely that integral curves are non-intersecting. Our work suggests that, due to their intrinsic robustness, it is promising to use neural ODEs as a basic block for building robust deep network models. To further enhance the robustness of vanilla neural ODEs, we propose the time-invariant steady neural ODE (TisODE), which regularizes the flow on perturbed data via the time-invariant property and the imposition of a steady-state constraint. 
We show that the TisODE method outperforms vanilla neural ODEs and can also work in conjunction with other state-of-the-art architectural methods to build more robust deep networks.", "_bibtex": "@inproceedings{\nYAN2020On,\ntitle={On Robustness of Neural Ordinary Differential Equations},\nauthor={Hanshu YAN and Jiawei DU and Vincent TAN and Jiashi FENG},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=B1e9Y2NYvS}\n}", "authorids": ["hanshu.yan@u.nus.edu", "dujiawei@u.nus.edu", "vtan@nus.edu.sg", "elefjia@nus.edu.sg"], "title": "On Robustness of Neural Ordinary Differential Equations", "authors": ["Hanshu YAN", "Jiawei DU", "Vincent TAN", "Jiashi FENG"], "original_pdf": "/attachment/f8c31158e86f50f9937ea5f20f07cdddf34ac904.pdf", "pdf": "/pdf/c8bd5b9e3af463f9c827d58a601d9f465063083a.pdf", "full_presentation_video": ""}, "forum": "B1e9Y2NYvS", "id": "B1e9Y2NYvS"}, "HJlWWJSFDH": {"content": {"appendix": "", "TL;DR": "We develop a strategy for pre-training Graph Neural Networks (GNNs) and systematically study its effectiveness on multiple datasets, GNN architectures, and diverse downstream tasks.", "keywords": ["generalization", "graph networks", "nlp", "pre training", "transfer learning"], "paperhash": "hu|strategies_for_pretraining_graph_neural_networks", "code": "https://github.com/snap-stanford/pretrain-gnns/", "spotlight_video": "", "authorids": ["weihuahu@stanford.edu", "liubowen@stanford.edu", "joegomes@stanford.edu", "marinka@cs.stanford.edu", "pliang@cs.stanford.edu", "pande@stanford.edu", "jure@cs.stanford.edu"], "poster": "", "slides": "", "authors": ["Weihua Hu*", "Bowen Liu*", "Joseph Gomes", "Marinka Zitnik", "Percy Liang", "Vijay Pande", "Jure Leskovec"], "_bibtex": "@inproceedings{\nHu*2020Strategies,\ntitle={Strategies for Pre-training Graph Neural Networks},\nauthor={Weihua Hu* and Bowen Liu* and Joseph Gomes and Marinka Zitnik and Percy Liang and Vijay Pande and Jure Leskovec},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=HJlWWJSFDH}\n}", "original_pdf": "/attachment/341e619d4bc12981f88a79b9dc402d348eff11e1.pdf", "title": "Strategies for Pre-training Graph Neural Networks", "pdf": "/pdf/0572544c8eb8ee9096e54a46c51dc6b6abf3225a.pdf", "abstract": "Many applications of machine learning require a model to make accurate predictions on test examples that are distributionally different from training ones, while task-specific labels are scarce during training. An effective approach to this challenge is to pre-train a model on related tasks where data is abundant, and then fine-tune it on a downstream task of interest. While pre-training has been effective in many language and vision domains, it remains an open question how to effectively use pre-training on graph datasets. In this paper, we develop a new strategy and self-supervised methods for pre-training Graph Neural Networks (GNNs). The key to the success of our strategy is to pre-train an expressive GNN at the level of individual nodes as well as entire graphs so that the GNN can learn useful local and global representations simultaneously. We systematically study pre-training on multiple graph classification datasets. We find that na\u00efve strategies, which pre-train GNNs at the level of either entire graphs or individual nodes, give limited improvement and can even lead to negative transfer on many downstream tasks. 
In contrast, our strategy avoids negative transfer and improves generalization significantly across downstream tasks, leading up to 9.4% absolute improvements in ROC-AUC over non-pre-trained models and achieving state-of-the-art performance for molecular property prediction and protein function prediction.", "full_presentation_video": ""}, "forum": "HJlWWJSFDH", "id": "HJlWWJSFDH"}, "BJlQtJSKDB": {"content": {"appendix": "", "TL;DR": "We developed an effective parallel UCT algorithm that achieves linear speedup and suffers negligible performance loss.", "keywords": ["reinforcement learning"], "paperhash": "liu|watch_the_unobserved_a_simple_approach_to_parallelizing_monte_carlo_tree_search", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Anji Liu", "Jianshu Chen", "Mingze Yu", "Yu Zhai", "Xuewen Zhou", "Ji Liu"], "_bibtex": "@inproceedings{\nLiu2020Watch,\ntitle={Watch the Unobserved: A Simple Approach to Parallelizing Monte Carlo Tree Search},\nauthor={Anji Liu and Jianshu Chen and Mingze Yu and Yu Zhai and Xuewen Zhou and Ji Liu},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=BJlQtJSKDB}\n}", "authorids": ["anjiliu219@gmail.com", "chenjianshu@gmail.com", "yumingze@kuaishou.com", "zhaiyu@kuaishou.com", "zhouxuewen@kuaishou.com", "ji.liu.uwisc@gmail.com"], "title": "Watch the Unobserved: A Simple Approach to Parallelizing Monte Carlo Tree Search", "original_pdf": "/attachment/b8540f8f53668dfb3a0db36905ca0a3685e89428.pdf", "pdf": "/pdf/a348732b5ca8e27b8fe0b211c92ec4d1b6defbae.pdf", "abstract": "Monte Carlo Tree Search (MCTS) algorithms have achieved great success on many challenging benchmarks (e.g., Computer Go). However, they generally require a large number of rollouts, making their applications costly. Furthermore, it is also extremely challenging to parallelize MCTS due to its inherent sequential nature: each rollout heavily relies on the statistics (e.g., node visitation counts) estimated from previous simulations to achieve an effective exploration-exploitation tradeoff. In spite of these difficulties, we develop an algorithm, WU-UCT, to effectively parallelize MCTS, which achieves linear speedup and exhibits only limited performance loss with an increasing number of workers. The key idea in WU-UCT is a set of statistics that we introduce to track the number of on-going yet incomplete simulation queries (named as unobserved samples). These statistics are used to modify the UCT tree policy in the selection steps in a principled manner to retain effective exploration-exploitation tradeoff when we parallelize the most time-consuming expansion and simulation steps. Experiments on a proprietary benchmark and the Atari Game benchmark demonstrate the linear speedup and the superior performance of WU-UCT comparing to existing techniques.", "full_presentation_video": ""}, "forum": "BJlQtJSKDB", "id": "BJlQtJSKDB"}, "H1ldzA4tPr": {"content": {"appendix": "", "TL;DR": "Learning compositional Koopman operators for efficient system identification and model-based control.", "keywords": ["compositionality", "generalization", "graph networks"], "paperhash": "li|learning_compositional_koopman_operators_for_modelbased_control", "spotlight_video": "", "poster": "", "slides": "", "abstract": "Finding an embedding space for a linear approximation of a nonlinear dynamical system enables efficient system identification and control synthesis. 
The Koopman operator theory lays the foundation for identifying the nonlinear-to-linear coordinate transformations with data-driven methods. Recently, researchers have proposed to use deep neural networks as a more expressive class of basis functions for calculating the Koopman operators. These approaches, however, assume a fixed dimensional state space; they are therefore not applicable to scenarios with a variable number of objects. In this paper, we propose to learn compositional Koopman operators, using graph neural networks to encode the state into object-centric embeddings and using a block-wise linear transition matrix to regularize the shared structure across objects. The learned dynamics can quickly adapt to new environments of unknown physical parameters and produce control signals to achieve a specified goal. Our experiments on manipulating ropes and controlling soft robots show that the proposed method has better efficiency and generalization ability than existing baselines.", "_bibtex": "@inproceedings{\nLi2020Learning,\ntitle={Learning Compositional Koopman Operators for Model-Based Control},\nauthor={Yunzhu Li and Hao He and Jiajun Wu and Dina Katabi and Antonio Torralba},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=H1ldzA4tPr}\n}", "authorids": ["liyunzhu@mit.edu", "haohe@mit.edu", "jiajunwu.cs@gmail.com", "dina@csail.mit.edu", "torralba@csail.mit.edu"], "title": "Learning Compositional Koopman Operators for Model-Based Control", "authors": ["Yunzhu Li", "Hao He", "Jiajun Wu", "Dina Katabi", "Antonio Torralba"], "original_pdf": "/attachment/e485f634ee50e8d2173d2883d1a8329f7deceb9c.pdf", "pdf": "/pdf/ec6e01ca6a5b56060b6026ec9c9e2cc66ab0be0d.pdf", "full_presentation_video": ""}, "forum": "H1ldzA4tPr", "id": "H1ldzA4tPr"}, "HJx-3grYDB": {"content": {"appendix": "", "keywords": ["multi agent reinforcement learning", "mutual information", "reinforcement learning", "scalability"], "paperhash": "wang|learning_nearly_decomposable_value_functions_via_communication_minimization", "code": "https://github.com/TonghanWang/NDQ", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Tonghan Wang*", "Jianhao Wang*", "Chongyi Zheng", "Chongjie Zhang"], "_bibtex": "@inproceedings{\nWang*2020Learning,\ntitle={Learning Nearly Decomposable Value Functions Via Communication Minimization},\nauthor={Tonghan Wang* and Jianhao Wang* and Chongyi Zheng and Chongjie Zhang},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=HJx-3grYDB}\n}", "authorids": ["tonghanwang1996@gmail.com", "1040594377@qq.com", "chongyeezheng@gmail.com", "chongjie@tsinghua.edu.cn"], "title": "Learning Nearly Decomposable Value Functions Via Communication Minimization", "original_pdf": "/attachment/b53e93e85a7b056f57d4a774cf6002d8f927a7c3.pdf", "pdf": "/pdf/1fd327b39cca477f8ce468645d5c704d3b104e0b.pdf", "abstract": "Reinforcement learning encounters major challenges in multi-agent settings, such as scalability and non-stationarity. Recently, value function factorization learning emerges as a promising way to address these challenges in collaborative multi-agent systems. However, existing methods have been focusing on learning fully decentralized value functions, which are not efficient for tasks requiring communication. 
To address this limitation, this paper presents a novel framework for learning nearly decomposable Q-functions (NDQ) via communication minimization, with which agents act on their own most of the time but occasionally send messages to other agents in order for effective coordination. This framework hybridizes value function factorization learning and communication learning by introducing two information-theoretic regularizers. These regularizers are maximizing mutual information between agents' action selection and communication messages while minimizing the entropy of messages between agents. We show how to optimize these regularizers in a way that is easily integrated with existing value function factorization methods such as QMIX. Finally, we demonstrate that, on the StarCraft unit micromanagement benchmark, our framework significantly outperforms baseline methods and allows us to cut off more than $80\\%$ of communication without sacrificing the performance. The videos of our experiments are available at https://sites.google.com/view/ndq.", "full_presentation_video": ""}, "forum": "HJx-3grYDB", "id": "HJx-3grYDB"}, "rkgyS0VFvr": {"content": {"appendix": "", "TL;DR": "We proposed a novel distributed backdoor attack on federated learning and show that it is not only more effective compared with standard centralized attacks, but also harder to be defended by existing robust FL methods", "keywords": ["adversarial", "distributed", "federated learning", "robustness"], "paperhash": "xie|dba_distributed_backdoor_attacks_against_federated_learning", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Chulin Xie", "Keli Huang", "Pin-Yu Chen", "Bo Li"], "_bibtex": "@inproceedings{\nXie2020DBA:,\ntitle={DBA: Distributed Backdoor Attacks against Federated Learning},\nauthor={Chulin Xie and Keli Huang and Pin-Yu Chen and Bo Li},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=rkgyS0VFvr}\n}", "authorids": ["chulinxie@zju.edu.cn", "nick_cooper@sjtu.edu.cn", "pin-yu.chen@ibm.com", "lbo@illinois.edu"], "title": "DBA: Distributed Backdoor Attacks against Federated Learning", "original_pdf": "/attachment/dc0f552db4cfc9d3d21e1f5c47f4c99b9521dca0.pdf", "pdf": "/pdf/f456b2770b704b48ff25e06e9197fe4dbe70504d.pdf", "abstract": "Backdoor attacks aim to manipulate a subset of training data by injecting adversarial triggers such that machine learning models trained on the tampered dataset will make arbitrarily (targeted) incorrect prediction on the testset with the same trigger embedded. While federated learning (FL) is capable of aggregating information provided by different parties for training a better model, its distributed learning methodology and inherently heterogeneous data distribution across parties may bring new vulnerabilities. In addition to recent centralized backdoor attacks on FL where each party embeds the same global trigger during training, we propose the distributed backdoor attack (DBA) --- a novel threat assessment framework developed by fully exploiting the distributed nature of FL. DBA decomposes a global trigger pattern into separate local patterns and embed them into the training set of different adversarial parties respectively. Compared to standard centralized backdoors, we show that DBA is substantially more persistent and stealthy against FL on diverse datasets such as finance and image data. 
We conduct extensive experiments to show that the attack success rate of DBA is significantly higher than that of centralized backdoors under different settings. Moreover, we find that distributed attacks are indeed more insidious, as DBA can evade two state-of-the-art robust FL algorithms against centralized backdoors. We also provide explanations for the effectiveness of DBA via feature visual interpretation and feature importance ranking.\nTo further explore the properties of DBA, we test the attack performance by varying different trigger factors, including local trigger variations (size, gap, and location), scaling factor in FL, data distribution, and poison ratio and interval. Our proposed DBA and thorough evaluation results shed light on characterizing the robustness of FL.", "full_presentation_video": ""}, "forum": "rkgyS0VFvr", "id": "rkgyS0VFvr"}, "SkgscaNYPS": {"content": {"appendix": "", "TL;DR": "Description of the limiting spectrum of the Hessian of the loss surface of DNNs in the infinite-width limit.", "keywords": ["deep learning theory", "gradient descent", "loss surface", "neural tangent kernel"], "paperhash": "jacot|the_asymptotic_spectrum_of_the_hessian_of_dnn_throughout_training", "spotlight_video": "", "poster": "", "slides": "", "abstract": "The dynamics of DNNs during gradient descent is described by the so-called Neural Tangent Kernel (NTK). In this article, we show that the NTK allows one to gain precise insight into the Hessian of the cost of DNNs: we obtain a full characterization of the asymptotics of the spectrum of the Hessian, at initialization and during training. ", "_bibtex": "@inproceedings{\nJacot2020The,\ntitle={The asymptotic spectrum of the Hessian of DNN throughout training},\nauthor={Arthur Jacot and Franck Gabriel and Clement Hongler},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=SkgscaNYPS}\n}", "authorids": ["arthur.jacot@epfl.ch", "franck.gabriel@epfl.ch", "clement.hongler@epfl.ch"], "title": "The asymptotic spectrum of the Hessian of DNN throughout training", "authors": ["Arthur Jacot", "Franck Gabriel", "Clement Hongler"], "original_pdf": "/attachment/6c3b8b689f9fb4c3ecc68937a37ce7d75bfc78c9.pdf", "pdf": "/pdf/191586952ff738b8955827a8e8f105d8d76372ca.pdf", "full_presentation_video": ""}, "forum": "SkgscaNYPS", "id": "SkgscaNYPS"}, "SkxQp1StDH": {"content": {"appendix": "", "TL;DR": "We propose a novel node embedding of directed graphs to statistical manifolds and analyze connections to divergence, geometry and efficient learning procedure.", "keywords": ["graph embedding", "information geometry", "unsupervised"], "paperhash": "funke|lowdimensional_statistical_manifold_embedding_of_directed_graphs", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Thorben Funke", "Tian Guo", "Alen Lancic", "Nino Antulov-Fantulin"], "_bibtex": "@inproceedings{\nFunke2020Low-dimensional,\ntitle={Low-dimensional statistical manifold embedding of directed graphs},\nauthor={Thorben Funke and Tian Guo and Alen Lancic and Nino Antulov-Fantulin},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=SkxQp1StDH}\n}", "authorids": ["fun@biba.uni-bremen.de", "tian.guo0980@gmail.com", "alen.lancic@math.hr", "nino.antulov@gess.ethz.ch"], "title": "Low-dimensional statistical manifold embedding of directed graphs", "original_pdf": "/attachment/cc4e8368ad45caa2bde7fb3f50aef6d105bf0694.pdf", "pdf": 
"/pdf/5505a3164f03ecb8799366aee7409994bdcf64b1.pdf", "abstract": "We propose a novel node embedding of directed graphs to statistical manifolds, which is based on a global minimization of pairwise relative entropy and graph geodesics in a non-linear way. Each node is encoded with a probability density function over a measurable space. Furthermore, we analyze the connection of the geometrical properties of such embedding and their efficient learning procedure. Extensive experiments show that our proposed embedding is better preserving the global geodesic information of graphs, as well as outperforming existing embedding models on directed graphs in a variety of evaluation metrics, in an unsupervised setting.", "full_presentation_video": ""}, "forum": "SkxQp1StDH", "id": "SkxQp1StDH"}, "HJenn6VFvB": {"content": {"appendix": "", "TL;DR": "We introduce a class of generative models that reliably learn Hamiltonian dynamics from high-dimensional observations. The learnt Hamiltonian can be applied to sequence modeling or as a normalising flow.", "keywords": ["generative models", "reinforcement learning", "rnn"], "paperhash": "botev|hamiltonian_generative_networks", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Peter Toth", "Danilo J. Rezende", "Andrew Jaegle", "Sébastien Racanière", "Aleksandar Botev", "Irina Higgins"], "_bibtex": "@inproceedings{\nBotev2020Hamiltonian,\ntitle={Hamiltonian Generative Networks},\nauthor={Peter Toth and Danilo J. Rezende and Andrew Jaegle and Sébastien Racanière and Aleksandar Botev and Irina Higgins},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=HJenn6VFvB}\n}", "authorids": ["botev@google.com", "irinah@google.com", "drewjaegle@google.com", "sracaniere@google.com", "danilor@google.com", "petertoth@google.com"], "title": "Hamiltonian Generative Networks", "original_pdf": "/attachment/021c7482cf980ef80b83eec7fa01d11090a5e641.pdf", "pdf": "/pdf/aff7b5eb43963e39c8330cd2fb8c9054c72286c7.pdf", "abstract": "The Hamiltonian formalism plays a central role in classical and quantum physics. Hamiltonians are the main tool for modelling the continuous time evolution of systems with conserved quantities, and they come equipped with many useful properties, like time reversibility and smooth interpolation in time. These properties are important for many machine learning problems - from sequence prediction to reinforcement learning and density modelling - but are not typically provided out of the box by standard tools such as recurrent neural networks. In this paper, we introduce the Hamiltonian Generative Network (HGN), the first approach capable of consistently learning Hamiltonian dynamics from high-dimensional observations (such as images) without restrictive domain assumptions. Once trained, we can use HGN to sample new trajectories, perform rollouts both forward and backward in time, and even speed up or slow down the learned dynamics. We demonstrate how a simple modification of the network architecture turns HGN into a powerful normalising flow model, called Neural Hamiltonian Flow (NHF), that uses Hamiltonian dynamics to model expressive densities. Hence, we hope that our work serves as a first practical demonstration of the value that the Hamiltonian formalism can bring to machine learning. 
More results and video evaluations are available at: http://tiny.cc/hgn", "full_presentation_video": ""}, "forum": "HJenn6VFvB", "id": "HJenn6VFvB"}, "HJeiDpVFPr": {"content": {"appendix": "", "TL;DR": "We propose novel neural network architectures, guaranteed to satisfy the triangle inequality, for purposes of (asymmetric) metric learning and modeling graph distances. ", "keywords": ["inductive bias", "metric learning", "reinforcement learning"], "paperhash": "pitis|an_inductive_bias_for_distances_neural_nets_that_respect_the_triangle_inequality", "spotlight_video": "", "poster": "", "slides": "", "abstract": "Distances are pervasive in machine learning. They serve as similarity measures, loss functions, and learning targets; it is said that a good distance measure solves a task. When defining distances, the triangle inequality has proven to be a useful constraint, both theoretically---to prove convergence and optimality guarantees---and empirically---as an inductive bias. Deep metric learning architectures that respect the triangle inequality rely, almost exclusively, on Euclidean distance in the latent space. Though effective, this fails to model two broad classes of subadditive distances, common in graphs and reinforcement learning: asymmetric metrics, and metrics that cannot be embedded into Euclidean space. To address these problems, we introduce novel architectures that are guaranteed to satisfy the triangle inequality. We prove our architectures universally approximate norm-induced metrics on $\\mathbb{R}^n$, and present a similar result for modified Input Convex Neural Networks. We show that our architectures outperform existing metric approaches when modeling graph distances and have a better inductive bias than non-metric approaches when training data is limited in the multi-goal reinforcement learning setting.\n", "_bibtex": "@inproceedings{\nPitis2020An,\ntitle={An Inductive Bias for Distances: Neural Nets that Respect the Triangle Inequality},\nauthor={Silviu Pitis and Harris Chan and Kiarash Jamali and Jimmy Ba},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=HJeiDpVFPr}\n}", "authorids": ["spitis@cs.toronto.edu", "hchan@cs.toronto.edu", "kiarash.jamali@mail.utoronto.ca", "jba@cs.toronto.edu"], "title": "An Inductive Bias for Distances: Neural Nets that Respect the Triangle Inequality", "authors": ["Silviu Pitis", "Harris Chan", "Kiarash Jamali", "Jimmy Ba"], "original_pdf": "/attachment/e5aa4fbf03c22363bf0315273cd25ecda465e718.pdf", "pdf": "/pdf/d14d420fe624bdaa1af95856989b14e65c7ac9da.pdf", "full_presentation_video": ""}, "forum": "HJeiDpVFPr", "id": "HJeiDpVFPr"}, "HJxyZkBKDr": {"content": {"appendix": "", "TL;DR": "A NAS benchmark applicable to almost any NAS algorithms.", "keywords": ["automl", "neural architecture search", "regularization"], "paperhash": "dong|nasbench201_extending_the_scope_of_reproducible_neural_architecture_search", "code": "https://github.com/D-X-Y/NAS-Bench-201", "spotlight_video": "", "authorids": ["xuanyi.dxy@gmail.com", "yi.yang@uts.edu.au"], "poster": "", "slides": "", "authors": ["Xuanyi Dong", "Yi Yang"], "_bibtex": "@inproceedings{\nDong2020NAS-Bench-201:,\ntitle={NAS-Bench-201: Extending the Scope of Reproducible Neural Architecture Search},\nauthor={Xuanyi Dong and Yi Yang},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=HJxyZkBKDr}\n}", "original_pdf": 
"/attachment/4c7370e0988266840e70339259a38a93b2faedd9.pdf", "title": "NAS-Bench-201: Extending the Scope of Reproducible Neural Architecture Search", "pdf": "/pdf/73272ec47892eefdbb8e7caf4780b7d0a2d6ef71.pdf", "abstract": "Neural architecture search (NAS) has achieved breakthrough success in a great number of applications in the past few years.\nIt could be time to take a step back and analyze the good and bad aspects in the field of NAS. A variety of algorithms search architectures under different search space. These searched architectures are trained using different setups, e.g., hyper-parameters, data augmentation, regularization. This raises a comparability problem when comparing the performance of various NAS algorithms. NAS-Bench-101 has shown success to alleviate this problem. In this work, we propose an extension to NAS-Bench-101: NAS-Bench-201 with a different search space, results on multiple datasets, and more diagnostic information. NAS-Bench-201 has a fixed search space and provides a unified benchmark for almost any up-to-date NAS algorithms. The design of our search space is inspired by the one used in the most popular cell-based searching algorithms, where a cell is represented as a directed acyclic graph. Each edge here is associated with an operation selected from a predefined operation set. For it to be applicable for all NAS algorithms, the search space defined in NAS-Bench-201 includes all possible architectures generated by 4 nodes and 5 associated operation options, which results in 15,625 neural cell candidates in total. The training log using the same setup and the performance for each architecture candidate are provided for three datasets. This allows researchers to avoid unnecessary repetitive training for selected architecture and focus solely on the search algorithm itself. The training time saved for every architecture also largely improves the efficiency of most NAS algorithms and presents a more computational cost friendly NAS community for a broader range of researchers. We provide additional diagnostic information such as fine-grained loss and accuracy, which can give inspirations to new designs of NAS algorithms. 
In further support of the proposed NAS-Bench-201, we have analyzed it from many aspects and benchmarked 10 recent NAS algorithms, verifying its applicability.", "full_presentation_video": ""}, "forum": "HJxyZkBKDr", "id": "HJxyZkBKDr"}, "HkxARkrFwB": {"content": {"appendix": "", "TL;DR": "We use ideas from quantum computing to propose word embeddings that utilize far fewer trainable parameters.", "keywords": ["memory", "nlp", "word embedding", "word embeddings"], "paperhash": "panahi|word2ket_spaceefficient_word_embeddings_inspired_by_quantum_entanglement", "code": "https://github.com/panaali/word2ket", "spotlight_video": "", "authorids": ["panahia@vcu.edu", "saeedis@vcu.edu", "tarodz@vcu.edu"], "poster": "", "slides": "", "authors": ["Aliakbar Panahi", "Seyran Saeedi", "Tom Arodz"], "_bibtex": "@inproceedings{\nPanahi2020word2ket:,\ntitle={word2ket: Space-efficient Word Embeddings inspired by Quantum Entanglement},\nauthor={Aliakbar Panahi and Seyran Saeedi and Tom Arodz},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=HkxARkrFwB}\n}", "original_pdf": "/attachment/621cda8e79321f4d376a16cf16b895d316f832d0.pdf", "title": "word2ket: Space-efficient Word Embeddings inspired by Quantum Entanglement", "pdf": "/pdf/f89bf14ec936a415e5f5ec3f21691dad77a0cee4.pdf", "abstract": "Deep learning natural language processing models often use vector word embeddings, such as word2vec or GloVe, to represent words. A discrete sequence of words can be much more easily integrated with downstream neural layers if it is represented as a sequence of continuous vectors. Also, semantic relationships between words, learned from a text corpus, can be encoded in the relative configurations of the embedding vectors. However, storing and accessing embedding vectors for all words in a dictionary requires a large amount of space, and may strain systems with limited GPU memory. Here, we use approaches inspired by quantum computing to propose two related methods, word2ket and word2ketXS, for storing the word embedding matrix during training and inference in a highly efficient way. Our approach achieves a hundred-fold or more reduction in the space required to store the embeddings with almost no relative drop in accuracy in practical natural language processing tasks.", "full_presentation_video": ""}, "forum": "HkxARkrFwB", "id": "HkxARkrFwB"}, "SklOUpEYvB": {"content": {"appendix": "", "keywords": ["autoencoder", "generative models", "nonlinear ica", "optimization", "representation learning"], "paperhash": "li|identifying_through_flows_for_recovering_latent_representations", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Shen Li", "Bryan Hooi", "Gim Hee Lee"], "_bibtex": "@inproceedings{\nLi2020Identifying,\ntitle={Identifying through Flows for Recovering Latent Representations},\nauthor={Shen Li and Bryan Hooi and Gim Hee Lee},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=SklOUpEYvB}\n}", "authorids": ["maths.shenli@gmail.com", "bhooi@comp.nus.edu.sg", "dcslgh@nus.edu.sg"], "title": "Identifying through Flows for Recovering Latent Representations", "original_pdf": "/attachment/b735164738b8d36eb6539c6a59adedfd9f147b3e.pdf", "pdf": "/pdf/86068293e74eae321daeca98487beb99215c5a4f.pdf", "abstract": "Identifiability, or recovery of the true latent representations from which the observed data originates, is de facto a fundamental goal of representation learning. 
Yet, most deep generative models do not address the question of identifiability, and thus fail to deliver on the promise of the recovery of the true latent sources that generate the observations. Recent work proposed identifiable generative modelling using variational autoencoders (iVAE) with a theory of identifiability. Due to the intractability of the KL divergence between the variational approximate posterior and the true posterior, however, iVAE has to maximize the evidence lower bound (ELBO) of the marginal likelihood, leading to suboptimal solutions in both theory and practice. In contrast, we propose an identifiable framework for estimating latent representations using a flow-based model (iFlow). Our approach directly maximizes the marginal likelihood, allowing for theoretical guarantees on identifiability, thereby dispensing with variational approximations. We derive its optimization objective in analytical form, making it possible to train iFlow in an end-to-end manner. Simulations on synthetic data validate the correctness and effectiveness of our proposed method and demonstrate its practical advantages over other existing methods.", "full_presentation_video": ""}, "forum": "SklOUpEYvB", "id": "SklOUpEYvB"}, "BJgMFxrYPB": {"content": {"appendix": "", "TL;DR": "We address the task of autonomous exploration and navigation using spatial affordance maps that can be learned in a self-supervised manner; these outperform classic geometric baselines while being more sample-efficient than contemporary RL algorithms", "keywords": ["navigation"], "paperhash": "qi|learning_to_move_with_affordance_maps", "code": "https://github.com/wqi/A2L", "spotlight_video": "", "authorids": ["wq@cs.cmu.edu", "raviteja.mullapudi@gmail.com", "saurabhg@illinois.edu", "deva@cs.cmu.edu"], "poster": "", "slides": "", "authors": ["William Qi", "Ravi Teja Mullapudi", "Saurabh Gupta", "Deva Ramanan"], "_bibtex": "@inproceedings{\nQi2020Learning,\ntitle={Learning to Move with Affordance Maps},\nauthor={William Qi and Ravi Teja Mullapudi and Saurabh Gupta and Deva Ramanan},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=BJgMFxrYPB}\n}", "original_pdf": "/attachment/d6e6da38c25c9989ab972156b624c9ba6bf646ad.pdf", "title": "Learning to Move with Affordance Maps", "pdf": "/pdf/ce58284ea0813c14cf1c563bf0fbefe6a736158b.pdf", "abstract": "The ability to autonomously explore and navigate a physical space is a fundamental requirement for virtually any mobile autonomous agent, from household robotic vacuums to autonomous vehicles. Traditional SLAM-based approaches for exploration and navigation largely focus on leveraging scene geometry, but fail to model dynamic objects (such as other agents) or semantic constraints (such as wet floors or doorways). Learning-based RL agents are an attractive alternative because they can incorporate both semantic and geometric information, but are notoriously sample inefficient, difficult to generalize to novel settings, and difficult to interpret. In this paper, we combine the best of both worlds with a modular approach that {\em learns} a spatial representation of a scene that is trained to be effective when coupled with traditional geometric planners. Specifically, we design an agent that learns to predict a spatial affordance map that elucidates what parts of a scene are navigable through active self-supervised experience gathering. 
In contrast to most simulation environments that assume a static world, we evaluate our approach in the VizDoom simulator, using large-scale randomly-generated maps containing a variety of dynamic actors and hazards. We show that learned affordance maps can be used to augment traditional approaches for both exploration and navigation, providing significant improvements in performance.", "full_presentation_video": ""}, "forum": "BJgMFxrYPB", "id": "BJgMFxrYPB"}, "ByxtC2VtPB": {"content": {"appendix": "", "TL;DR": "We exploit the global linearity of the mixup-trained models in inference to break the locality of the adversarial perturbations.", "keywords": ["adversarial", "adversarial attacks", "generalization", "perturbation", "robustness", "trustworthy machine learning"], "paperhash": "pang|mixup_inference_better_exploiting_mixup_to_defend_adversarial_attacks", "code": "https://github.com/P2333/Mixup-Inference", "spotlight_video": "", "authorids": ["pty17@mails.tsinghua.edu.cn", "kunxu.thu@gmail.com", "dcszj@mail.tsinghua.edu.cn"], "poster": "", "slides": "", "authors": ["Tianyu Pang*", "Kun Xu*", "Jun Zhu"], "_bibtex": "@inproceedings{\nPang*2020Mixup,\ntitle={Mixup Inference: Better Exploiting Mixup to Defend Adversarial Attacks},\nauthor={Tianyu Pang* and Kun Xu* and Jun Zhu},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=ByxtC2VtPB}\n}", "original_pdf": "/attachment/248f049c55dcf577f775b97d48d421c49e577e79.pdf", "title": "Mixup Inference: Better Exploiting Mixup to Defend Adversarial Attacks", "pdf": "/pdf/2d15cb14ab9e0914711fc1187b082beb9a98f928.pdf", "abstract": "It has been widely recognized that adversarial examples can be easily crafted to fool deep networks, which mainly root from the locally non-linear behavior nearby input examples. Applying mixup in training provides an effective mechanism to improve generalization performance and model robustness against adversarial perturbations, which introduces the globally linear behavior in-between training examples. However, in previous work, the mixup-trained models only passively defend adversarial attacks in inference by directly classifying the inputs, where the induced global linearity is not well exploited. Namely, since the locality of the adversarial perturbations, it would be more efficient to actively break the locality via the globality of the model predictions. Inspired by simple geometric intuition, we develop an inference principle, named mixup inference (MI), for mixup-trained models. MI mixups the input with other random clean samples, which can shrink and transfer the equivalent perturbation if the input is adversarial. 
Our experiments on CIFAR-10 and CIFAR-100 demonstrate that MI can further improve the adversarial robustness for models trained with mixup and its variants.", "full_presentation_video": ""}, "forum": "ByxtC2VtPB", "id": "ByxtC2VtPB"}, "BJgqQ6NYvB": {"content": {"appendix": "", "TL;DR": "We present a real-time segmentation model automatically discovered by a multi-scale NAS framework, running 30% faster than state-of-the-art models.", "keywords": ["neural architecture search", "regularization", "semantic segmentation"], "paperhash": "chen|fasterseg_searching_for_faster_realtime_semantic_segmentation", "code": "https://github.com/TAMU-VITA/FasterSeg", "spotlight_video": "", "authorids": ["wuyang.chen@tamu.edu", "xy_gong@tamu.edu", "xianming.liu@horizon.ai", "qian01.zhang@horizon.ai", "yuan.li@horizon.ai", "atlaswang@tamu.edu"], "poster": "", "slides": "", "authors": ["Wuyang Chen", "Xinyu Gong", "Xianming Liu", "Qian Zhang", "Yuan Li", "Zhangyang Wang"], "_bibtex": "@inproceedings{\nChen2020FasterSeg:,\ntitle={FasterSeg: Searching for Faster Real-time Semantic Segmentation},\nauthor={Wuyang Chen and Xinyu Gong and Xianming Liu and Qian Zhang and Yuan Li and Zhangyang Wang},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=BJgqQ6NYvB}\n}", "original_pdf": "/attachment/21f861f2547a1b304a3fce97cf824827e50c5642.pdf", "title": "FasterSeg: Searching for Faster Real-time Semantic Segmentation", "pdf": "/pdf/7ffae4bab2b34b32388f186bee73a9872eef2134.pdf", "abstract": "We present FasterSeg, an automatically designed semantic segmentation network with not only state-of-the-art performance but also faster speed than current methods. Utilizing neural architecture search (NAS), FasterSeg is discovered from a novel and broader search space integrating multi-resolution branches, which has recently been found to be vital in manually designed segmentation models. To better calibrate the balance between the goals of high accuracy and low latency, we propose a decoupled and fine-grained latency regularization, which effectively overcomes the phenomenon we observed that the searched networks are prone to \"collapsing\" to low-latency yet poor-accuracy models. Moreover, we seamlessly extend FasterSeg to a new collaborative search (co-searching) framework, simultaneously searching for a teacher and a student network in the same single run. The teacher-student distillation further boosts the student model\u2019s accuracy. Experiments on popular segmentation benchmarks demonstrate the competency of FasterSeg. 
For example, FasterSeg can run over 30% faster than the closest manually designed competitor on Cityscapes, while maintaining comparable accuracy.", "full_presentation_video": ""}, "forum": "BJgqQ6NYvB", "id": "BJgqQ6NYvB"}, "r1eiu2VtwH": {"content": {"appendix": "", "TL;DR": "We propose a new DNN architecture for deep learning on tabular data", "keywords": ["dnn", "ensembles", "optimization", "representation learning", "tabular data"], "paperhash": "popov|neural_oblivious_decision_ensembles_for_deep_learning_on_tabular_data", "code": "https://github.com/anonICLR2020/node", "spotlight_video": "", "authorids": ["sapopov@yandex-team.ru", "stanis-morozov@yandex.ru", "artem.babenko@phystech.edu"], "poster": "", "slides": "", "authors": ["Sergei Popov", "Stanislav Morozov", "Artem Babenko"], "_bibtex": "@inproceedings{\nPopov2020Neural,\ntitle={Neural Oblivious Decision Ensembles for Deep Learning on Tabular Data},\nauthor={Sergei Popov and Stanislav Morozov and Artem Babenko},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=r1eiu2VtwH}\n}", "original_pdf": "/attachment/aa66b292289a7945e26b2fbd3c0395b98c0f338e.pdf", "title": "Neural Oblivious Decision Ensembles for Deep Learning on Tabular Data", "pdf": "/pdf/a232ed6c3759aed211ca949a7fe572d04605f653.pdf", "abstract": "Nowadays, deep neural networks (DNNs) have become the main instrument for machine learning tasks within a wide range of domains, including vision, NLP, and speech. Meanwhile, in the important case of heterogeneous tabular data, the advantage of DNNs over shallow counterparts remains questionable. In particular, there is not sufficient evidence that deep learning machinery allows constructing methods that outperform gradient boosting decision trees (GBDT), which are often the top choice for tabular problems. In this paper, we introduce Neural Oblivious Decision Ensembles (NODE), a new deep learning architecture, designed to work with any tabular data. In a nutshell, the proposed NODE architecture generalizes ensembles of oblivious decision trees, but benefits from both end-to-end gradient-based optimization and the power of multi-layer hierarchical representation learning. With an extensive experimental comparison to the leading GBDT packages on a large number of tabular datasets, we demonstrate the advantage of the proposed NODE architecture, which outperforms the competitors on most of the tasks. 
We open-source the PyTorch implementation of NODE and believe that it will become a universal framework for machine learning on tabular data.", "full_presentation_video": ""}, "forum": "r1eiu2VtwH", "id": "r1eiu2VtwH"}, "B1x1ma4tDr": {"content": {"appendix": "", "TL;DR": "Better audio synthesis by combining interpretable DSP with end-to-end learning.", "keywords": ["adversarial", "audio", "autoencoder", "autoregressive models", "disentanglement", "expressive power", "generation", "generative models", "inductive bias"], "paperhash": "engel|ddsp_differentiable_digital_signal_processing", "code": "https://github.com/magenta/ddsp", "spotlight_video": "", "authorids": ["jesseengel@google.com", "hanoih@google.com", "gcj@google.com", "adarob@google.com"], "poster": "", "slides": "", "authors": ["Jesse Engel", "Lamtharn (Hanoi) Hantrakul", "Chenjie Gu", "Adam Roberts"], "_bibtex": "@inproceedings{\nEngel2020DDSP:,\ntitle={DDSP: Differentiable Digital Signal Processing},\nauthor={Jesse Engel and Lamtharn (Hanoi) Hantrakul and Chenjie Gu and Adam Roberts},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=B1x1ma4tDr}\n}", "original_pdf": "/attachment/6f0a406776dca51c4f1770b8bcc6c64eb75235b8.pdf", "title": "DDSP: Differentiable Digital Signal Processing", "pdf": "/pdf/7373c54036f812b939713120fd76d3e220397a72.pdf", "abstract": "Most generative models of audio directly generate samples in one of two domains: time or frequency. While sufficient to express any signal, these representations are inefficient, as they do not utilize existing knowledge of how sound is generated and perceived. A third approach (vocoders/synthesizers) successfully incorporates strong domain knowledge of signal processing and perception, but has been less actively researched due to limited expressivity and difficulty integrating with modern auto-differentiation-based machine learning methods. In this paper, we introduce the Differentiable Digital Signal Processing (DDSP) library, which enables direct integration of classic signal processing elements with deep learning methods. Focusing on audio synthesis, we achieve high-fidelity generation without the need for large autoregressive models or adversarial losses, demonstrating that DDSP enables utilizing strong inductive biases without losing the expressive power of neural networks. Further, we show that combining interpretable modules permits manipulation of each separate model component, with applications such as independent control of pitch and loudness, realistic extrapolation to pitches not seen during training, blind dereverberation of room acoustics, transfer of extracted room acoustics to new environments, and transformation of timbre between disparate sources. In short, DDSP enables an interpretable and modular approach to generative modeling, without sacrificing the benefits of deep learning. 
The library is available at https://github.com/magenta/ddsp and we encourage further contributions from the community and domain experts.\n", "full_presentation_video": ""}, "forum": "B1x1ma4tDr", "id": "B1x1ma4tDr"}, "H1gBsgBYwH": {"content": {"appendix": "", "TL;DR": "Derived population risk of two-layer neural networks in high dimensions and examined presence / absence of \"double descent\".", "keywords": ["generalization", "inductive bias", "regression"], "paperhash": "ba|generalization_of_twolayer_neural_networks_an_asymptotic_viewpoint", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Jimmy Ba", "Murat Erdogdu", "Taiji Suzuki", "Denny Wu", "Tianzong Zhang"], "_bibtex": "@inproceedings{\nBa2020Generalization,\ntitle={Generalization of Two-layer Neural Networks: An Asymptotic Viewpoint},\nauthor={Jimmy Ba and Murat Erdogdu and Taiji Suzuki and Denny Wu and Tianzong Zhang},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=H1gBsgBYwH}\n}", "authorids": ["jba@cs.toronto.edu", "erdogdu@cs.toronto.edu", "taiji@mist.i.u-tokyo.ac.jp", "dennywu@cs.toronto.edu", "ztz16@mails.tsinghua.edu.cn"], "title": "Generalization of Two-layer Neural Networks: An Asymptotic Viewpoint", "original_pdf": "/attachment/414bf03d13704ae0089a886abcb075b6eafa33b4.pdf", "pdf": "/pdf/f00f191828c2fb2bfaedb094247dab585d9b6b7f.pdf", "abstract": "This paper investigates the generalization properties of two-layer neural networks in high dimensions, i.e., when the number of samples $n$, features $d$, and neurons $h$ tend to infinity at the same rate. Specifically, we derive the exact population risk of the unregularized least squares regression problem with two-layer neural networks when either the first or the second layer is trained using a gradient flow under different initialization setups. When only the second layer coefficients are optimized, we recover the \textit{double descent} phenomenon: a cusp in the population risk appears at $h\approx n$ and further overparameterization decreases the risk. In contrast, when the first layer weights are optimized, we highlight how different scales of initialization lead to different inductive biases, and show that the resulting risk is \textit{independent} of overparameterization. 
Our theoretical and experimental results suggest that previously studied model setups that provably give rise to \\textit{double descent} might not translate to optimizing two-layer neural networks.", "full_presentation_video": ""}, "forum": "H1gBsgBYwH", "id": "H1gBsgBYwH"}, "HkgsPhNYPS": {"content": {"appendix": "", "TL;DR": "We propose a self-ensemble framework to train more robust deep learning models under noisy labeled datasets.", "keywords": ["ensemble learning", "noisy labels", "robust learning", "semi supervised learning", "unsupervised"], "paperhash": "nguyen|self_learning_to_filter_noisy_labels_with_selfensembling", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Duc Tam Nguyen", "Chaithanya Kumar Mummadi", "Thi Phuong Nhung Ngo", "Thi Hoai Phuong Nguyen", "Laura Beggel", "Thomas Brox"], "_bibtex": "@inproceedings{\nNguyen2020SELF:,\ntitle={SELF: Learning to Filter Noisy Labels with Self-Ensembling},\nauthor={Duc Tam Nguyen and Chaithanya Kumar Mummadi and Thi Phuong Nhung Ngo and Thi Hoai Phuong Nguyen and Laura Beggel and Thomas Brox},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=HkgsPhNYPS}\n}", "authorids": ["ductam.nguyen08@gmail.com", "chaithanyakumar.mummadi@de.bosch.com", "thiphuongnhung.ngo@de.bosch.com", "hoai.phuong.nguyen198@gmail.com", "laura.beggel@de.bosch.com", "brox@cs.uni-freiburg.de"], "title": "SELF: Learning to Filter Noisy Labels with Self-Ensembling", "original_pdf": "/attachment/f53c1264354a72df6a3fdcfe8a3344abd4fde7f2.pdf", "pdf": "/pdf/3bafa3876cfccbe25d2fc5c4db5cd495aae2e6c7.pdf", "abstract": "Deep neural networks (DNNs) have been shown to over-fit a dataset when being trained with noisy labels for a long enough time. To overcome this problem, we present a simple and effective method self-ensemble label filtering (SELF) to progressively filter out the wrong labels during training. Our method improves the task performance by gradually allowing supervision only from the potentially non-noisy (clean) labels and stops learning on the filtered noisy labels. For the filtering, we form running averages of predictions over the entire training dataset using the network output at different training epochs. We show that these ensemble estimates yield more accurate identification of inconsistent predictions throughout training than the single estimates of the network at the most recent training epoch. While filtered samples are removed entirely from the supervised training loss, we dynamically leverage them via semi-supervised learning in the unsupervised loss. We demonstrate the positive effect of such an approach on various image classification tasks under both symmetric and asymmetric label noise and at different noise ratios. 
It substantially outperforms all previous works on noise-aware learning across different datasets and can be applied to a broad set of network architectures.", "full_presentation_video": ""}, "forum": "HkgsPhNYPS", "id": "HkgsPhNYPS"}, "HklXn1BKDH": {"content": {"appendix": "", "TL;DR": "A modular and hierarchical approach to learn policies for exploring 3D environments.", "keywords": ["navigation", "robustness"], "paperhash": "chaplot|learning_to_explore_using_active_neural_slam", "code": "https://github.com/devendrachaplot/Neural-SLAM", "spotlight_video": "", "authorids": ["chaplot@cs.cmu.edu", "dhirajgandhi@fb.com", "saurabhg@illinois.edu", "abhinavg@cs.cmu.edu", "rsalakhu@cs.cmu.edu"], "poster": "", "slides": "", "authors": ["Devendra Singh Chaplot", "Dhiraj Gandhi", "Saurabh Gupta", "Abhinav Gupta", "Ruslan Salakhutdinov"], "_bibtex": "@inproceedings{\nChaplot2020Learning,\ntitle={Learning To Explore Using Active Neural SLAM},\nauthor={Devendra Singh Chaplot and Dhiraj Gandhi and Saurabh Gupta and Abhinav Gupta and Ruslan Salakhutdinov},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=HklXn1BKDH}\n}", "original_pdf": "/attachment/050b53b18bb2788b2e860079412d7c62867a008a.pdf", "title": "Learning To Explore Using Active Neural SLAM", "pdf": "/pdf/919bbfc97b66919dc0a9cd2ec89e94caf5a8c725.pdf", "abstract": "This work presents a modular and hierarchical approach to learn policies for exploring 3D environments, called `Active Neural SLAM'. Our approach leverages the strengths of both classical and learning-based methods, by using analytical path planners with learned SLAM module, and global and local policies. The use of learning provides flexibility with respect to input modalities (in the SLAM module), leverages structural regularities of the world (in global policies), and provides robustness to errors in state estimation (in local policies). Such use of learning within each module retains its benefits, while at the same time, hierarchical decomposition and modular training allow us to sidestep the high sample complexities associated with training end-to-end policies. Our experiments in visually and physically realistic simulated 3D environments demonstrate the effectiveness of our approach over past learning and geometry-based approaches. The proposed model can also be easily transferred to the PointGoal task and was the winning entry of CVPR 2019 Habitat PointGoal Navigation Challenge.\n", "full_presentation_video": ""}, "forum": "HklXn1BKDH", "id": "HklXn1BKDH"}, "BJeKh3VYDH": {"content": {"appendix": "", "TL;DR": "We propose a generative neural network approach for temporally coherent point clouds.", "keywords": ["denoising"], "paperhash": "prantl|tranquil_clouds_neural_networks_for_learning_temporally_coherent_features_in_point_clouds", "spotlight_video": "", "poster": "", "slides": "", "abstract": "Point clouds, as a form of Lagrangian representation, allow for powerful and flexible applications in a large number of computational disciplines. We propose a novel deep-learning method to learn stable and temporally coherent feature spaces for points clouds that change over time. We identify a set of inherent problems with these approaches: without knowledge of the time dimension, the inferred solutions can exhibit strong flickering, and easy solutions to suppress this flickering can result in undesirable local minima that manifest themselves as halo structures. 
We propose a novel temporal loss function that takes into account higher time derivatives of the point positions, and encourages mingling, i.e., to prevent the aforementioned halos. We combine these techniques in a super-resolution method with a truncation approach to flexibly adapt the size of the generated positions. We show that our method works for large, deforming point sets from different sources to demonstrate the flexibility of our approach.", "_bibtex": "@inproceedings{\nPrantl2020Tranquil,\ntitle={Tranquil Clouds: Neural Networks for Learning Temporally Coherent Features in Point Clouds},\nauthor={Lukas Prantl and Nuttapong Chentanez and Stefan Jeschke and Nils Thuerey},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=BJeKh3VYDH}\n}", "authorids": ["lukas.prantl@tum.de", "nuttapong26@gmail.com", "jeschke@stefan-jeschke.com", "nils.thuerey@tum.de"], "title": "Tranquil Clouds: Neural Networks for Learning Temporally Coherent Features in Point Clouds", "authors": ["Lukas Prantl", "Nuttapong Chentanez", "Stefan Jeschke", "Nils Thuerey"], "original_pdf": "/attachment/bace279991894089e1918c8ba21c381dadcdf4fb.pdf", "pdf": "/pdf/585439c3aa3f3d0fa87a24893c86c5384ee58d27.pdf", "full_presentation_video": ""}, "forum": "BJeKh3VYDH", "id": "BJeKh3VYDH"}, "B1eWOJHKvB": {"content": {"appendix": "", "TL;DR": "The space of approximate solutions of CycleGAN admits a lot of symmetry, and an identity loss does not fix this.", "keywords": ["adversarial", "generative models", "perturbation"], "paperhash": "moriakov|kernel_of_cyclegan_as_a_principal_homogeneous_space", "spotlight_video": "", "poster": "", "slides": "", "abstract": "Unpaired image-to-image translation has attracted significant interest due to the invention of CycleGAN, a method which utilizes a combination of adversarial and cycle consistency losses to avoid the need for paired data. It is known that the CycleGAN problem might admit multiple solutions, and our goal in this paper is to analyze the space of exact solutions and to give perturbation bounds for approximate solutions. We show theoretically that the exact solution space is invariant with respect to automorphisms of the underlying probability spaces, and, furthermore, that the group of automorphisms acts freely and transitively on the space of exact solutions. We examine the case of zero pure CycleGAN loss first in its generality, and, subsequently, expand our analysis to approximate solutions for extended CycleGAN loss where identity loss term is included. In order to demonstrate that these results are applicable, we show that under mild conditions nontrivial smooth automorphisms exist. Furthermore, we provide empirical evidence that neural networks can learn these automorphisms with unexpected and unwanted results. 
We conclude that finding optimal solutions to the CycleGAN loss does not necessarily lead to the envisioned result in image-to-image translation tasks and that underlying hidden symmetries can render the result useless.", "_bibtex": "@inproceedings{\nMoriakov2020Kernel,\ntitle={Kernel of CycleGAN as a principal homogeneous space},\nauthor={Nikita Moriakov and Jonas Adler and Jonas Teuwen},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=B1eWOJHKvB}\n}", "authorids": ["nikita.moriakov@radboudumc.nl", "jonasadl@kth.se", "jonas.teuwen@radboudumc.nl"], "title": "Kernel of CycleGAN as a principal homogeneous space", "authors": ["Nikita Moriakov", "Jonas Adler", "Jonas Teuwen"], "original_pdf": "/attachment/36437dd53eb5b202960d428497299859983c005a.pdf", "pdf": "/pdf/8790b57a87025087d771a181a86afbd4b4282d3d.pdf", "full_presentation_video": ""}, "forum": "B1eWOJHKvB", "id": "B1eWOJHKvB"}, "HJe_yR4Fwr": {"content": {"appendix": "", "TL;DR": "We propose a new notion of margin that has a direct relationship with neural net generalization, and obtain improved generalization bounds for neural nets and robust classification by analyzing this margin.", "keywords": ["adversarial", "deep learning theory", "generalization"], "paperhash": "wei|improved_sample_complexities_for_deep_neural_networks_and_robust_classification_via_an_alllayer_margin", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Colin Wei", "Tengyu Ma"], "_bibtex": "@inproceedings{\nWei2020Improved,\ntitle={Improved Sample Complexities for Deep Neural Networks and Robust Classification via an All-Layer Margin},\nauthor={Colin Wei and Tengyu Ma},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=HJe_yR4Fwr}\n}", "authorids": ["colinwei@stanford.edu", "tengyuma@cs.stanford.edu"], "title": "Improved Sample Complexities for Deep Neural Networks and Robust Classification via an All-Layer Margin", "original_pdf": "/attachment/3e0278730e9ebd1eceb363e4af77390c6795935d.pdf", "pdf": "/pdf/68a22f3da586dac673478e301763f5516fdf3f10.pdf", "abstract": "For linear classifiers, the relationship between (normalized) output margin and generalization is captured in a clear and simple bound \u2013 a large output margin implies good generalization. Unfortunately, for deep models, this relationship is less clear: existing analyses of the output margin give complicated bounds which sometimes depend exponentially on depth. In this work, we propose to instead analyze a new notion of margin, which we call the \u201call-layer margin.\u201d Our analysis reveals that the all-layer margin has a clear and direct relationship with generalization for deep models. This enables the following concrete applications of the all-layer margin: 1) by analyzing the all-layer margin, we obtain tighter generalization bounds for neural nets which depend on Jacobian and hidden layer norms and remove the exponential dependency on depth 2) our neural net results easily translate to the adversarially robust setting, giving the first direct analysis of robust test error for deep networks, and 3) we present a theoretically inspired training algorithm for increasing the all-layer margin. 
Our algorithm improves both clean and adversarially robust test performance over strong baselines in practice.", "full_presentation_video": ""}, "forum": "HJe_yR4Fwr", "id": "HJe_yR4Fwr"}, "H1gNOeHKPS": {"content": {"appendix": "", "keywords": ["inductive bias"], "paperhash": "madsen|neural_arithmetic_units", "code": "https://github.com/AndreasMadsen/stable-nalu", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Andreas Madsen", "Alexander Rosenberg Johansen"], "_bibtex": "@inproceedings{\nMadsen2020Neural,\ntitle={Neural Arithmetic Units},\nauthor={Andreas Madsen and Alexander Rosenberg Johansen},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=H1gNOeHKPS}\n}", "authorids": ["amwebdk@gmail.com", "alexander@herhjemme.dk"], "title": "Neural Arithmetic Units", "original_pdf": "/attachment/2c22f94ccc3cea6eca9a516ac3c80b39f6db4bd9.pdf", "pdf": "/pdf/e6df7774ac7fa62900637ffe2b78bd8d6fa1a107.pdf", "abstract": "Neural networks can approximate complex functions, but they struggle to perform exact arithmetic operations over real numbers. The lack of inductive bias for arithmetic operations leaves neural networks without the underlying logic necessary to extrapolate on tasks such as addition, subtraction, and multiplication. We present two new neural network components: the Neural Addition Unit (NAU), which can learn exact addition and subtraction; and the Neural Multiplication Unit (NMU) that can multiply subsets of a vector. The NMU is, to our knowledge, the first arithmetic neural network component that can learn to multiply elements from a vector, when the hidden size is large. The two new components draw inspiration from a theoretical analysis of recently proposed arithmetic components. We find that careful initialization, restricting parameter space, and regularizing for sparsity are important when optimizing the NAU and NMU. 
Our proposed units NAU and NMU, compared with previous neural units, converge more consistently, have fewer parameters, learn faster, can converge for larger hidden sizes, obtain sparse and meaningful weights, and can extrapolate to negative and small values.", "full_presentation_video": ""}, "forum": "H1gNOeHKPS", "id": "H1gNOeHKPS"}, "Sklf1yrYDr": {"content": {"appendix": "", "TL;DR": "We introduced BatchEnsemble, an efficient method for ensembling and lifelong learning which can be used to improve the accuracy and uncertainty of any neural network like typical ensemble methods.", "keywords": ["ensembles", "imagenet", "lifelong learning", "memory", "uncertainty"], "paperhash": "wen|batchensemble_an_alternative_approach_to_efficient_ensemble_and_lifelong_learning", "code": "https://github.com/google/edward2", "spotlight_video": "", "authorids": ["ywen@cs.toronto.edu", "trandustin@google.com", "jba@cs.toronto.edu"], "poster": "", "slides": "", "authors": ["Yeming Wen", "Dustin Tran", "Jimmy Ba"], "_bibtex": "@inproceedings{\nWen2020BatchEnsemble:,\ntitle={BatchEnsemble: an Alternative Approach to Efficient Ensemble and Lifelong Learning},\nauthor={Yeming Wen and Dustin Tran and Jimmy Ba},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=Sklf1yrYDr}\n}", "original_pdf": "/attachment/fb99c648003a2449c6d313b06284e31d5aad732c.pdf", "title": "BatchEnsemble: an Alternative Approach to Efficient Ensemble and Lifelong Learning", "pdf": "/pdf/96249771b93f548bee53e2c0225e8385e27d3b0b.pdf", "abstract": "\nEnsembles, where multiple neural networks are trained individually and their predictions are averaged, have been shown to be widely successful for improving both the accuracy and predictive uncertainty of single neural networks. However, an ensemble\u2019s cost for both training and testing increases linearly with the number of networks, which quickly becomes untenable.\nIn this paper, we propose BatchEnsemble, an ensemble method whose computational and memory costs are significantly lower than typical ensembles. BatchEnsemble achieves this by defining each weight matrix to be the Hadamard product of a shared weight among all ensemble members and a rank-one matrix per member. Unlike ensembles, BatchEnsemble is not only parallelizable across devices, where one device trains one member, but also parallelizable within a device, where multiple ensemble members are updated simultaneously for a given mini-batch. Across CIFAR-10, CIFAR-100, WMT14 EN-DE/EN-FR translation, and out-of-distribution tasks, BatchEnsemble yields accuracy and uncertainties competitive with typical ensembles; the speedup at test time is 3X and memory reduction is 3X at an ensemble of size 4. We also apply BatchEnsemble to lifelong learning, where on Split-CIFAR-100, BatchEnsemble yields comparable performance to progressive neural networks while having much lower computational and memory costs. 
We further show that BatchEnsemble can easily scale up to lifelong learning on Split-ImageNet, which involves 100 sequential learning tasks.", "full_presentation_video": ""}, "forum": "Sklf1yrYDr", "id": "Sklf1yrYDr"}, "Hke-WTVtwr": {"content": {"appendix": "", "keywords": ["language modeling", "machine translation", "nlp", "text classification", "transformer", "word embedding", "word embeddings"], "paperhash": "wang|encoding_word_order_in_complex_embeddings", "code": "https://github.com/iclr-complex-order/complex-order", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Benyou Wang", "Donghao Zhao", "Christina Lioma", "Qiuchi Li", "Peng Zhang", "Jakob Grue Simonsen"], "_bibtex": "@inproceedings{\nWang2020Encoding,\ntitle={Encoding word order in complex embeddings},\nauthor={Benyou Wang and Donghao Zhao and Christina Lioma and Qiuchi Li and Peng Zhang and Jakob Grue Simonsen},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=Hke-WTVtwr}\n}", "authorids": ["wang@dei.unipd.it", "zhaodh@tju.edu.cn", "chrh@di.ku.dk", "qiuchili@dei.unipd.it", "pzhang@tju.edu.cn", "simonsen@di.ku.dk"], "title": "Encoding word order in complex embeddings", "original_pdf": "/attachment/dc81904cae9bfbd038037134a8b1340536c7ff82.pdf", "pdf": "/pdf/be07209dc3a935c61ba2a7f215feb20a760e70e5.pdf", "abstract": "Sequential word order is important when processing text. Currently, neural networks (NNs) address this by modeling word position using position embeddings. The problem is that position embeddings capture the position of individual words, but not the ordered relationship (e.g., adjacency or precedence) between individual word positions. We present a novel and principled solution for modeling both the global absolute positions of words and their order relationships. Our solution generalizes word embeddings, previously defined as independent vectors, to continuous word functions over a variable (position). The benefit of continuous functions over variable positions is that word representations shift smoothly with increasing positions. Hence, word representations in different positions can correlate with each other in a continuous function. The general solution of these functions can be extended to complex-valued variants. We extend CNN, RNN and Transformer NNs to complex-valued versions to incorporate our complex embedding (we make all code available). Experiments on text classification, machine translation and language modeling show gains over both classical word embeddings and position-enriched word embeddings. To our knowledge, this is the first work in NLP to link imaginary numbers in complex-valued representations to concrete meanings (i.e., word order).", "full_presentation_video": ""}, "forum": "Hke-WTVtwr", "id": "Hke-WTVtwr"}, "SygXPaEYvH": {"content": {"appendix": "", "TL;DR": "VL-BERT is a simple yet powerful pre-trainable generic representation for visual-linguistic tasks. 
It is pre-trained on the massive-scale caption dataset and text-only corpus, and can be finetuned for various downstream visual-linguistic tasks.", "keywords": ["pre training", "question answering", "reasoning", "transformer"], "paperhash": "su|vlbert_pretraining_of_generic_visuallinguistic_representations", "code": "https://github.com/jackroos/VL-BERT", "spotlight_video": "", "authorids": ["jackroos@mail.ustc.edu.cn", "ezra0408@mail.ustc.edu.cn", "yuecao@microsoft.com", "binli@ustc.edu.cn", "lewlu@microsoft.com", "fuwei@microsoft.com", "jifdai@microsoft.com"], "poster": "", "slides": "", "authors": ["Weijie Su", "Xizhou Zhu", "Yue Cao", "Bin Li", "Lewei Lu", "Furu Wei", "Jifeng Dai"], "_bibtex": "@inproceedings{\nSu2020VL-BERT:,\ntitle={VL-BERT: Pre-training of Generic Visual-Linguistic Representations},\nauthor={Weijie Su and Xizhou Zhu and Yue Cao and Bin Li and Lewei Lu and Furu Wei and Jifeng Dai},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=SygXPaEYvH}\n}", "original_pdf": "/attachment/23f94942628459a5cd74c98bdad25e357d21ff8a.pdf", "title": "VL-BERT: Pre-training of Generic Visual-Linguistic Representations", "pdf": "/pdf/82a3e5b930bb31fb2830baf68da549d0431c8209.pdf", "abstract": "We introduce a new pre-trainable generic representation for visual-linguistic tasks, called Visual-Linguistic BERT (VL-BERT for short). VL-BERT adopts the simple yet powerful Transformer model as the backbone, and extends it to take both visual and linguistic embedded features as input. In it, each element of the input is either a word from the input sentence, or a region-of-interest (RoI) from the input image. It is designed to fit most visual-linguistic downstream tasks. To better exploit the generic representation, we pre-train VL-BERT on the massive-scale Conceptual Captions dataset, together with text-only corpus. Extensive empirical analysis demonstrates that the pre-training procedure can better align the visual-linguistic clues and benefit the downstream tasks, such as visual commonsense reasoning, visual question answering and referring expression comprehension. 
It is worth noting that VL-BERT achieved first place among single models on the leaderboard of the VCR benchmark.", "full_presentation_video": ""}, "forum": "SygXPaEYvH", "id": "SygXPaEYvH"}, "BJgWE1SFwS": {"content": {"appendix": "", "TL;DR": "We propose a generic neural network architecture equipping Pairwise Choice Markov Chains choice models with amortized and automatic differentiation based inference using alternatives' and individuals' features.", "keywords": [], "paperhash": "lh\u00e9ritier|pcmcnet_featurebased_pairwise_choice_markov_chains", "code": "https://github.com/alherit/PCMC-Net", "spotlight_video": "", "authorids": ["alherit@gmail.com"], "poster": "", "slides": "", "authors": ["Alix Lh\u00e9ritier"], "_bibtex": "@inproceedings{\nLh\u00e9ritier2020PCMC-Net:,\ntitle={PCMC-Net: Feature-based Pairwise Choice Markov Chains},\nauthor={Alix Lh\u00e9ritier},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=BJgWE1SFwS}\n}", "original_pdf": "/attachment/2b650fcba58dd7f6bd3295faca33c65b3ed813c0.pdf", "title": "PCMC-Net: Feature-based Pairwise Choice Markov Chains", "pdf": "/pdf/c156e921437557833ce502262c1c9455daf0b48c.pdf", "abstract": "Pairwise Choice Markov Chains (PCMC) have been recently introduced to overcome limitations of choice models based on traditional axioms unable to express empirical observations from modern behavior economics like context effects occurring when a choice between two options is altered by adding a third alternative. The inference approach that estimates the transition rates between each possible pair of alternatives via maximum likelihood suffers when the examples of each alternative are scarce and is inappropriate when new alternatives can be observed at test time. In this work, we propose an amortized inference approach for PCMC by embedding its definition into a neural network that represents transition rates as a function of the alternatives' and individual's features. We apply our construction to the complex case of airline itinerary booking where singletons are common (due to varying prices and individual-specific itineraries), and context effects and behaviors strongly dependent on market segments are observed. 
Experiments show our network significantly outperforming, in terms of prediction accuracy and logarithmic loss, feature engineered standard and latent class Multinomial Logit models as well as recent machine learning approaches.", "full_presentation_video": ""}, "forum": "BJgWE1SFwS", "id": "BJgWE1SFwS"}, "rJxWxxSYvB": {"content": {"appendix": "", "TL;DR": "We present a learning rule for feedback weights in a spiking neural network that addresses the weight transport problem.", "keywords": ["feedback alignment", "gradient descent", "regression"], "paperhash": "guerguiev|spikebased_causal_inference_for_weight_alignment", "code": "https://anonfile.com/51V8Ge66n3/Code_zip", "spotlight_video": "", "authorids": ["jordan.guerguiev@utoronto.ca", "koerding@gmail.com", "blake.richards@mcgill.ca"], "poster": "", "slides": "", "authors": ["Jordan Guerguiev", "Konrad Kording", "Blake Richards"], "_bibtex": "@inproceedings{\nGuerguiev2020Spike-based,\ntitle={Spike-based causal inference for weight alignment},\nauthor={Jordan Guerguiev and Konrad Kording and Blake Richards},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=rJxWxxSYvB}\n}", "original_pdf": "/attachment/b7c0b84df96b2baf079ef31d47fb9e47b6d84e7a.pdf", "title": "Spike-based causal inference for weight alignment", "pdf": "/pdf/857516daf7cda3fae833c10b2e12ed6cf6094d49.pdf", "abstract": "In artificial neural networks trained with gradient descent, the weights used for processing stimuli are also used during backward passes to calculate gradients. For the real brain to approximate gradients, gradient information would have to be propagated separately, such that one set of synaptic weights is used for processing and another set is used for backward passes. This produces the so-called \"weight transport problem\" for biological models of learning, where the backward weights used to calculate gradients need to mirror the forward weights used to process stimuli. This weight transport problem has been considered so hard that popular proposals for biological learning assume that the backward weights are simply random, as in the feedback alignment algorithm. However, such random weights do not appear to work well for large networks. Here we show how the discontinuity introduced in a spiking system can lead to a solution to this problem. The resulting algorithm is a special case of an estimator used for causal inference in econometrics, regression discontinuity design. We show empirically that this algorithm rapidly makes the backward weights approximate the forward weights. As the backward weights become correct, this improves learning performance over feedback alignment on tasks such as Fashion-MNIST and CIFAR-10. Our results demonstrate that a simple learning rule in a spiking network can allow neurons to produce the right backward connections and thus solve the weight transport problem.", "full_presentation_video": ""}, "forum": "rJxWxxSYvB", "id": "rJxWxxSYvB"}, "BJxG_0EtDS": {"content": {"appendix": "", "TL;DR": "Learning embedding for control with high-dimensional observations", "keywords": ["representation learning", "variational inference"], "paperhash": "levine|prediction_consistency_curvature_representation_learning_for_locallylinear_control", "spotlight_video": "", "poster": "", "slides": "", "abstract": "Many real-world sequential decision-making problems can be formulated as optimal control with high-dimensional observations and unknown dynamics. 
A promising approach is to embed the high-dimensional observations into a lower-dimensional latent representation space, estimate the latent dynamics model, then utilize this model for control in the latent space. An important open question is how to learn a representation that is amenable to existing control algorithms. In this paper, we focus on learning representations for locally-linear control algorithms, such as iterative LQR (iLQR). By formulating and analyzing the representation learning problem from an optimal control perspective, we establish three underlying principles that the learned representation should comprise: 1) accurate prediction in the observation space, 2) consistency between latent and observation space dynamics, and 3) low curvature in the latent space transitions. These principles naturally correspond to a loss function that consists of three terms: prediction, consistency, and curvature (PCC). Crucially, to make PCC tractable, we derive an amortized variational bound for the PCC loss function. Extensive experiments on benchmark domains demonstrate that the new variational-PCC learning algorithm benefits from significantly more stable and reproducible training, and leads to superior control performance. Further ablation studies give support to the importance of all three PCC components for learning a good latent space for control.", "_bibtex": "@inproceedings{\nLevine2020Prediction,\ntitle={Prediction, Consistency, Curvature: Representation Learning for Locally-Linear Control},\nauthor={Nir Levine and Yinlam Chow and Rui Shu and Ang Li and Mohammad Ghavamzadeh and Hung Bui},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=BJxG_0EtDS}\n}", "authorids": ["nirlevine@google.com", "yinlamchow@google.com", "ruishu@stanford.edu", "anglili@google.com", "mgh@fb.com", "v.hungbh1@vinai.io"], "title": "Prediction, Consistency, Curvature: Representation Learning for Locally-Linear Control", "authors": ["Nir Levine", "Yinlam Chow", "Rui Shu", "Ang Li", "Mohammad Ghavamzadeh", "Hung Bui"], "original_pdf": "/attachment/16be8d9f3e4ad898a15eb90334cc700a3027ed7c.pdf", "pdf": "/pdf/45fa6ffcf44b62ef51bf27ff0cd00e506b9d116d.pdf", "full_presentation_video": ""}, "forum": "BJxG_0EtDS", "id": "BJxG_0EtDS"}, "SJeY-1BKDS": {"content": {"appendix": "", "TL;DR": "We compare the l4-norm based dictionary learning with PCA, ICA and show its stability as well as robustness.", "keywords": ["dictionary learning", "robustness", "sparse coding", "stability"], "paperhash": "zhai|understanding_l4based_dictionary_learning_interpretation_stability_and_robustness", "code": "https://github.com/hermish/ZMZM-ICLR-2020", "spotlight_video": "", "authorids": ["ysz@berkeley.edu", "hermish@berkeley.edu", "zyzhou@stanford.edu", "yima@eecs.berkeley.edu"], "poster": "", "slides": "", "authors": ["Yuexiang Zhai", "Hermish Mehta", "Zhengyuan Zhou", "Yi Ma"], "_bibtex": "@inproceedings{\nZhai2020Understanding,\ntitle={Understanding l4-based Dictionary Learning: Interpretation, Stability, and Robustness},\nauthor={Yuexiang Zhai and Hermish Mehta and Zhengyuan Zhou and Yi Ma},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=SJeY-1BKDS}\n}", "original_pdf": "/attachment/5f236c0def40e09eb75fca84045199eb7f5ab47c.pdf", "title": "Understanding l4-based Dictionary Learning: Interpretation, Stability, and Robustness", "pdf": "/pdf/82f69ebb81f2a166efaed6c58908c26ecb0bde6b.pdf", 
"abstract": "Recently, the $\\ell^4$-norm maximization has been proposed to solve the sparse dictionary learning (SDL) problem. The simple MSP (matching, stretching, and projection) algorithm proposed by \\cite{zhai2019a} has proved surprisingly efficient and effective. This paper aims to better understand this algorithm from its strong geometric and statistical connections with the classic PCA and ICA, as well as their associated fixed-point style algorithms. Such connections provide a unified way of viewing problems that pursue {\\em principal}, {\\em independent}, or {\\em sparse} components of high-dimensional data. Our studies reveal additional good properties of $\\ell^4$-maximization: not only is the MSP algorithm for sparse coding insensitive to small noise, but it is also robust to outliers and resilient to sparse corruptions. We provide statistical justification for such inherently nice properties. To corroborate the theoretical analysis, we also provide extensive and compelling experimental evidence with both synthetic data and real images.", "full_presentation_video": ""}, "forum": "SJeY-1BKDS", "id": "SJeY-1BKDS"}, "rkgOlCVYvB": {"content": {"appendix": "", "keywords": ["loss landscape"], "paperhash": "trager|pure_and_spurious_critical_points_a_geometric_study_of_linear_networks", "code": "https://drive.google.com/file/d/1eSU6mwgmowSAyQY3b1jXPzvbymNv338z/view?usp=sharing", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Matthew Trager", "Kathl\u00e9n Kohn", "Joan Bruna"], "_bibtex": "@inproceedings{\nTrager2020Pure,\ntitle={Pure and Spurious Critical Points: a Geometric Study of Linear Networks},\nauthor={Matthew Trager and Kathl\u00e9n Kohn and Joan Bruna},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=rkgOlCVYvB}\n}", "authorids": ["matthew.trager@cims.nyu.edu", "kathlen.korn@gmail.com", "bruna@cims.nyu.edu"], "title": "Pure and Spurious Critical Points: a Geometric Study of Linear Networks", "original_pdf": "/attachment/83eb4535cffc45ecde569b8565ac40f962a1f340.pdf", "pdf": "/pdf/f203c690f327bdbb7ad7f08538a96a545db208e5.pdf", "abstract": "The critical locus of the loss function of a neural network is determined by the geometry of the functional space and by the parameterization of this space by the network's weights. We introduce a natural distinction between pure critical points, which only depend on the functional space, and spurious critical points, which arise from the parameterization. We apply this perspective to revisit and extend the literature on the loss function of linear neural networks. For this type of network, the functional space is either the set of all linear maps from input to output space, or a determinantal variety, i.e., a set of linear maps with bounded rank. We use geometric properties of determinantal varieties to derive new results on the landscape of linear networks with different loss functions and different parameterizations. Our analysis clearly illustrates that the absence of \"bad\" local minima in the loss landscape of linear networks is due to two distinct phenomena that apply in different settings: it is true for arbitrary smooth convex losses in the case of architectures that can express all linear maps (\"filling architectures\") but it holds only for the quadratic loss when the functional space is a determinantal variety (\"non-filling architectures\"). 
Without any assumption on the architecture, smooth convex losses may lead to landscapes with many bad minima.", "full_presentation_video": ""}, "forum": "rkgOlCVYvB", "id": "rkgOlCVYvB"}, "SkgKO0EtvS": {"content": {"appendix": "", "TL;DR": "We supervise graph neural networks to imitate intermediate and step-wise outputs of classical graph algorithms, recovering highly favourable insights.", "keywords": ["graph networks", "learning to execute", "program synthesis"], "paperhash": "velikovi|neural_execution_of_graph_algorithms", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Petar Veli\u010dkovi\u0107", "Rex Ying", "Matilde Padovano", "Raia Hadsell", "Charles Blundell"], "_bibtex": "@inproceedings{\nVeli\u010dkovi\u01072020Neural,\ntitle={Neural Execution of Graph Algorithms},\nauthor={Petar Veli\u010dkovi\u0107 and Rex Ying and Matilde Padovano and Raia Hadsell and Charles Blundell},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=SkgKO0EtvS}\n}", "authorids": ["petarv@google.com", "rexying@stanford.edu", "mp861@cam.ac.uk", "raia@google.com", "cblundell@google.com"], "title": "Neural Execution of Graph Algorithms", "original_pdf": "/attachment/ce63527f3a102fa08682b8e5acbc05c1725d0118.pdf", "pdf": "/pdf/c0a6e40b48c2e2ed3ead72556fef85e539b1e58d.pdf", "abstract": "Graph Neural Networks (GNNs) are a powerful representational tool for solving problems on graph-structured inputs. In almost all cases so far, however, they have been applied to directly recovering a final solution from raw inputs, without explicit guidance on how to structure their problem-solving. Here, instead, we focus on learning in the space of algorithms: we train several state-of-the-art GNN architectures to imitate individual steps of classical graph algorithms, parallel (breadth-first search, Bellman-Ford) as well as sequential (Prim's algorithm). As graph algorithms usually rely on making discrete decisions within neighbourhoods, we hypothesise that maximisation-based message passing neural networks are best-suited for such objectives, and validate this claim empirically. 
We also demonstrate how learning in the space of algorithms can yield new opportunities for positive transfer between tasks---showing how learning a shortest-path algorithm can be substantially improved when simultaneously learning a reachability algorithm.", "full_presentation_video": ""}, "forum": "SkgKO0EtvS", "id": "SkgKO0EtvS"}, "SyevYxHtDB": {"content": {"appendix": "", "TL;DR": "We propose the first approach that can resist DNN model stealing/extraction attacks", "keywords": ["adversarial", "adversarial machine learning"], "paperhash": "orekondy|prediction_poisoning_towards_defenses_against_dnn_model_stealing_attacks", "spotlight_video": "", "poster": "", "slides": "", "authors": ["Tribhuvanesh Orekondy", "Bernt Schiele", "Mario Fritz"], "_bibtex": "@inproceedings{\nOrekondy2020Prediction,\ntitle={Prediction Poisoning: Towards Defenses Against DNN Model Stealing Attacks},\nauthor={Tribhuvanesh Orekondy and Bernt Schiele and Mario Fritz},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=SyevYxHtDB}\n}", "authorids": ["orekondy@mpi-inf.mpg.de", "schiele@mpi-inf.mpg.de", "fritz@cispa.saarland"], "title": "Prediction Poisoning: Towards Defenses Against DNN Model Stealing Attacks", "original_pdf": "/attachment/a07835a4efecf21aca7ed1df820b1456bac3a9a3.pdf", "pdf": "/pdf/54a327e15441fc15cf63a461ce8a9da3b5e51713.pdf", "abstract": "High-performance Deep Neural Networks (DNNs) are increasingly deployed in many real-world applications e.g., cloud prediction APIs. Recent advances in model functionality stealing attacks via black-box access (i.e., inputs in, predictions out) threaten the business model of such applications, which require a lot of time, money, and effort to develop. Existing defenses take a passive role against stealing attacks, such as by truncating predicted information. We find such passive defenses ineffective against DNN stealing attacks. In this paper, we propose the first defense which actively perturbs predictions targeted at poisoning the training objective of the attacker. We find our defense effective across a wide range of challenging datasets and DNN model stealing attacks, and additionally outperforms existing defenses. Our defense is the first that can withstand highly accurate model stealing attacks for tens of thousands of queries, amplifying the attacker's error rate up to a factor of 85$\\times$ with minimal impact on the utility for benign users.", "full_presentation_video": ""}, "forum": "SyevYxHtDB", "id": "SyevYxHtDB"}, "B1guLAVFDB": {"content": {"appendix": "", "TL;DR": "We provably recover the span of a deep multi-layered neural network with latent structure and empirically apply efficient span recovery algorithms to attack networks by obfuscating inputs.", "keywords": ["adversarial"], "paperhash": "jayaram|span_recovery_for_deep_neural_networks_with_applications_to_input_obfuscation", "code": "https://drive.google.com/open?id=1-vPO5g52w8oON4neivTTmrL53Lnj4bdR https://drive.google.com/open?id=1qXHG90ypdzfYt_sGqRtyQZ5pFgt3KllP", "spotlight_video": "", "authorids": ["rkjayara@cs.cmu.edu", "dwoodruf@andrew.cmu.edu", "qiuyiz@google.com"], "poster": "", "slides": "", "authors": ["Rajesh Jayaram", "David P. Woodruff", "Qiuyi Zhang"], "_bibtex": "@inproceedings{\nJayaram2020Span,\ntitle={Span Recovery for Deep Neural Networks with Applications to Input Obfuscation},\nauthor={Rajesh Jayaram and David P. 
Woodruff and Qiuyi Zhang},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=B1guLAVFDB}\n}", "original_pdf": "/attachment/6b90cce99b7d592539144e3bedeb679904bf4c5c.pdf", "title": "Span Recovery for Deep Neural Networks with Applications to Input Obfuscation", "pdf": "/pdf/aadc0e2d282ebeaa2b357377fdd72b16d2b7f9da.pdf", "abstract": "The tremendous success of deep neural networks has motivated the need to better understand the fundamental properties of these networks, but many of the theoretical results proposed have only been for shallow networks. In this paper, we study an important primitive for understanding the meaningful input space of a deep network: span recovery. For $k