@inproceedings{9631,
  abstract     = {The ability to leverage large-scale hardware parallelism has been one of the key enablers of the accelerated recent progress in machine learning. Consequently, there has been considerable effort invested into developing efficient parallel variants of classic machine learning algorithms. However, despite the wealth of knowledge on parallelization, some classic machine learning algorithms often prove hard to parallelize efficiently while maintaining convergence. In this paper, we focus on efficient parallel algorithms for the key machine learning task of inference on graphical models, in particular on the fundamental belief propagation algorithm. We address the challenge of efficiently parallelizing this classic paradigm by showing how to leverage scalable relaxed schedulers in this context. We present an extensive empirical study, showing that our approach outperforms previous parallel belief propagation implementations both in terms of scalability and in terms of wall-clock convergence time, on a range of practical applications.},
  author       = {Aksenov, Vitaly and Alistarh, Dan-Adrian and Korhonen, Janne},
  booktitle    = {Advances in Neural Information Processing Systems},
  isbn         = {9781713829546},
  issn         = {10495258},
  location     = {Vancouver, Canada},
  pages        = {22361--22372},
  publisher    = {Curran Associates},
  title        = {{Scalable belief propagation via relaxed scheduling}},
  volume       = {33},
  year         = {2020},
}
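
To make the scheduling idea concrete: the relaxed schedulers the paper leverages can be pictured as a MultiQueue, a collection of heaps where insertions go to a random heap and delete-max probes two random heaps and takes the better of their tops, so the returned task is only approximately the highest-priority one. A minimal sequential Python sketch of this relaxation (my own illustration, not the authors' concurrent implementation; in residual belief propagation the tasks would be message updates prioritised by their residuals):

import heapq
import itertools
import random

class MultiQueue:
    # Toy relaxed priority queue. Python heaps are min-heaps, so
    # priorities are stored negated; the counter breaks ties without
    # comparing tasks.
    def __init__(self, nqueues=8):
        self.queues = [[] for _ in range(nqueues)]
        self.tiebreak = itertools.count()

    def insert(self, priority, task):
        heapq.heappush(random.choice(self.queues),
                       (-priority, next(self.tiebreak), task))

    def delete_max(self):
        nonempty = [q for q in self.queues if q]
        if not nonempty:
            return None
        a, b = random.choice(nonempty), random.choice(nonempty)
        best = a if a[0][0] <= b[0][0] else b  # smaller negated key = higher priority
        neg_priority, _, task = heapq.heappop(best)
        return -neg_priority, task

The appeal, per the abstract, is that belief propagation tolerates this slack: popping a near-top message update instead of the exact global maximum removes the central scheduling bottleneck without derailing convergence.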

@inproceedings{9632,
  abstract     = {Second-order information, in the form of Hessian- or Inverse-Hessian-vector products, is a fundamental tool for solving optimization problems. Recently, there has been significant interest in utilizing this information in the context of deep neural networks; however, relatively little is known about the quality of existing approximations in this context. Our work examines this question, identifies issues with existing approaches, and proposes a method called WoodFisher to compute a faithful and efficient estimate of the inverse Hessian. Our main application is to neural network compression, where we build on the classic Optimal Brain Damage/Surgeon framework. We demonstrate that WoodFisher significantly outperforms popular state-of-the-art methods for one-shot pruning. Further, even when iterative, gradual pruning is allowed, our method results in a gain in test accuracy over the state-of-the-art approaches, for standard image classification datasets such as ImageNet ILSVRC. We examine how our method can be extended to take into account first-order information, as well as illustrate its ability to automatically set layer-wise pruning thresholds and perform compression in the limited-data regime. The code is available at the following link, https://github.com/IST-DASLab/WoodFisher.},
  author       = {Singh, Sidak Pal and Alistarh, Dan-Adrian},
  booktitle    = {Advances in Neural Information Processing Systems},
  isbn         = {9781713829546},
  issn         = {10495258},
  location     = {Vancouver, Canada},
  pages        = {18098--18109},
  publisher    = {Curran Associates},
  title        = {{WoodFisher: Efficient second-order approximation for neural network compression}},
  volume       = {33},
  year         = {2020},
}
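
As a pointer to what the method computes: WoodFisher estimates the inverse of the damped empirical Fisher, F = damp*I + (1/N) * sum_n g_n g_n^T, by applying the Sherman-Morrison identity once per per-example gradient g_n. A dense NumPy sketch under that reading (illustrative only; at full network scale a structured, e.g. block-wise, approximation is needed, and the linked repository is the authoritative implementation):

import numpy as np

def woodfisher_inverse(grads, damp=1e-5):
    # grads: (N, d) array of per-example gradients; returns the (d, d)
    # inverse of the damped empirical Fisher, built by N rank-one updates.
    N, d = grads.shape
    Finv = np.eye(d) / damp  # inverse of the damping term alone
    for g in grads:
        Fg = Finv.dot(g)
        Finv -= np.outer(Fg, Fg) / (N + g.dot(Fg))  # Sherman-Morrison step
    return Finv

In the Optimal Brain Damage/Surgeon framework the abstract builds on, the saliency of weight q is then w[q]**2 / (2 * Finv[q, q]), and the surviving weights are corrected using the corresponding column of Finv.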

@inproceedings{8129,
  abstract     = {Cortical circuits exhibit intricate recurrent architectures that are remarkably similar across different brain areas. Such stereotyped structure suggests the existence of common computational principles. However, such principles have remained largely elusive. Inspired by gated-memory networks, namely long short-term memory networks (LSTMs), we introduce a recurrent neural network in which information is gated through inhibitory cells that are subtractive (subLSTM). We propose a natural mapping of subLSTMs onto known canonical excitatory-inhibitory cortical microcircuits. Our empirical evaluation across sequential image classification and language modelling tasks shows that subLSTM units can achieve similar performance to LSTM units. These results suggest that cortical circuits can be optimised to solve complex contextual problems and propose a novel view on their computational function. Overall, our work provides a step towards unifying recurrent networks as used in machine learning with their biological counterparts.},
  author       = {Costa, Rui Ponte and Assael, Yannis M. and Shillingford, Brendan and de Freitas, Nando and Vogels, Tim P.},
  booktitle    = {Advances in Neural Information Processing Systems},
  issn         = {10495258},
  location     = {Long Beach, CA, United States},
  pages        = {272--283},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{Cortical microcircuits as gated-recurrent neural networks}},
  volume       = {30},
  year         = {2017},
}
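
To make the gating distinction concrete: where an LSTM multiplies the cell input and output by gate activations, a subLSTM subtracts them, which is the kind of operation an inhibitory cell can implement. A NumPy sketch of one cell step, with the equations following my reading of the paper (forget gate kept multiplicative; the shapes of W, R, b are assumptions of this sketch):

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def sublstm_step(x, h, c, W, R, b):
    # One step of a subtractively gated recurrent cell.
    # W: (4*dh, dx), R: (4*dh, dh), b: (4*dh,); the pre-activation is split
    # into cell input z, input gate i, forget gate f and output gate o.
    z, i, f, o = np.split(sigmoid(W.dot(x) + R.dot(h) + b), 4)
    c_new = f * c + z - i       # subtractive (inhibitory) input gating
    h_new = sigmoid(c_new) - o  # subtractive (inhibitory) output gating
    return h_new, c_new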

@inproceedings{431,
  abstract     = {Parallel implementations of stochastic gradient descent (SGD) have received significant research attention, thanks to its excellent scalability properties. A fundamental barrier when parallelizing SGD is the high bandwidth cost of communicating gradient updates between nodes; consequently, several lossy compression heuristics have been proposed, by which nodes only communicate quantized gradients. Although effective in practice, these heuristics do not always converge. In this paper, we propose Quantized SGD (QSGD), a family of compression schemes with convergence guarantees and good practical performance. QSGD allows the user to smoothly trade off communication bandwidth and convergence time: nodes can adjust the number of bits sent per iteration, at the cost of possibly higher variance. We show that this trade-off is inherent, in the sense that improving it past some threshold would violate information-theoretic lower bounds. QSGD guarantees convergence for convex and non-convex objectives, under asynchrony, and can be extended to stochastic variance-reduced techniques. When applied to training deep neural networks for image classification and automated speech recognition, QSGD leads to significant reductions in end-to-end training time. For instance, on 16 GPUs, we can train the ResNet-152 network to full accuracy on ImageNet 1.8× faster than the full-precision variant.},
  author       = {Alistarh, Dan-Adrian and Grubic, Demjan and Li, Jerry and Tomioka, Ryota and Vojnović, Milan},
  booktitle    = {Advances in Neural Information Processing Systems},
  issn         = {10495258},
  location     = {Long Beach, CA, United States},
  pages        = {1710--1721},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{QSGD: Communication-efficient SGD via gradient quantization and encoding}},
  volume       = {30},
  year         = {2017},
}
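
The quantization at the core of QSGD fits in a few lines: normalise by the gradient's Euclidean norm, stochastically round each coordinate to one of s levels so that the estimate stays unbiased, and transmit the norm, signs and integer levels. A NumPy sketch of just this step (the full scheme additionally entropy-codes the integer levels, which is where much of the bandwidth saving comes from; that part is omitted here):

import numpy as np

def qsgd_quantize(v, s=4, rng=None):
    # Unbiased stochastic quantization: v_i is encoded as
    # ||v|| * sign(v_i) * level_i / s, where level_i rounds |v_i|/||v|| * s
    # up or down at random so that E[Q(v)] = v.
    rng = rng or np.random.default_rng()
    norm = np.linalg.norm(v)
    if norm == 0.0:
        return np.zeros_like(v)
    scaled = np.abs(v) / norm * s
    lower = np.floor(scaled)
    level = lower + (rng.random(v.shape) < scaled - lower)
    return norm * np.sign(v) * level / s

Smaller s means fewer bits per coordinate but higher variance, which is exactly the bandwidth/convergence trade-off the abstract describes.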

