@inproceedings{10432,
  abstract     = {One key element behind the recent progress of machine learning has been the ability to train machine learning models in large-scale distributed shared-memory and message-passing environments. Most of these models are trained employing variants of stochastic gradient descent (SGD) based optimization, but most methods involve some type of consistency relaxation relative to sequential SGD, to mitigate its large communication or synchronization costs at scale. In this paper, we introduce a general consistency condition covering communication-reduced and asynchronous distributed SGD implementations. Our framework, called elastic consistency, decouples the system-specific aspects of the implementation from the SGD convergence requirements, giving a general way to obtain convergence bounds for a wide variety of distributed SGD methods used in practice. Elastic consistency can be used to re-derive or improve several previous convergence bounds in message-passing and shared-memory settings, but also to analyze new models and distribution schemes. As a direct application, we propose and analyze a new synchronization-avoiding scheduling scheme for distributed SGD, and show that it can be used to efficiently train deep convolutional models for image classification.},
  author       = {Nadiradze, Giorgi and Markov, Ilia and Chatterjee, Bapi and Kungurtsev, Vyacheslav  and Alistarh, Dan-Adrian},
  booktitle    = {Proceedings of the AAAI Conference on Artificial Intelligence},
  location     = {Virtual},
  number       = {10},
  pages        = {9037--9045},
  title        = {{Elastic consistency: A practical consistency model for distributed stochastic gradient descent}},
  volume       = {35},
  year         = {2021},
}

@inproceedings{10435,
  abstract     = {Decentralized optimization is emerging as a viable alternative for scalable distributed machine learning, but also introduces new challenges in terms of synchronization costs. To this end, several communication-reduction techniques, such as non-blocking communication, quantization, and local steps, have been explored in the decentralized setting. Due to the complexity of analyzing optimization in such a relaxed setting, this line of work often assumes \emph{global} communication rounds, which require additional synchronization. In this paper, we consider decentralized optimization in the simpler, but harder to analyze, \emph{asynchronous gossip} model, in which communication occurs in discrete, randomly chosen pairings among nodes. Perhaps surprisingly, we show that a variant of SGD called \emph{SwarmSGD} still converges in this setting, even if \emph{non-blocking communication}, \emph{quantization}, and \emph{local steps} are all applied \emph{in conjunction}, and even if the node data distributions and underlying graph topology are both \emph{heterogenous}. Our analysis is based on a new connection with multi-dimensional load-balancing processes. We implement this algorithm and deploy it in a super-computing environment, showing that it can outperform previous decentralized methods in terms of end-to-end training time, and that it can even rival carefully-tuned large-batch SGD for certain tasks.},
  author       = {Nadiradze, Giorgi and Sabour, Amirmojtaba and Davies, Peter and Li, Shigang and Alistarh, Dan-Adrian},
  booktitle    = {35th Conference on Neural Information Processing Systems},
  location     = {Sydney, Australia},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{Asynchronous decentralized SGD with quantized and local updates}},
  year         = {2021},
}

@article{9541,
  abstract     = {The Massively Parallel Computation (MPC) model is an emerging model that distills core aspects of distributed and parallel computation, developed as a tool to solve combinatorial (typically graph) problems in systems of many machines with limited space. Recent work has focused on the regime in which machines have sublinear (in n, the number of nodes in the input graph) space, with randomized algorithms presented for the fundamental problems of Maximal Matching and Maximal Independent Set. However, there have been no prior corresponding deterministic algorithms. A major challenge underlying the sublinear space setting is that the local space of each machine might be too small to store all edges incident to a single node. This poses a considerable obstacle compared to classical models in which each node is assumed to know and have easy access to its incident edges. To overcome this barrier, we introduce a new graph sparsification technique that deterministically computes a low-degree subgraph, with the additional property that solving the problem on this subgraph provides significant progress towards solving the problem for the original input graph. Using this framework to derandomize the well-known algorithm of Luby [SICOMP’86], we obtain O(log Δ + log log n)-round deterministic MPC algorithms for solving the problems of Maximal Matching and Maximal Independent Set with O(nɛ) space on each machine for any constant ɛ > 0. These algorithms also run in O(log Δ) rounds in the closely related model of CONGESTED CLIQUE, improving upon the state-of-the-art bound of O(log 2Δ) rounds by Censor-Hillel et al. [DISC’17].},
  author       = {Czumaj, Artur and Davies, Peter and Parter, Merav},
  issn         = {1549-6333},
  journal      = {ACM Transactions on Algorithms},
  number       = {2},
  publisher    = {Association for Computing Machinery},
  title        = {{Graph sparsification for derandomizing massively parallel computation with low space}},
  doi          = {10.1145/3451992},
  volume       = {17},
  year         = {2021},
}

@inproceedings{9543,
  abstract     = {We consider the problem ofdistributed mean estimation (DME), in which n machines are each given a local d-dimensional vector xv∈Rd, and must cooperate to estimate the mean of their inputs μ=1n∑nv=1xv, while minimizing total communication cost. DME is a fundamental construct in distributed machine learning, and there has been considerable work on variants of this problem, especially in the context of distributed variance reduction for stochastic gradients in parallel SGD. Previous work typically assumes an upper bound on the norm of the input vectors, and achieves an error bound in terms of this norm. However, in many real applications, the input vectors are concentrated around the correct output μ, but μ itself has large norm. In such cases, previous output error bounds perform poorly. In this paper, we show that output error bounds need not depend on input norm. We provide a method of quantization which allows distributed mean estimation to be performed with solution quality dependent only on the distance between inputs, not on input norm, and show an analogous result for distributed variance reduction. The technique is based on a new connection with lattice theory. We also provide lower bounds showing that the communication to error trade-off of our algorithms is asymptotically optimal. As the lattices achieving optimal bounds under l2-norm can be computationally impractical, we also present an extension which leverages easy-to-use cubic lattices, and is loose only up to a logarithmic factor ind. We show experimentally that our method yields practical improvements for common applications, relative to prior approaches.},
  author       = {Davies, Peter and Gurunanthan, Vijaykrishna and Moshrefi, Niusha  and Ashkboos, Saleh and Alistarh, Dan-Adrian},
  booktitle    = {9th International Conference on Learning Representations},
  location     = {Virtual},
  title        = {{New bounds for distributed mean estimation and variance reduction}},
  year         = {2021},
}

@article{9571,
  abstract     = {As the size and complexity of models and datasets grow, so does the need for communication-efficient variants of stochastic gradient descent that can be deployed to perform parallel model training. One popular communication-compression method for data-parallel SGD is QSGD (Alistarh et al., 2017), which quantizes and encodes gradients to reduce communication costs. The baseline variant of QSGD provides strong theoretical guarantees, however, for practical purposes, the authors proposed a heuristic variant which we call QSGDinf, which demonstrated impressive empirical gains for distributed training of large neural networks. In this paper, we build on this work to propose a new gradient quantization scheme, and show that it has both stronger theoretical guarantees than QSGD, and matches and exceeds the empirical performance of the QSGDinf heuristic and of other compression methods.},
  author       = {Ramezani-Kebrya, Ali and Faghri, Fartash and Markov, Ilya and Aksenov, Vitalii and Alistarh, Dan-Adrian and Roy, Daniel M.},
  issn         = {15337928},
  journal      = {Journal of Machine Learning Research},
  number       = {114},
  pages        = {1−43},
  publisher    = {Journal of Machine Learning Research},
  title        = {{NUQSGD: Provably communication-efficient data-parallel SGD via nonuniform quantization}},
  volume       = {22},
  year         = {2021},
}

@inproceedings{9620,
  abstract     = {In this note, we introduce a distributed twist on the classic coupon collector problem: a set of m collectors wish to each obtain a set of n coupons; for this, they can each sample coupons uniformly at random, but can also meet in pairwise interactions, during which they can exchange coupons. By doing so, they hope to reduce the number of coupons that must be sampled by each collector in order to obtain a full set. This extension is natural when considering real-world manifestations of the coupon collector phenomenon, and has been remarked upon and studied empirically (Hayes and Hannigan 2006, Ahmad et al. 2014, Delmarcelle 2019).

We provide the first theoretical analysis for such a scenario. We find that “coupon collecting with friends” can indeed significantly reduce the number of coupons each collector must sample, and raises interesting connections to the more traditional variants of the problem. While our analysis is in most cases asymptotically tight, there are several open questions raised, regarding finer-grained analysis of both “coupon collecting with friends,” and of a long-studied variant of the original problem in which a collector requires multiple full sets of coupons.},
  author       = {Alistarh, Dan-Adrian and Davies, Peter},
  booktitle    = {Structural Information and Communication Complexity},
  isbn         = {9783030795269},
  issn         = {1611-3349},
  location     = {Wrocław, Poland},
  pages        = {3--12},
  publisher    = {Springer Nature},
  title        = {{Collecting coupons is faster with friends}},
  doi          = {10.1007/978-3-030-79527-6_1},
  volume       = {12810},
  year         = {2021},
}

@inproceedings{9678,
  abstract     = {We introduce a new graph problem, the token dropping game, and we show how to solve it efficiently in a distributed setting. We use the token dropping game as a tool to design an efficient distributed algorithm for stable orientations and more generally for locally optimal semi-matchings. The prior work by Czygrinow et al. (DISC 2012) finds a stable orientation in O(Δ^5) rounds in graphs of maximum degree Δ, while we improve it to O(Δ^4) and also prove a lower bound of Ω(Δ). For the more general problem of locally optimal semi-matchings, the prior upper bound is O(S^5) and our new algorithm runs in O(C · S^4) rounds, which is an improvement for C = o(S); here C and S are the maximum degrees of customers and servers, respectively.},
  author       = {Brandt, Sebastian and Keller, Barbara and Rybicki, Joel and Suomela, Jukka and Uitto, Jara},
  booktitle    = {Annual ACM Symposium on Parallelism in Algorithms and Architectures},
  isbn         = {9781450380706},
  location     = { Virtual Event, United States},
  pages        = {129--139},
  title        = {{Efficient load-balancing through distributed token dropping}},
  doi          = {10.1145/3409964.3461785},
  year         = {2021},
}

@inproceedings{9823,
  abstract     = {Approximate agreement is one of the few variants of consensus that can be solved in a wait-free manner in asynchronous systems where processes communicate by reading and writing to shared memory. In this work, we consider a natural generalisation of approximate agreement on arbitrary undirected connected graphs. Each process is given a vertex of the graph as input and, if non-faulty, must output a vertex such that
all the outputs are within distance 1 of one another, and

each output value lies on a shortest path between two input values.

From prior work, it is known that there is no wait-free algorithm among   𝑛≥3  processes for this problem on any cycle of length   𝑐≥4 , by reduction from 2-set agreement (Castañeda et al. 2018).

In this work, we investigate the solvability and complexity of this task on general graphs. We give a new, direct proof of the impossibility of approximate agreement on cycles of length   𝑐≥4 , via a generalisation of Sperner’s Lemma to convex polygons. We also extend the reduction from 2-set agreement to a larger class of graphs, showing that approximate agreement on these graphs is unsolvable. On the positive side, we present a wait-free algorithm for a class of graphs that properly contains the class of chordal graphs.},
  author       = {Alistarh, Dan-Adrian and Ellen, Faith and Rybicki, Joel},
  booktitle    = {Structural Information and Communication Complexity},
  isbn         = {9783030795269},
  issn         = {16113349},
  location     = {Wrocław, Poland},
  pages        = {87--105},
  publisher    = {Springer Nature},
  title        = {{Wait-free approximate agreement on graphs}},
  doi          = {10.1007/978-3-030-79527-6_6},
  volume       = {12810},
  year         = {2021},
}

@article{9827,
  abstract     = {The Nearest neighbour search (NNS) is a fundamental problem in many application domains dealing with multidimensional data. In a concurrent setting, where dynamic modifications are allowed, a linearizable implementation of the NNS is highly desirable.This paper introduces the LockFree-kD-tree (LFkD-tree ): a lock-free concurrent kD-tree, which implements an abstract data type (ADT) that provides the operations Add, Remove, Contains, and NNS. Our implementation is linearizable. The operations in the LFkD-tree use single-word read and compare-and-swap (Image 1 ) atomic primitives, which are readily supported on available multi-core processors. We experimentally evaluate the LFkD-tree using several benchmarks comprising real-world and synthetic datasets. The experiments show that the presented design is scalable and achieves significant speed-up compared to the implementations of an existing sequential kD-tree and a recently proposed multidimensional indexing structure, PH-tree.},
  author       = {Chatterjee, Bapi and Walulya, Ivan and Tsigas, Philippas},
  issn         = {0304-3975},
  journal      = {Theoretical Computer Science},
  keywords     = {Concurrent data structure, kD-tree, Nearest neighbor search, Similarity search, Lock-free, Linearizability},
  pages        = {27--48},
  publisher    = {Elsevier},
  title        = {{Concurrent linearizable nearest neighbour search in LockFree-kD-tree}},
  doi          = {10.1016/j.tcs.2021.06.041},
  volume       = {886},
  year         = {2021},
}

@inproceedings{9933,
  abstract     = {In this paper, we study the power and limitations of component-stable algorithms in the low-space model of Massively Parallel Computation (MPC). Recently Ghaffari, Kuhn and Uitto (FOCS 2019) introduced the class of component-stable low-space MPC algorithms, which are, informally, defined as algorithms for which the outputs reported by the nodes in different connected components are required to be independent. This very natural notion was introduced to capture most (if not all) of the known efficient MPC algorithms to date, and it was the first general class of MPC algorithms for which one can show non-trivial conditional lower bounds. In this paper we enhance the framework of component-stable algorithms and investigate its effect on the complexity of randomized and deterministic low-space MPC. Our key contributions include: 1) We revise and formalize the lifting approach of Ghaffari, Kuhn and Uitto. This requires a very delicate amendment of the notion of component stability, which allows us to fill in gaps in the earlier arguments. 2) We also extend the framework to obtain conditional lower bounds for deterministic algorithms and fine-grained lower bounds that depend on the maximum degree Δ. 3) We demonstrate a collection of natural graph problems for which non-component-stable algorithms break the conditional lower bound obtained for component-stable algorithms. This implies that, for both deterministic and randomized algorithms, component-stable algorithms are conditionally weaker than the non-component-stable ones.

Altogether our results imply that component-stability might limit the computational power of the low-space MPC model, paving the way for improved upper bounds that escape the conditional lower bound setting of Ghaffari, Kuhn, and Uitto.},
  author       = {Czumaj, Artur and Davies, Peter and Parter, Merav},
  booktitle    = {Proceedings of the 2021 ACM Symposium on Principles of Distributed Computing},
  isbn         = {9781450385480},
  location     = {Virtual, Italy},
  pages        = {481–491},
  publisher    = {Association for Computing Machinery},
  title        = {{Component stability in low-space massively parallel computation}},
  doi          = {10.1145/3465084.3467903},
  year         = {2021},
}

@inproceedings{9935,
  abstract     = {We present a deterministic O(log log log n)-round low-space Massively Parallel Computation (MPC) algorithm for the classical problem of (Δ+1)-coloring on n-vertex graphs. In this model, every machine has sublinear local space of size n^φ for any arbitrary constant φ \in (0,1). Our algorithm works under the relaxed setting where each machine is allowed to perform exponential local computations, while respecting the n^φ space and bandwidth limitations.

Our key technical contribution is a novel derandomization of the ingenious (Δ+1)-coloring local algorithm by Chang-Li-Pettie (STOC 2018, SIAM J. Comput. 2020). The Chang-Li-Pettie algorithm runs in T_local =poly(loglog n) rounds, which sets the state-of-the-art randomized round complexity for the problem in the local model. Our derandomization employs a combination of tools, notably pseudorandom generators (PRG) and bounded-independence hash functions.

The achieved round complexity of O(logloglog n) rounds matches the bound of log(T_local ), which currently serves an upper bound barrier for all known randomized algorithms for locally-checkable problems in this model. Furthermore, no deterministic sublogarithmic low-space MPC algorithms for the (Δ+1)-coloring problem have been known before.},
  author       = {Czumaj, Artur and Davies, Peter and Parter, Merav},
  booktitle    = {Proceedings of the 2021 ACM Symposium on Principles of Distributed Computing},
  isbn         = {978-1-4503-8548-0},
  location     = {Virtual, Italy},
  pages        = {469–479},
  publisher    = {Association for Computing Machinery},
  title        = {{Improved deterministic (Δ+1) coloring in low-space MPC}},
  doi          = {10.1145/3465084.3467937},
  year         = {2021},
}

@inproceedings{9951,
  abstract     = {There has recently been a surge of interest in the computational and complexity properties of the population model, which assumes n anonymous, computationally-bounded nodes, interacting at random, with the goal of jointly computing global predicates. Significant work has gone towards investigating majority or consensus dynamics in this model: that is, assuming that every node is initially in one of two states X or Y, determine which state had higher initial count.

In this paper, we consider a natural generalization of majority/consensus, which we call comparison : in its simplest formulation, we are given two baseline states, X and Y, present in any initial configuration in fixed, but possibly small counts. One of these states has higher count than the other: we will assume |X_0| > C |Y_0| for some constant C > 1. The challenge is to design a protocol by which nodes can quickly and reliably decide on which of the baseline states X_0 and Y_0 has higher initial count. We begin by analyzing a simple and general dynamics solving the above comparison problem, which uses O( log n ) states per node, and converges in O(log n) (parallel) time, with high probability, to a state where the whole population votes on opinions X or Y at rates proportional to the initial concentrations of |X_0| vs. |Y_0|. We then describe how this procedure can be bootstrapped to solve comparison, i.e. have every node in the population reach the "correct'' decision, with probability 1 - o(1), at the cost of O (log log n) additional states. Further, we prove that this dynamics is self-stabilizing, in the sense that it converges to the correct decision from arbitrary initial states, and leak-robust, in the sense that it can withstand spurious faulty reactions, which are known to occur in practical implementations of population protocols. Our analysis is based on a new martingale concentration result relating the discrete-time evolution of a population protocol to its expected (steady-state) analysis, which should be a useful tool when analyzing opinion dynamics and epidemic dissemination in the population model.},
  author       = {Alistarh, Dan-Adrian and Töpfer, Martin and Uznański, Przemysław},
  booktitle    = {Proceedings of the 2021 ACM Symposium on Principles of Distributed Computing},
  isbn         = {9781450385480},
  location     = {Virtual, Italy},
  pages        = {55--65},
  publisher    = {Association for Computing Machinery},
  title        = {{Comparison dynamics in population protocols}},
  doi          = {10.1145/3465084.3467915},
  year         = {2021},
}

@inproceedings{10853,
  abstract     = {Dynamic Connectivity is a fundamental algorithmic graph problem, motivated by a wide range of applications to social and communication networks and used as a building block in various other algorithms, such as the bi-connectivity and the dynamic minimal spanning tree problems. In brief, we wish to maintain the connected components of the graph under dynamic edge insertions and deletions. In the sequential case, the problem has been well-studied from both theoretical and practical perspectives. However, much less is known about efficient concurrent solutions to this problem. This is the gap we address in this paper. We start from one of the classic data structures used to solve this problem, the Euler Tour Tree. Our first contribution is a non-blocking single-writer implementation of it. We leverage this data structure to obtain the first truly concurrent generalization of dynamic connectivity, which preserves the time complexity of its sequential counterpart, but is also scalable in practice. To achieve this, we rely on three main techniques. The first is to ensure that connectivity queries, which usually dominate real-world workloads, are non-blocking. The second non-trivial technique expands the above idea by making all queries that do not change the connectivity structure non-blocking. The third ingredient is applying fine-grained locking for updating the connected components, which allows operations on disjoint components to occur in parallel. We evaluate the resulting algorithm on various workloads, executing on both real and synthetic graphs. The results show the efficiency of each of the proposed optimizations; the most efficient variant improves the performance of a coarse-grained based implementation on realistic scenarios up to 6x on average and up to 30x when connectivity queries dominate.},
  author       = {Fedorov, Alexander and Koval, Nikita and Alistarh, Dan-Adrian},
  booktitle    = {Proceedings of the 33rd ACM Symposium on Parallelism in Algorithms and Architectures},
  isbn         = {9781450380706},
  location     = {Virtual, Online},
  pages        = {208--220},
  publisher    = {Association for Computing Machinery},
  title        = {{A scalable concurrent algorithm for dynamic connectivity}},
  doi          = {10.1145/3409964.3461810},
  year         = {2021},
}

@inproceedings{10854,
  abstract     = {Consider a distributed task where the communication network is fixed but the local inputs given to the nodes of the distributed system may change over time. In this work, we explore the following question: if some of the local inputs change, can an existing solution be updated efficiently, in a dynamic and distributed manner?
To address this question, we define the batch dynamic CONGEST model in which we are given a bandwidth-limited communication network and a dynamic edge labelling defines the problem input. The task is to maintain a solution to a graph problem on the labelled graph under batch changes. We investigate, when a batch of alpha edge label changes arrive, - how much time as a function of alpha we need to update an existing solution, and - how much information the nodes have to keep in local memory between batches in order to update the solution quickly.
Our work lays the foundations for the theory of input-dynamic distributed network algorithms. We give a general picture of the complexity landscape in this model, design both universal algorithms and algorithms for concrete problems, and present a general framework for lower bounds. The diverse time complexity of our model spans from constant time, through time polynomial in alpha, and to alpha time, which we show to be enough for any task.},
  author       = {Foerster, Klaus-Tycho and Korhonen, Janne and Paz, Ami and Rybicki, Joel and Schmid, Stefan},
  booktitle    = {Abstract Proceedings of the 2021 ACM SIGMETRICS / International Conference on Measurement and Modeling of Computer Systems},
  isbn         = {9781450380720},
  location     = {Virtual, Online},
  pages        = {71--72},
  publisher    = {Association for Computing Machinery},
  title        = {{Input-dynamic distributed algorithms for communication networks}},
  doi          = {10.1145/3410220.3453923},
  year         = {2021},
}

@article{10855,
  abstract     = {Consider a distributed task where the communication network is fixed but the local inputs given to the nodes of the distributed system may change over time. In this work, we explore the following question: if some of the local inputs change, can an existing solution be updated efficiently, in a dynamic and distributed manner? To address this question, we define the batch dynamic \congest model in which we are given a bandwidth-limited communication network and a dynamic edge labelling defines the problem input. The task is to maintain a solution to a graph problem on the labeled graph under batch changes. We investigate, when a batch of α edge label changes arrive, \beginitemize \item how much time as a function of α we need to update an existing solution, and \item how much information the nodes have to keep in local memory between batches in order to update the solution quickly. \enditemize Our work lays the foundations for the theory of input-dynamic distributed network algorithms. We give a general picture of the complexity landscape in this model, design both universal algorithms and algorithms for concrete problems, and present a general framework for lower bounds. In particular, we derive non-trivial upper bounds for two selected, contrasting problems: maintaining a minimum spanning tree and detecting cliques.},
  author       = {Foerster, Klaus-Tycho and Korhonen, Janne and Paz, Ami and Rybicki, Joel and Schmid, Stefan},
  issn         = {2476-1249},
  journal      = {Proceedings of the ACM on Measurement and Analysis of Computing Systems},
  keywords     = {Computer Networks and Communications, Hardware and Architecture, Safety, Risk, Reliability and Quality, Computer Science (miscellaneous)},
  number       = {1},
  pages        = {1--33},
  publisher    = {Association for Computing Machinery},
  title        = {{Input-dynamic distributed algorithms for communication networks}},
  doi          = {10.1145/3447384},
  volume       = {5},
  year         = {2021},
}

@inproceedings{11436,
  abstract     = {Asynchronous distributed algorithms are a popular way to reduce synchronization costs in large-scale optimization, and in particular for neural network training. However, for nonsmooth and nonconvex objectives, few convergence guarantees exist beyond cases where closed-form proximal operator solutions are available. As training most popular deep neural networks corresponds to optimizing nonsmooth and nonconvex objectives, there is a pressing need for such convergence guarantees. In this paper, we analyze for the first time the convergence of stochastic asynchronous optimization for this general class of objectives. In particular, we focus on stochastic subgradient methods allowing for block variable partitioning, where the shared model is asynchronously updated by concurrent processes. To this end, we use a probabilistic model which captures key features of real asynchronous scheduling between concurrent processes. Under this model, we establish convergence with probability one to an invariant set for stochastic subgradient methods with momentum. From a practical perspective, one issue with the family of algorithms that we consider is that they are not efficiently supported by machine learning frameworks, which mostly focus on distributed data-parallel strategies. To address this, we propose a new implementation strategy for shared-memory based training of deep neural networks for a partitioned but shared model in single- and multi-GPU settings. Based on this implementation, we achieve on average1.2x speed-up in comparison to state-of-the-art training methods for popular image classification tasks, without compromising accuracy.},
  author       = {Kungurtsev, Vyacheslav and Egan, Malcolm and Chatterjee, Bapi and Alistarh, Dan-Adrian},
  booktitle    = {35th AAAI Conference on Artificial Intelligence, AAAI 2021},
  isbn         = {9781713835974},
  issn         = {2374-3468},
  location     = {Virtual, Online},
  number       = {9B},
  pages        = {8209--8216},
  publisher    = {AAAI Press},
  title        = {{Asynchronous optimization methods for efficient training of deep neural networks with guarantees}},
  volume       = {35},
  year         = {2021},
}

@inproceedings{11452,
  abstract     = {We study efficient distributed algorithms for the fundamental problem of principal component analysis and leading eigenvector computation on the sphere, when the data are randomly distributed among a set of computational nodes. We propose a new quantized variant of Riemannian gradient descent to solve this problem, and prove that the algorithm converges with high probability under a set of necessary spherical-convexity properties. We give bounds on the number of bits transmitted by the algorithm under common initialization schemes, and investigate the dependency on the problem dimension in each case.},
  author       = {Alimisis, Foivos and Davies, Peter and Vandereycken, Bart and Alistarh, Dan-Adrian},
  booktitle    = {Advances in Neural Information Processing Systems - 35th Conference on Neural Information Processing Systems},
  isbn         = {9781713845393},
  issn         = {1049-5258},
  location     = {Virtual, Online},
  pages        = {2823--2834},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{Distributed principal component analysis with limited communication}},
  volume       = {4},
  year         = {2021},
}

@inproceedings{11458,
  abstract     = {The increasing computational requirements of deep neural networks (DNNs) have led to significant interest in obtaining DNN models that are sparse, yet accurate. Recent work has investigated the even harder case of sparse training, where the DNN weights are, for as much as possible, already sparse to reduce computational costs during training. Existing sparse training methods are often empirical and can have lower accuracy relative to the dense baseline. In this paper, we present a general approach called Alternating Compressed/DeCompressed (AC/DC) training of DNNs, demonstrate convergence for a variant of the algorithm, and show that AC/DC outperforms existing sparse training methods in accuracy at similar computational budgets; at high sparsity levels, AC/DC even outperforms existing methods that rely on accurate pre-trained dense models. An important property of AC/DC is that it allows co-training of dense and sparse models, yielding accurate sparse–dense model pairs at the end of the training process. This is useful in practice, where compressed variants may be desirable for deployment in resource-constrained settings without re-doing the entire training flow, and also provides us with insights into the accuracy gap between dense and compressed models. The code is available at: https://github.com/IST-DASLab/ACDC.},
  author       = {Peste, Elena-Alexandra and Iofinova, Eugenia B and Vladu, Adrian and Alistarh, Dan-Adrian},
  booktitle    = {35th Conference on Neural Information Processing Systems},
  isbn         = {9781713845393},
  issn         = {1049-5258},
  location     = {Virtual, Online},
  pages        = {8557--8570},
  publisher    = {Curran Associates},
  title        = {{AC/DC: Alternating Compressed/DeCompressed training of deep neural networks}},
  volume       = {34},
  year         = {2021},
}

@inproceedings{11463,
  abstract     = {Efficiently approximating local curvature information of the loss function is a key tool for optimization and compression of deep neural networks. Yet, most existing methods to approximate second-order information have high computational
or storage costs, which limits their practicality. In this work, we investigate matrix-free, linear-time approaches for estimating Inverse-Hessian Vector Products (IHVPs) for the case when the Hessian can be approximated as a sum of rank-one matrices, as in the classic approximation of the Hessian by the empirical Fisher matrix. We propose two new algorithms: the first is tailored towards network compression and can compute the IHVP for dimension d, if the Hessian is given as a sum of m rank-one matrices, using O(dm2) precomputation, O(dm) cost for computing the IHVP, and query cost O(m) for any single element of the inverse Hessian. The second algorithm targets an optimization setting, where we wish to compute the product between the inverse Hessian, estimated over a sliding window of optimization steps, and a given gradient direction, as required for preconditioned SGD. We give an algorithm with cost O(dm + m2) for computing the IHVP and O(dm + m3) for adding or removing any gradient from the sliding window. These
two algorithms yield state-of-the-art results for network pruning and optimization with lower computational overhead relative to existing second-order methods. Implementations are available at [9] and [17].},
  author       = {Frantar, Elias and Kurtic, Eldar and Alistarh, Dan-Adrian},
  booktitle    = {35th Conference on Neural Information Processing Systems},
  isbn         = {9781713845393},
  issn         = {1049-5258},
  location     = {Virtual, Online},
  pages        = {14873--14886},
  publisher    = {Curran Associates},
  title        = {{M-FAC: Efficient matrix-free approximations of second-order information}},
  volume       = {34},
  year         = {2021},
}

@inproceedings{11464,
  abstract     = {We consider a standard distributed optimisation setting where N machines, each holding a d-dimensional function
fi, aim to jointly minimise the sum of the functions ∑Ni=1fi(x). This problem arises naturally in large-scale distributed optimisation, where a standard solution is to apply variants of (stochastic) gradient descent. We focus on the communication complexity of this problem: our main result provides the first fully unconditional bounds on total number of bits which need to be sent and received by the N machines to solve this problem under point-to-point communication, within a given error-tolerance. Specifically, we show that Ω(Ndlogd/Nε) total bits need to be communicated between the machines to find an additive ϵ-approximation to the minimum of ∑Ni=1fi(x). The result holds for both deterministic and randomised algorithms, and, importantly, requires no assumptions on the algorithm structure. The lower bound is tight under certain restrictions on parameter values, and is matched within constant factors for quadratic objectives by a new variant of quantised gradient descent, which we describe and analyse. Our results bring over tools from communication complexity to distributed optimisation, which has potential for further applications.},
  author       = {Alistarh, Dan-Adrian and Korhonen, Janne},
  booktitle    = {35th Conference on Neural Information Processing Systems},
  isbn         = {9781713845393},
  issn         = {1049-5258},
  location     = {Virtual, Online},
  pages        = {7254--7266},
  publisher    = {Curran Associates},
  title        = {{Towards tight communication lower bounds for distributed optimisation}},
  volume       = {34},
  year         = {2021},
}

