@inproceedings{10435,
  abstract     = {Decentralized optimization is emerging as a viable alternative for scalable distributed machine learning, but also introduces new challenges in terms of synchronization costs. To this end, several communication-reduction techniques, such as non-blocking communication, quantization, and local steps, have been explored in the decentralized setting. Due to the complexity of analyzing optimization in such a relaxed setting, this line of work often assumes \emph{global} communication rounds, which require additional synchronization. In this paper, we consider decentralized optimization in the simpler, but harder to analyze, \emph{asynchronous gossip} model, in which communication occurs in discrete, randomly chosen pairings among nodes. Perhaps surprisingly, we show that a variant of SGD called \emph{SwarmSGD} still converges in this setting, even if \emph{non-blocking communication}, \emph{quantization}, and \emph{local steps} are all applied \emph{in conjunction}, and even if the node data distributions and underlying graph topology are both \emph{heterogenous}. Our analysis is based on a new connection with multi-dimensional load-balancing processes. We implement this algorithm and deploy it in a super-computing environment, showing that it can outperform previous decentralized methods in terms of end-to-end training time, and that it can even rival carefully-tuned large-batch SGD for certain tasks.},
  author       = {Nadiradze, Giorgi and Sabour, Amirmojtaba and Davies, Peter and Li, Shigang and Alistarh, Dan-Adrian},
  booktitle    = {35th Conference on Neural Information Processing Systems},
  location     = {Sydney, Australia},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{Asynchronous decentralized SGD with quantized and local updates}},
  year         = {2021},
}

@inproceedings{11452,
  abstract     = {We study efficient distributed algorithms for the fundamental problem of principal component analysis and leading eigenvector computation on the sphere, when the data are randomly distributed among a set of computational nodes. We propose a new quantized variant of Riemannian gradient descent to solve this problem, and prove that the algorithm converges with high probability under a set of necessary spherical-convexity properties. We give bounds on the number of bits transmitted by the algorithm under common initialization schemes, and investigate the dependency on the problem dimension in each case.},
  author       = {Alimisis, Foivos and Davies, Peter and Vandereycken, Bart and Alistarh, Dan-Adrian},
  booktitle    = {Advances in Neural Information Processing Systems - 35th Conference on Neural Information Processing Systems},
  isbn         = {9781713845393},
  issn         = {1049-5258},
  location     = {Virtual, Online},
  pages        = {2823--2834},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{Distributed principal component analysis with limited communication}},
  volume       = {4},
  year         = {2021},
}

@inproceedings{13147,
  abstract     = {We investigate fast and communication-efficient algorithms for the classic problem of minimizing a sum of strongly convex and smooth functions that are distributed among n
 different nodes, which can communicate using a limited number of bits. Most previous communication-efficient approaches for this problem are limited to first-order optimization, and therefore have \emph{linear} dependence on the condition number in their communication complexity. We show that this dependence is not inherent: communication-efficient methods can in fact have sublinear dependence on the condition number. For this, we design and analyze the first communication-efficient distributed variants of preconditioned gradient descent for Generalized Linear Models, and for Newton’s method. Our results rely on a new technique for quantizing both the preconditioner and the descent direction at each step of the algorithms, while controlling their convergence rate. We also validate our findings experimentally, showing faster convergence and reduced communication relative to previous methods.},
  author       = {Alimisis, Foivos and Davies, Peter and Alistarh, Dan-Adrian},
  booktitle    = {Proceedings of the 38th International Conference on Machine Learning},
  isbn         = {9781713845065},
  issn         = {2640-3498},
  location     = {Virtual},
  pages        = {196--206},
  publisher    = {ML Research Press},
  title        = {{Communication-efficient distributed optimization with quantized preconditioners}},
  volume       = {139},
  year         = {2021},
}

@article{9541,
  abstract     = {The Massively Parallel Computation (MPC) model is an emerging model that distills core aspects of distributed and parallel computation, developed as a tool to solve combinatorial (typically graph) problems in systems of many machines with limited space. Recent work has focused on the regime in which machines have sublinear (in n, the number of nodes in the input graph) space, with randomized algorithms presented for the fundamental problems of Maximal Matching and Maximal Independent Set. However, there have been no prior corresponding deterministic algorithms. A major challenge underlying the sublinear space setting is that the local space of each machine might be too small to store all edges incident to a single node. This poses a considerable obstacle compared to classical models in which each node is assumed to know and have easy access to its incident edges. To overcome this barrier, we introduce a new graph sparsification technique that deterministically computes a low-degree subgraph, with the additional property that solving the problem on this subgraph provides significant progress towards solving the problem for the original input graph. Using this framework to derandomize the well-known algorithm of Luby [SICOMP’86], we obtain O(log Δ + log log n)-round deterministic MPC algorithms for solving the problems of Maximal Matching and Maximal Independent Set with O(nɛ) space on each machine for any constant ɛ > 0. These algorithms also run in O(log Δ) rounds in the closely related model of CONGESTED CLIQUE, improving upon the state-of-the-art bound of O(log 2Δ) rounds by Censor-Hillel et al. [DISC’17].},
  author       = {Czumaj, Artur and Davies, Peter and Parter, Merav},
  issn         = {1549-6333},
  journal      = {ACM Transactions on Algorithms},
  number       = {2},
  publisher    = {Association for Computing Machinery},
  title        = {{Graph sparsification for derandomizing massively parallel computation with low space}},
  doi          = {10.1145/3451992},
  volume       = {17},
  year         = {2021},
}

@inproceedings{9543,
  abstract     = {We consider the problem ofdistributed mean estimation (DME), in which n machines are each given a local d-dimensional vector xv∈Rd, and must cooperate to estimate the mean of their inputs μ=1n∑nv=1xv, while minimizing total communication cost. DME is a fundamental construct in distributed machine learning, and there has been considerable work on variants of this problem, especially in the context of distributed variance reduction for stochastic gradients in parallel SGD. Previous work typically assumes an upper bound on the norm of the input vectors, and achieves an error bound in terms of this norm. However, in many real applications, the input vectors are concentrated around the correct output μ, but μ itself has large norm. In such cases, previous output error bounds perform poorly. In this paper, we show that output error bounds need not depend on input norm. We provide a method of quantization which allows distributed mean estimation to be performed with solution quality dependent only on the distance between inputs, not on input norm, and show an analogous result for distributed variance reduction. The technique is based on a new connection with lattice theory. We also provide lower bounds showing that the communication to error trade-off of our algorithms is asymptotically optimal. As the lattices achieving optimal bounds under l2-norm can be computationally impractical, we also present an extension which leverages easy-to-use cubic lattices, and is loose only up to a logarithmic factor ind. We show experimentally that our method yields practical improvements for common applications, relative to prior approaches.},
  author       = {Davies, Peter and Gurunanthan, Vijaykrishna and Moshrefi, Niusha  and Ashkboos, Saleh and Alistarh, Dan-Adrian},
  booktitle    = {9th International Conference on Learning Representations},
  location     = {Virtual},
  title        = {{New bounds for distributed mean estimation and variance reduction}},
  year         = {2021},
}

@inproceedings{9620,
  abstract     = {In this note, we introduce a distributed twist on the classic coupon collector problem: a set of m collectors wish to each obtain a set of n coupons; for this, they can each sample coupons uniformly at random, but can also meet in pairwise interactions, during which they can exchange coupons. By doing so, they hope to reduce the number of coupons that must be sampled by each collector in order to obtain a full set. This extension is natural when considering real-world manifestations of the coupon collector phenomenon, and has been remarked upon and studied empirically (Hayes and Hannigan 2006, Ahmad et al. 2014, Delmarcelle 2019).

We provide the first theoretical analysis for such a scenario. We find that “coupon collecting with friends” can indeed significantly reduce the number of coupons each collector must sample, and raises interesting connections to the more traditional variants of the problem. While our analysis is in most cases asymptotically tight, there are several open questions raised, regarding finer-grained analysis of both “coupon collecting with friends,” and of a long-studied variant of the original problem in which a collector requires multiple full sets of coupons.},
  author       = {Alistarh, Dan-Adrian and Davies, Peter},
  booktitle    = {Structural Information and Communication Complexity},
  isbn         = {9783030795269},
  issn         = {1611-3349},
  location     = {Wrocław, Poland},
  pages        = {3--12},
  publisher    = {Springer Nature},
  title        = {{Collecting coupons is faster with friends}},
  doi          = {10.1007/978-3-030-79527-6_1},
  volume       = {12810},
  year         = {2021},
}

@inproceedings{9933,
  abstract     = {In this paper, we study the power and limitations of component-stable algorithms in the low-space model of Massively Parallel Computation (MPC). Recently Ghaffari, Kuhn and Uitto (FOCS 2019) introduced the class of component-stable low-space MPC algorithms, which are, informally, defined as algorithms for which the outputs reported by the nodes in different connected components are required to be independent. This very natural notion was introduced to capture most (if not all) of the known efficient MPC algorithms to date, and it was the first general class of MPC algorithms for which one can show non-trivial conditional lower bounds. In this paper we enhance the framework of component-stable algorithms and investigate its effect on the complexity of randomized and deterministic low-space MPC. Our key contributions include: 1) We revise and formalize the lifting approach of Ghaffari, Kuhn and Uitto. This requires a very delicate amendment of the notion of component stability, which allows us to fill in gaps in the earlier arguments. 2) We also extend the framework to obtain conditional lower bounds for deterministic algorithms and fine-grained lower bounds that depend on the maximum degree Δ. 3) We demonstrate a collection of natural graph problems for which non-component-stable algorithms break the conditional lower bound obtained for component-stable algorithms. This implies that, for both deterministic and randomized algorithms, component-stable algorithms are conditionally weaker than the non-component-stable ones.

Altogether our results imply that component-stability might limit the computational power of the low-space MPC model, paving the way for improved upper bounds that escape the conditional lower bound setting of Ghaffari, Kuhn, and Uitto.},
  author       = {Czumaj, Artur and Davies, Peter and Parter, Merav},
  booktitle    = {Proceedings of the 2021 ACM Symposium on Principles of Distributed Computing},
  isbn         = {9781450385480},
  location     = {Virtual, Italy},
  pages        = {481–491},
  publisher    = {Association for Computing Machinery},
  title        = {{Component stability in low-space massively parallel computation}},
  doi          = {10.1145/3465084.3467903},
  year         = {2021},
}

@inproceedings{9935,
  abstract     = {We present a deterministic O(log log log n)-round low-space Massively Parallel Computation (MPC) algorithm for the classical problem of (Δ+1)-coloring on n-vertex graphs. In this model, every machine has sublinear local space of size n^φ for any arbitrary constant φ \in (0,1). Our algorithm works under the relaxed setting where each machine is allowed to perform exponential local computations, while respecting the n^φ space and bandwidth limitations.

Our key technical contribution is a novel derandomization of the ingenious (Δ+1)-coloring local algorithm by Chang-Li-Pettie (STOC 2018, SIAM J. Comput. 2020). The Chang-Li-Pettie algorithm runs in T_local =poly(loglog n) rounds, which sets the state-of-the-art randomized round complexity for the problem in the local model. Our derandomization employs a combination of tools, notably pseudorandom generators (PRG) and bounded-independence hash functions.

The achieved round complexity of O(logloglog n) rounds matches the bound of log(T_local ), which currently serves an upper bound barrier for all known randomized algorithms for locally-checkable problems in this model. Furthermore, no deterministic sublogarithmic low-space MPC algorithms for the (Δ+1)-coloring problem have been known before.},
  author       = {Czumaj, Artur and Davies, Peter and Parter, Merav},
  booktitle    = {Proceedings of the 2021 ACM Symposium on Principles of Distributed Computing},
  isbn         = {978-1-4503-8548-0},
  location     = {Virtual, Italy},
  pages        = {469–479},
  publisher    = {Association for Computing Machinery},
  title        = {{Improved deterministic (Δ+1) coloring in low-space MPC}},
  doi          = {10.1145/3465084.3467937},
  year         = {2021},
}

@inproceedings{7802,
  abstract     = {The Massively Parallel Computation (MPC) model is an emerging model which distills core  aspects of distributed and parallel computation. It has been developed as a tool to solve (typically graph) problems in systems where the input is distributed over many machines with limited space.
	
Recent work has focused on the regime in which machines have sublinear (in $n$, the number of nodes in the input graph) space, with randomized algorithms presented for fundamental graph problems of Maximal Matching and Maximal Independent Set. However, there have been no prior corresponding deterministic algorithms.
	
	A major challenge underlying the sublinear space setting is that the local space of each machine might be too small to store all the edges incident to a single node. This poses a considerable obstacle compared to the classical models in which each node is assumed to know and have easy access to its incident edges. To overcome this barrier we introduce a new graph sparsification technique that deterministically computes a low-degree subgraph with additional desired properties. The degree of the nodes in this subgraph is small in the sense that the edges of each node can be now stored on a single machine. This low-degree subgraph also has the property that solving the problem on this subgraph provides \emph{significant} global progress, i.e., progress towards solving the problem for the original input graph.
	
Using this framework to derandomize the well-known randomized algorithm of Luby [SICOMP'86], we obtain $O(\log \Delta+\log\log n)$-round deterministic MPC algorithms for solving the fundamental problems of Maximal Matching and Maximal Independent Set with $O(n^{\epsilon})$ space on each machine for any constant $\epsilon > 0$. Based on the recent work of Ghaffari et al. [FOCS'18], this additive $O(\log\log n)$ factor is conditionally essential. These algorithms can also be shown to run in $O(\log \Delta)$ rounds in the closely related model of CONGESTED CLIQUE, improving upon the state-of-the-art bound of $O(\log^2 \Delta)$ rounds by Censor-Hillel et al. [DISC'17].},
  author       = {Czumaj, Artur and Davies, Peter and Parter, Merav},
  booktitle    = {Proceedings of the 32nd ACM Symposium on Parallelism in Algorithms and Architectures (SPAA 2020)},
  location     = {Virtual Event, United States},
  number       = {7},
  pages        = {175--185},
  publisher    = {Association for Computing Machinery},
  title        = {{Graph sparsification for derandomizing massively parallel computation with low space}},
  doi          = {10.1145/3350755.3400282},
  year         = {2020},
}

@inproceedings{7803,
  abstract     = {We settle the complexity of the (Δ+1)-coloring and (Δ+1)-list coloring problems in the CONGESTED CLIQUE model by presenting a simple deterministic algorithm for both problems running in a constant number of rounds. This matches the complexity of the recent breakthrough randomized constant-round (Δ+1)-list coloring algorithm due to Chang et al. (PODC'19), and significantly improves upon the state-of-the-art O(logΔ)-round deterministic (Δ+1)-coloring bound of Parter (ICALP'18).
A remarkable property of our algorithm is its simplicity. Whereas the state-of-the-art randomized algorithms for this problem are based on the quite involved local coloring algorithm of Chang et al. (STOC'18), our algorithm can be described in just a few lines. At a high level, it applies a careful derandomization of a recursive procedure which partitions the nodes and their respective palettes into separate bins. We show that after O(1) recursion steps, the remaining uncolored subgraph within each bin has linear size, and thus can be solved locally by collecting it to a single node. This algorithm can also be implemented in the Massively Parallel Computation (MPC) model provided that each machine has linear (in n, the number of nodes in the input graph) space.
We also show an extension of our algorithm to the MPC regime in which machines have sublinear space: we present the first deterministic (Δ+1)-list coloring algorithm designed for sublinear-space MPC, which runs in O(logΔ+loglogn) rounds.},
  author       = {Czumaj, Artur and Davies, Peter and Parter, Merav},
  booktitle    = {Proceedings of the 2020 ACM Symposium on Principles of Distributed Computing},
  location     = {Salerno, Italy},
  pages        = {309--318},
  publisher    = {Association for Computing Machinery},
  title        = {{Simple, deterministic, constant-round coloring in the congested clique}},
  doi          = {10.1145/3382734.3405751},
  year         = {2020},
}

