@article{14364,
  abstract     = {We introduce extension-based proofs, a class of impossibility proofs that includes valency arguments. They are modelled as an interaction between a prover and a protocol. Using proofs based on combinatorial topology, it has been shown that it is impossible to deterministically solve k-set agreement among n > k ≥ 2 processes or approximate agreement on a cycle of length 4 among n ≥ 3 processes in a wait-free manner in asynchronous models where processes communicate using objects that can be constructed from shared registers. However, it was unknown whether proofs based on simpler techniques were possible. We show that these impossibility results cannot be obtained by extension-based proofs in the iterated snapshot model and, hence, extension-based proofs are limited in power.},
  author       = {Alistarh, Dan-Adrian and Aspnes, James and Ellen, Faith and Gelashvili, Rati and Zhu, Leqi},
  issn         = {1095-7111},
  journal      = {SIAM Journal on Computing},
  number       = {4},
  pages        = {913--944},
  publisher    = {Society for Industrial and Applied Mathematics},
  title        = {{Why extension-based proofs fail}},
  doi          = {10.1137/20M1375851},
  volume       = {52},
  year         = {2023},
}

@inproceedings{14458,
  abstract     = {We show for the first time that large-scale generative pretrained transformer (GPT) family models can be pruned to at least 50% sparsity in one-shot, without any retraining, at minimal loss of accuracy. This is achieved via a new pruning method called SparseGPT, specifically designed to work efficiently and accurately on massive GPT-family models. We can execute SparseGPT on the largest available open-source models, OPT-175B and BLOOM-176B, in under 4.5 hours, and can reach 60% unstructured sparsity with negligible increase in perplexity: remarkably, more than 100 billion weights from these models can be ignored at inference time. SparseGPT generalizes to semi-structured (2:4 and 4:8) patterns, and is compatible with weight quantization approaches. The code is available at: https://github.com/IST-DASLab/sparsegpt.},
  author       = {Frantar, Elias and Alistarh, Dan-Adrian},
  booktitle    = {Proceedings of the 40th International Conference on Machine Learning},
  issn         = {2640-3498},
  location     = {Honolulu, HI, United States},
  pages        = {10323--10337},
  publisher    = {ML Research Press},
  title        = {{SparseGPT: Massive language models can be accurately pruned in one-shot}},
  volume       = {202},
  year         = {2023},
}

@inproceedings{14460,
  abstract     = {We provide an efficient implementation of the backpropagation algorithm, specialized to the case where the weights of the neural network being trained are sparse. Our algorithm is general, as it applies to arbitrary (unstructured) sparsity and common layer types (e.g., convolutional or linear). We provide a fast vectorized implementation on commodity CPUs, and show that it can yield speedups in end-to-end runtime experiments, both in transfer learning using already-sparsified networks, and in training sparse networks from scratch. Thus, our results provide the first support for sparse training on commodity hardware.},
  author       = {Nikdan, Mahdi and Pegolotti, Tommaso and Iofinova, Eugenia B and Kurtic, Eldar and Alistarh, Dan-Adrian},
  booktitle    = {Proceedings of the 40th International Conference on Machine Learning},
  issn         = {2640-3498},
  location     = {Honolulu, HI, United States},
  pages        = {26215--26227},
  publisher    = {ML Research Press},
  title        = {{SparseProp: Efficient sparse backpropagation for faster training of neural networks at the edge}},
  volume       = {202},
  year         = {2023},
}

@inproceedings{14461,
  abstract     = {Communication-reduction techniques are a popular way to improve scalability in data-parallel training of deep neural networks (DNNs). The recent emergence of large language models such as GPT has created the need for new approaches to exploit data-parallelism. Among these, fully-sharded data parallel (FSDP) training is highly popular, yet it still encounters scalability bottlenecks. One reason is that applying compression techniques to FSDP is challenging: as the vast majority of the communication involves the model’s weights, direct compression alters convergence and leads to accuracy loss. We present QSDP, a variant of FSDP which supports both gradient and weight quantization with theoretical guarantees, is simple to implement and has essentially no overheads. To derive QSDP we prove that a natural modification of SGD achieves convergence even when we only maintain quantized weights, and thus the domain over which we train consists of quantized points and is, therefore, highly non-convex. We validate this approach by training GPT-family models with up to 1.3 billion parameters on a multi-node cluster. Experiments show that QSDP preserves model accuracy, while completely removing the communication bottlenecks of FSDP, providing end-to-end speedups of up to 2.2x.},
  author       = {Markov, Ilia and Vladu, Adrian and Guo, Qi and Alistarh, Dan-Adrian},
  booktitle    = {Proceedings of the 40th International Conference on Machine Learning},
  issn         = {2640-3498},
  location     = {Honolulu, HI, United States},
  pages        = {24020--24044},
  publisher    = {ML Research Press},
  title        = {{Quantized distributed training of large models with convergence guarantees}},
  volume       = {202},
  year         = {2023},
}

@inproceedings{14771,
  abstract     = {Pruning—that is, setting a significant subset of the parameters of a neural network to zero—is one of the most popular methods of model compression. Yet, several recent works have raised the issue that pruning may induce or exacerbate bias in the output of the compressed model. Despite existing evidence for this phenomenon, the relationship between neural network pruning and induced bias is not well-understood. In this work, we systematically investigate and characterize this phenomenon in Convolutional Neural Networks for computer vision. First, we show that it is in fact possible to obtain highly-sparse models, e.g. with less than 10% remaining weights, which do not decrease in accuracy nor substantially increase in bias when compared to dense models. At the same time, we also find that, at higher sparsities, pruned models exhibit higher uncertainty in their outputs, as well as increased correlations, which we directly link to increased bias. We propose easy-to-use criteria which, based only on the uncompressed model, establish whether bias will increase with pruning, and identify the samples most susceptible to biased predictions post-compression. Our code can be found at https://github.com/IST-DASLab/pruned-vision-model-bias.},
  author       = {Iofinova, Eugenia B and Peste, Elena-Alexandra and Alistarh, Dan-Adrian},
  booktitle    = {2023 IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  issn         = {2575-7075},
  location     = {Vancouver, BC, Canada},
  pages        = {24364--24373},
  publisher    = {IEEE},
  title        = {{Bias in pruned vision models: In-depth analysis and countermeasures}},
  doi          = {10.1109/cvpr52729.2023.02334},
  year         = {2023},
}

@inproceedings{13053,
  abstract     = {Deep neural networks (DNNs) often have to be compressed, via pruning and/or quantization, before they can be deployed in practical settings. In this work we propose a new compression-aware minimizer dubbed CrAM that modifies the optimization step in a principled way, in order to produce models whose local loss behavior is stable under compression operations such as pruning. Thus, dense models trained via CrAM should be compressible post-training, in a single step, without significant accuracy loss. Experimental results on standard benchmarks, such as residual networks for ImageNet classification and BERT models for language modelling, show that CrAM produces dense models that can be more accurate than the standard SGD/Adam-based baselines, but which are stable under weight pruning: specifically, we can prune models in one-shot to 70-80% sparsity with almost no accuracy loss, and to 90% with reasonable (∼1%) accuracy loss, which is competitive with gradual compression methods. Additionally, CrAM can produce sparse models which perform well for transfer learning, and it also works for semi-structured 2:4 pruning patterns supported by GPU hardware. The code for reproducing the results is available at: https://github.com/IST-DASLab/CrAM.},
  author       = {Peste, Elena-Alexandra and Vladu, Adrian and Kurtic, Eldar and Lampert, Christoph and Alistarh, Dan-Adrian},
  booktitle    = {11th International Conference on Learning Representations},
  location     = {Kigali, Rwanda},
  title        = {{CrAM: A Compression-Aware Minimizer}},
  year         = {2023},
}

@phdthesis{13074,
  abstract     = {Deep learning has become an integral part of a large number of important applications, and many of the recent breakthroughs have been enabled by the ability to train very large models, capable to capture complex patterns and relationships from the data. At the same time, the massive sizes of modern deep learning models have made their deployment to smaller devices more challenging; this is particularly important, as in many applications the users rely on accurate deep learning predictions, but they only have access to devices with limited memory and compute power. One solution to this problem is to prune neural networks, by setting as many of their parameters as possible to zero, to obtain accurate sparse models with lower memory footprint. Despite the great research progress in obtaining sparse models that preserve accuracy, while satisfying memory and computational constraints, there are still many challenges associated with efficiently training sparse models, as well as understanding their generalization properties.

The focus of this thesis is to investigate how the training process of sparse models can be made more efficient, and to understand the differences between sparse and dense models in terms of how well they can generalize to changes in the data distribution. We first study a method for co-training sparse and dense models, at a lower cost compared to regular training. With our method we can obtain very accurate sparse networks, and dense models that can recover the baseline accuracy. Furthermore, we are able to more easily analyze the differences, at prediction level, between the sparse-dense model pairs. Next, we investigate the generalization properties of sparse neural networks in more detail, by studying how well different sparse models trained on a larger task can adapt to smaller, more specialized tasks, in a transfer learning scenario. Our analysis across multiple pruning methods and sparsity levels reveals that sparse models provide features that can transfer similarly to or better than the dense baseline. However, the choice of the pruning method plays an important role, and can influence the results when the features are fixed (linear finetuning), or when they are allowed to adapt to the new task (full finetuning). Using sparse models with fixed masks for finetuning on new tasks has an important practical advantage, as it enables training neural networks on smaller devices. However, one drawback of current pruning methods is that the entire training cycle has to be repeated to obtain the initial sparse model, for every sparsity target; in consequence, the entire training process is costly and also multiple models need to be stored. In the last part of the thesis we propose a method that can train accurate dense models that are compressible in a single step, to multiple sparsity levels, without additional finetuning. 
Our method results in sparse models that can be competitive with existing pruning methods, and which can also successfully generalize to new tasks.},
  author       = {Peste, Elena-Alexandra},
  issn         = {2663-337X},
  pages        = {147},
  publisher    = {Institute of Science and Technology Austria},
  title        = {{Efficiency and generalization of sparse neural networks}},
  doi          = {10.15479/at:ista:13074},
  year         = {2023},
}

@article{12566,
  abstract     = {Approximate agreement is one of the few variants of consensus that can be solved in a wait-free manner in asynchronous systems where processes communicate by reading and writing to shared memory. In this work, we consider a natural generalisation of approximate agreement on arbitrary undirected connected graphs. Each process is given a node of the graph as input and, if non-faulty, must output a node such that
– all the outputs are within distance 1 of one another, and
– each output value lies on a shortest path between two input values.
From prior work, it is known that there is no wait-free algorithm among n ≥ 3 processes for this problem on any cycle of length k ≥ 4, by reduction from 2-set agreement (Castañeda et al., 2018).

In this work, we investigate the solvability of this task on general graphs. We give a new, direct proof of the impossibility of approximate agreement on cycles of length k ≥ 4, via a generalisation of Sperner's Lemma to convex polygons. We also extend the reduction from 2-set agreement to a larger class of graphs, showing that approximate agreement on these graphs is unsolvable. On the positive side, we present a wait-free algorithm for a different class of graphs, which properly contains the class of chordal graphs.},
  author       = {Alistarh, Dan-Adrian and Ellen, Faith and Rybicki, Joel},
  issn         = {0304-3975},
  journal      = {Theoretical Computer Science},
  number       = {2},
  publisher    = {Elsevier},
  title        = {{Wait-free approximate agreement on graphs}},
  doi          = {10.1016/j.tcs.2023.113733},
  volume       = {948},
  year         = {2023},
}

@inproceedings{11180,
  abstract     = {Designing and implementing efficient parallel priority schedulers is an active research area. An intriguing proposed design is the Multi-Queue: given n threads and m ≥ n distinct priority queues, task insertions are performed uniformly at random, while, to delete, a thread picks two queues uniformly at random, and removes the observed task of higher priority. This approach scales well, and has probabilistic rank guarantees: roughly, the rank of each task removed, relative to remaining tasks in all other queues, is O(m) in expectation. Yet, the performance of this pattern is below that of well-engineered schedulers, which eschew theoretical guarantees for practical efficiency.

We investigate whether it is possible to design and implement a Multi-Queue-based task scheduler that is both highly-efficient and has analytical guarantees. We propose a new variant called the Stealing Multi-Queue (SMQ), a cache-efficient variant of the Multi-Queue, which leverages both queue affinity---each thread has a local queue, from which tasks are usually removed; but, with some probability, threads also attempt to steal higher-priority tasks from the other queues---and task batching, that is, the processing of several tasks in a single insert / remove step. These ideas are well-known for task scheduling without priorities; our theoretical contribution is showing that, despite relaxations, this design can still provide rank guarantees, which in turn implies bounds on total work performed. We provide a general SMQ implementation which can surpass state-of-the-art schedulers such as OBIM and PMOD in terms of performance on popular graph-processing benchmarks. Notably, the performance improvement comes mainly from the superior rank guarantees provided by our scheduler, confirming that analytically-reasoned approaches can still provide performance improvements for priority task scheduling.},
  author       = {Postnikova, Anastasiia and Koval, Nikita and Nadiradze, Giorgi and Alistarh, Dan-Adrian},
  booktitle    = {Proceedings of the 27th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming},
  isbn         = {9781450392044},
  location     = {Seoul, Republic of Korea},
  pages        = {353--367},
  publisher    = {Association for Computing Machinery},
  title        = {{Multi-queues can be state-of-the-art priority schedulers}},
  doi          = {10.1145/3503221.3508432},
  year         = {2022},
}

@inproceedings{11183,
  abstract     = {Subgraph detection has recently been one of the most studied problems in the CONGEST model of distributed computing. In this work, we study the distributed complexity of problems closely related to subgraph detection, mainly focusing on induced subgraph detection. The main line of this work presents lower bounds and parameterized algorithms w.r.t structural parameters of the input graph:
- On general graphs, we give unconditional lower bounds for induced detection of cycles and patterns of treewidth 2 in CONGEST. Moreover, by adapting reductions from centralized parameterized complexity, we prove lower bounds in CONGEST for detecting patterns with a 4-clique, and for induced path detection conditional on the hardness of triangle detection in the congested clique.
- On graphs of bounded degeneracy, we show that induced paths can be detected fast in CONGEST using techniques from parameterized algorithms, while detecting cycles and patterns of treewidth 2 is hard.
- On graphs of bounded vertex cover number, we show that induced subgraph detection is easy in CONGEST for any pattern graph. More specifically, we adapt a centralized parameterized algorithm for a more general maximum common induced subgraph detection problem to the distributed setting. In addition to these induced subgraph detection results, we study various related problems in the CONGEST and congested clique models, including for multicolored versions of subgraph-detection-like problems.},
  author       = {Nikabadi, Amir and Korhonen, Janne},
  booktitle    = {25th International Conference on Principles of Distributed Systems},
  editor       = {Bramas, Quentin and Gramoli, Vincent and Milani, Alessia},
  isbn         = {9783959772198},
  issn         = {1868-8969},
  location     = {Strasbourg, France},
  publisher    = {Schloss Dagstuhl - Leibniz-Zentrum für Informatik},
  title        = {{Beyond distributed subgraph detection: Induced subgraphs, multicolored problems and graph parameters}},
  doi          = {10.4230/LIPIcs.OPODIS.2021.15},
  volume       = {217},
  year         = {2022},
}

@inproceedings{11184,
  abstract     = {Let G be a graph on n nodes. In the stochastic population protocol model, a collection of n indistinguishable, resource-limited nodes collectively solve tasks via pairwise interactions. In each interaction, two randomly chosen neighbors first read each other’s states, and then update their local states. A rich line of research has established tight upper and lower bounds on the complexity of fundamental tasks, such as majority and leader election, in this model, when G is a clique. Specifically, in the clique, these tasks can be solved fast, i.e., in n polylog n pairwise interactions, with high probability, using at most polylog n states per node.
In this work, we consider the more general setting where G is an arbitrary regular graph, and present a technique for simulating protocols designed for fully-connected networks in any connected regular graph. Our main result is a simulation that is efficient on many interesting graph families: roughly, the simulation overhead is polylogarithmic in the number of nodes, and quadratic in the conductance of the graph. As a sample application, we show that, in any regular graph with conductance φ, both leader election and exact majority can be solved in φ^{-2} ⋅ n polylog n pairwise interactions, with high probability, using at most φ^{-2} ⋅ polylog n states per node. This shows that there are fast and space-efficient population protocols for leader election and exact majority on graphs with good expansion properties. We believe our results will prove generally useful, as they allow efficient technology transfer between the well-mixed (clique) case, and the under-explored spatial setting.},
  author       = {Alistarh, Dan-Adrian and Gelashvili, Rati and Rybicki, Joel},
  booktitle    = {25th International Conference on Principles of Distributed Systems},
  editor       = {Bramas, Quentin and Gramoli, Vincent and Milani, Alessia},
  isbn         = {9783959772198},
  issn         = {1868-8969},
  location     = {Strasbourg, France},
  publisher    = {Schloss Dagstuhl - Leibniz-Zentrum für Informatik},
  title        = {{Fast graphical population protocols}},
  doi          = {10.4230/LIPIcs.OPODIS.2021.14},
  volume       = {217},
  year         = {2022},
}

@inproceedings{11844,
  abstract     = {In the stochastic population protocol model, we are given a connected graph with n nodes, and in every time step, a scheduler samples an edge of the graph uniformly at random and the nodes connected by this edge interact. A fundamental task in this model is stable leader election, in which all nodes start in an identical state and the aim is to reach a configuration in which (1) exactly one node is elected as leader and (2) this node remains as the unique leader no matter what sequence of interactions follows. On cliques, the complexity of this problem has recently been settled: time-optimal protocols stabilize in Θ(n log n) expected steps using Θ(log log n) states, whereas protocols that use O(1) states require Θ(n²) expected steps.

In this work, we investigate the complexity of stable leader election on general graphs. We provide the first non-trivial time lower bounds for leader election on general graphs, showing that, when moving beyond cliques, the complexity landscape of leader election becomes very diverse: the time required to elect a leader can range from O(1) to Θ(n³) expected steps. On the upper bound side, we first observe that there exists a protocol that is time-optimal on many graph families, but uses polynomially-many states. In contrast, we give a near-time-optimal protocol that uses only O(log² n) states that is at most a factor log n slower. Finally, we show that the constant-state protocol of Beauquier et al. [OPODIS 2013] is at most a factor n log n slower than the fast polynomial-state protocol. Moreover, among constant-state protocols, this protocol has near-optimal average case complexity on dense random graphs.},
  author       = {Alistarh, Dan-Adrian and Rybicki, Joel and Voitovych, Sasha},
  booktitle    = {Proceedings of the Annual ACM Symposium on Principles of Distributed Computing},
  isbn         = {9781450392624},
  location     = {Salerno, Italy},
  pages        = {246--256},
  publisher    = {Association for Computing Machinery},
  title        = {{Near-optimal leader election in population protocols on graphs}},
  doi          = {10.1145/3519270.3538435},
  year         = {2022},
}

@inproceedings{12299,
  abstract     = {Transfer learning is a classic paradigm by which models pretrained on large “upstream” datasets are adapted to yield good results on “downstream” specialized datasets. Generally, more accurate models on the “upstream” dataset tend to provide better transfer accuracy “downstream”. In this work, we perform an in-depth investigation of this phenomenon in the context of convolutional neural networks (CNNs) trained on the ImageNet dataset, which have been pruned-that is, compressed by sparsifiying their connections. We consider transfer using unstructured pruned models obtained by applying several state-of-the-art pruning methods, including magnitude-based, second-order, regrowth, lottery-ticket, and regularization approaches, in the context of twelve standard transfer tasks. In a nutshell, our study shows that sparse models can match or even outperform the transfer performance of dense models, even at high sparsities, and, while doing so, can lead to significant inference and even training speedups. At the same time, we observe and analyze significant differences in the behaviour of different pruning methods. The code is available at: https://github.com/IST-DASLab/sparse-imagenet-transfer.},
  author       = {Iofinova, Eugenia B and Peste, Elena-Alexandra and Kurtz, Mark and Alistarh, Dan-Adrian},
  booktitle    = {2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  issn         = {2575-7075},
  location     = {New Orleans, LA, United States},
  pages        = {12256--12266},
  publisher    = {IEEE},
  title        = {{How well do sparse ImageNet models transfer?}},
  doi          = {10.1109/cvpr52688.2022.01195},
  year         = {2022},
}

@inproceedings{10854,
  abstract     = {Consider a distributed task where the communication network is fixed but the local inputs given to the nodes of the distributed system may change over time. In this work, we explore the following question: if some of the local inputs change, can an existing solution be updated efficiently, in a dynamic and distributed manner?
To address this question, we define the batch dynamic CONGEST model in which we are given a bandwidth-limited communication network and a dynamic edge labelling defines the problem input. The task is to maintain a solution to a graph problem on the labelled graph under batch changes. We investigate, when a batch of alpha edge label changes arrive, - how much time as a function of alpha we need to update an existing solution, and - how much information the nodes have to keep in local memory between batches in order to update the solution quickly.
Our work lays the foundations for the theory of input-dynamic distributed network algorithms. We give a general picture of the complexity landscape in this model, design both universal algorithms and algorithms for concrete problems, and present a general framework for lower bounds. The diverse time complexity of our model spans from constant time, through time polynomial in alpha, and to alpha time, which we show to be enough for any task.},
  author       = {Foerster, Klaus-Tycho and Korhonen, Janne and Paz, Ami and Rybicki, Joel and Schmid, Stefan},
  booktitle    = {Abstract Proceedings of the 2021 ACM SIGMETRICS / International Conference on Measurement and Modeling of Computer Systems},
  isbn         = {9781450380720},
  location     = {Virtual, Online},
  pages        = {71--72},
  publisher    = {Association for Computing Machinery},
  title        = {{Input-dynamic distributed algorithms for communication networks}},
  doi          = {10.1145/3410220.3453923},
  year         = {2021},
}

@article{10855,
  abstract     = {Consider a distributed task where the communication network is fixed but the local inputs given to the nodes of the distributed system may change over time. In this work, we explore the following question: if some of the local inputs change, can an existing solution be updated efficiently, in a dynamic and distributed manner? To address this question, we define the batch dynamic CONGEST model in which we are given a bandwidth-limited communication network and a dynamic edge labelling defines the problem input. The task is to maintain a solution to a graph problem on the labeled graph under batch changes. We investigate, when a batch of α edge label changes arrive, how much time as a function of α we need to update an existing solution, and how much information the nodes have to keep in local memory between batches in order to update the solution quickly. Our work lays the foundations for the theory of input-dynamic distributed network algorithms. We give a general picture of the complexity landscape in this model, design both universal algorithms and algorithms for concrete problems, and present a general framework for lower bounds. In particular, we derive non-trivial upper bounds for two selected, contrasting problems: maintaining a minimum spanning tree and detecting cliques.},
  author       = {Foerster, Klaus-Tycho and Korhonen, Janne and Paz, Ami and Rybicki, Joel and Schmid, Stefan},
  issn         = {2476-1249},
  journal      = {Proceedings of the ACM on Measurement and Analysis of Computing Systems},
  keywords     = {Computer Networks and Communications, Hardware and Architecture, Safety, Risk, Reliability and Quality, Computer Science (miscellaneous)},
  number       = {1},
  pages        = {1--33},
  publisher    = {Association for Computing Machinery},
  title        = {{Input-dynamic distributed algorithms for communication networks}},
  doi          = {10.1145/3447384},
  volume       = {5},
  year         = {2021},
}

@inproceedings{11436,
  abstract     = {Asynchronous distributed algorithms are a popular way to reduce synchronization costs in large-scale optimization, and in particular for neural network training. However, for nonsmooth and nonconvex objectives, few convergence guarantees exist beyond cases where closed-form proximal operator solutions are available. As training most popular deep neural networks corresponds to optimizing nonsmooth and nonconvex objectives, there is a pressing need for such convergence guarantees. In this paper, we analyze for the first time the convergence of stochastic asynchronous optimization for this general class of objectives. In particular, we focus on stochastic subgradient methods allowing for block variable partitioning, where the shared model is asynchronously updated by concurrent processes. To this end, we use a probabilistic model which captures key features of real asynchronous scheduling between concurrent processes. Under this model, we establish convergence with probability one to an invariant set for stochastic subgradient methods with momentum. From a practical perspective, one issue with the family of algorithms that we consider is that they are not efficiently supported by machine learning frameworks, which mostly focus on distributed data-parallel strategies. To address this, we propose a new implementation strategy for shared-memory based training of deep neural networks for a partitioned but shared model in single- and multi-GPU settings. Based on this implementation, we achieve on average 1.2x speed-up in comparison to state-of-the-art training methods for popular image classification tasks, without compromising accuracy.},
  author       = {Kungurtsev, Vyacheslav and Egan, Malcolm and Chatterjee, Bapi and Alistarh, Dan-Adrian},
  booktitle    = {35th AAAI Conference on Artificial Intelligence, AAAI 2021},
  isbn         = {9781713835974},
  issn         = {2374-3468},
  location     = {Virtual, Online},
  number       = {9B},
  pages        = {8209--8216},
  publisher    = {AAAI Press},
  title        = {{Asynchronous optimization methods for efficient training of deep neural networks with guarantees}},
  volume       = {35},
  year         = {2021},
}

@inproceedings{11452,
  abstract     = {We study efficient distributed algorithms for the fundamental problem of principal component analysis and leading eigenvector computation on the sphere, when the data are randomly distributed among a set of computational nodes. We propose a new quantized variant of Riemannian gradient descent to solve this problem, and prove that the algorithm converges with high probability under a set of necessary spherical-convexity properties. We give bounds on the number of bits transmitted by the algorithm under common initialization schemes, and investigate the dependency on the problem dimension in each case.},
  author       = {Alimisis, Foivos and Davies, Peter and Vandereycken, Bart and Alistarh, Dan-Adrian},
  booktitle    = {35th Conference on Neural Information Processing Systems},
  isbn         = {9781713845393},
  issn         = {1049-5258},
  location     = {Virtual, Online},
  pages        = {2823--2834},
  publisher    = {Curran Associates},
  title        = {{Distributed principal component analysis with limited communication}},
  volume       = {34},
  year         = {2021},
}

@inproceedings{11458,
  abstract     = {The increasing computational requirements of deep neural networks (DNNs) have led to significant interest in obtaining DNN models that are sparse, yet accurate. Recent work has investigated the even harder case of sparse training, where the DNN weights are, for as much as possible, already sparse to reduce computational costs during training. Existing sparse training methods are often empirical and can have lower accuracy relative to the dense baseline. In this paper, we present a general approach called Alternating Compressed/DeCompressed (AC/DC) training of DNNs, demonstrate convergence for a variant of the algorithm, and show that AC/DC outperforms existing sparse training methods in accuracy at similar computational budgets; at high sparsity levels, AC/DC even outperforms existing methods that rely on accurate pre-trained dense models. An important property of AC/DC is that it allows co-training of dense and sparse models, yielding accurate sparse–dense model pairs at the end of the training process. This is useful in practice, where compressed variants may be desirable for deployment in resource-constrained settings without re-doing the entire training flow, and also provides us with insights into the accuracy gap between dense and compressed models. The code is available at: https://github.com/IST-DASLab/ACDC.},
  author       = {Peste, Elena-Alexandra and Iofinova, Eugenia B. and Vladu, Adrian and Alistarh, Dan-Adrian},
  booktitle    = {35th Conference on Neural Information Processing Systems},
  isbn         = {9781713845393},
  issn         = {1049-5258},
  location     = {Virtual, Online},
  pages        = {8557--8570},
  publisher    = {Curran Associates},
  title        = {{AC/DC: Alternating Compressed/DeCompressed training of deep neural networks}},
  volume       = {34},
  year         = {2021},
}

@inproceedings{11463,
  abstract     = {Efficiently approximating local curvature information of the loss function is a key tool for optimization and compression of deep neural networks. Yet, most existing methods to approximate second-order information have high computational or storage costs, which limits their practicality. In this work, we investigate matrix-free, linear-time approaches for estimating Inverse-Hessian Vector Products (IHVPs) for the case when the Hessian can be approximated as a sum of rank-one matrices, as in the classic approximation of the Hessian by the empirical Fisher matrix. We propose two new algorithms: the first is tailored towards network compression and can compute the IHVP for dimension $d$, if the Hessian is given as a sum of $m$ rank-one matrices, using $O(dm^2)$ precomputation, $O(dm)$ cost for computing the IHVP, and query cost $O(m)$ for any single element of the inverse Hessian. The second algorithm targets an optimization setting, where we wish to compute the product between the inverse Hessian, estimated over a sliding window of optimization steps, and a given gradient direction, as required for preconditioned SGD. We give an algorithm with cost $O(dm + m^2)$ for computing the IHVP and $O(dm + m^3)$ for adding or removing any gradient from the sliding window. These two algorithms yield state-of-the-art results for network pruning and optimization with lower computational overhead relative to existing second-order methods. Implementations are available at [9] and [17].},
  author       = {Frantar, Elias and Kurtic, Eldar and Alistarh, Dan-Adrian},
  booktitle    = {35th Conference on Neural Information Processing Systems},
  isbn         = {9781713845393},
  issn         = {1049-5258},
  location     = {Virtual, Online},
  pages        = {14873--14886},
  publisher    = {Curran Associates},
  title        = {{M-FAC: Efficient matrix-free approximations of second-order information}},
  volume       = {34},
  year         = {2021},
}

@inproceedings{11464,
  abstract     = {We consider a standard distributed optimisation setting where $N$ machines, each holding a $d$-dimensional function $f_i$, aim to jointly minimise the sum of the functions $\sum_{i=1}^{N} f_i(x)$. This problem arises naturally in large-scale distributed optimisation, where a standard solution is to apply variants of (stochastic) gradient descent. We focus on the communication complexity of this problem: our main result provides the first fully unconditional bounds on total number of bits which need to be sent and received by the $N$ machines to solve this problem under point-to-point communication, within a given error-tolerance. Specifically, we show that $\Omega(Nd \log d / N\varepsilon)$ total bits need to be communicated between the machines to find an additive $\epsilon$-approximation to the minimum of $\sum_{i=1}^{N} f_i(x)$. The result holds for both deterministic and randomised algorithms, and, importantly, requires no assumptions on the algorithm structure. The lower bound is tight under certain restrictions on parameter values, and is matched within constant factors for quadratic objectives by a new variant of quantised gradient descent, which we describe and analyse. Our results bring over tools from communication complexity to distributed optimisation, which has potential for further applications.},
  author       = {Alistarh, Dan-Adrian and Korhonen, Janne},
  booktitle    = {35th Conference on Neural Information Processing Systems},
  isbn         = {9781713845393},
  issn         = {1049-5258},
  location     = {Virtual, Online},
  pages        = {7254--7266},
  publisher    = {Curran Associates},
  title        = {{Towards tight communication lower bounds for distributed optimisation}},
  volume       = {34},
  year         = {2021},
}

