@inproceedings{7802,
  abstract     = {The Massively Parallel Computation (MPC) model is an emerging model which distills core aspects of distributed and parallel computation. It has been developed as a tool to solve (typically graph) problems in systems where the input is distributed over many machines with limited space.

Recent work has focused on the regime in which machines have sublinear (in $n$, the number of nodes in the input graph) space, with randomized algorithms presented for the fundamental graph problems of Maximal Matching and Maximal Independent Set. However, there have been no prior corresponding deterministic algorithms.

A major challenge underlying the sublinear space setting is that the local space of each machine might be too small to store all the edges incident to a single node. This poses a considerable obstacle compared to the classical models in which each node is assumed to know and have easy access to its incident edges. To overcome this barrier we introduce a new graph sparsification technique that deterministically computes a low-degree subgraph with additional desired properties. The degree of the nodes in this subgraph is small in the sense that the edges of each node can now be stored on a single machine. This low-degree subgraph also has the property that solving the problem on this subgraph provides \emph{significant} global progress, i.e., progress towards solving the problem for the original input graph.

Using this framework to derandomize the well-known randomized algorithm of Luby [SICOMP'86], we obtain $O(\log \Delta+\log\log n)$-round deterministic MPC algorithms for solving the fundamental problems of Maximal Matching and Maximal Independent Set with $O(n^{\epsilon})$ space on each machine for any constant $\epsilon > 0$. Based on the recent work of Ghaffari et al. [FOCS'18], this additive $O(\log\log n)$ term is conditionally essential. These algorithms can also be shown to run in $O(\log \Delta)$ rounds in the closely related CONGESTED CLIQUE model, improving upon the state-of-the-art bound of $O(\log^2 \Delta)$ rounds by Censor-Hillel et al. [DISC'17].},
  author       = {Czumaj, Artur and Davies, Peter and Parter, Merav},
  booktitle    = {Proceedings of the 32nd ACM Symposium on Parallelism in Algorithms and Architectures (SPAA 2020)},
  location     = {Virtual Event, United States},
  number       = {7},
  pages        = {175--185},
  publisher    = {Association for Computing Machinery},
  title        = {{Graph sparsification for derandomizing massively parallel computation with low space}},
  doi          = {10.1145/3350755.3400282},
  year         = {2020},
}

@inproceedings{7803,
  abstract     = {We settle the complexity of the (Δ+1)-coloring and (Δ+1)-list coloring problems in the CONGESTED CLIQUE model by presenting a simple deterministic algorithm for both problems running in a constant number of rounds. This matches the complexity of the recent breakthrough randomized constant-round (Δ+1)-list coloring algorithm due to Chang et al. (PODC'19), and significantly improves upon the state-of-the-art O(logΔ)-round deterministic (Δ+1)-coloring bound of Parter (ICALP'18).
A remarkable property of our algorithm is its simplicity. Whereas the state-of-the-art randomized algorithms for this problem are based on the quite involved local coloring algorithm of Chang et al. (STOC'18), our algorithm can be described in just a few lines. At a high level, it applies a careful derandomization of a recursive procedure which partitions the nodes and their respective palettes into separate bins. We show that after O(1) recursion steps, the remaining uncolored subgraph within each bin has linear size, and thus can be solved locally by collecting it to a single node. This algorithm can also be implemented in the Massively Parallel Computation (MPC) model provided that each machine has linear (in n, the number of nodes in the input graph) space.
We also show an extension of our algorithm to the MPC regime in which machines have sublinear space: we present the first deterministic (Δ+1)-list coloring algorithm designed for sublinear-space MPC, which runs in O(logΔ+loglogn) rounds.},
  author       = {Czumaj, Artur and Davies, Peter and Parter, Merav},
  booktitle    = {Proceedings of the 2020 ACM Symposium on Principles of Distributed Computing},
  location     = {Salerno, Italy},
  pages        = {309--318},
  publisher    = {Association for Computing Machinery},
  title        = {{Simple, deterministic, constant-round coloring in the congested clique}},
  doi          = {10.1145/3382734.3405751},
  year         = {2020},
}

@inproceedings{8191,
  abstract     = {There has been a significant amount of research on hardware and software support for efficient concurrent data structures; yet, the question of how to build correct, simple, and scalable data structures has not yet been definitively settled. In this paper, we revisit this question from a minimalist perspective, and ask: what is the smallest amount of synchronization required for correct and efficient concurrent search data structures, and how could this minimal synchronization support be provided in hardware?

To address these questions, we introduce memory tagging, a simple hardware mechanism which enables the programmer to "tag" a dynamic set of memory locations, at cache-line granularity, and later validate whether the memory has been concurrently modified, with the possibility of updating one of the underlying locations atomically if validation succeeds. We provide several examples showing that this mechanism can enable fast and arguably simple concurrent data structure designs, such as lists, binary search trees, balanced search trees, range queries, and Software Transactional Memory (STM) implementations. We provide an implementation of memory tags in the Graphite multi-core simulator, showing that the mechanism can be implemented entirely at the level of L1 cache, and that it can enable non-trivial speedups versus existing implementations of the above data structures.},
  author       = {Alistarh, Dan-Adrian and Brown, Trevor A and Singhal, Nandini},
  booktitle    = {Proceedings of the 32nd ACM Symposium on Parallelism in Algorithms and Architectures (SPAA 2020)},
  isbn         = {9781450369350},
  location     = {Virtual Event, United States},
  number       = {7},
  pages        = {37--49},
  publisher    = {Association for Computing Machinery},
  title        = {{Memory tagging: Minimalist synchronization for scalable concurrent data structures}},
  doi          = {10.1145/3350755.3400213},
  year         = {2020},
}

@article{8268,
  abstract     = {Modern scientific instruments produce vast amounts of data, which can overwhelm the processing ability of computer systems. Lossy compression of data is an intriguing solution, but comes with its own drawbacks, such as potential signal loss, and the need for careful optimization of the compression ratio. In this work, we focus on a setting where this problem is especially acute: compressive sensing frameworks for interferometry and medical imaging. We ask the following question: can the precision of the data representation be lowered for all inputs, with recovery guarantees and practical performance? Our first contribution is a theoretical analysis of the normalized Iterative Hard Thresholding (IHT) algorithm when all input data, meaning both the measurement matrix and the observation vector, are quantized aggressively. We present a variant of low precision normalized IHT that, under mild conditions, can still provide recovery guarantees. The second contribution is the application of our quantization framework to radio astronomy and magnetic resonance imaging. We show that lowering the precision of the data can significantly accelerate image recovery. We evaluate our approach on telescope data and samples of brain images using CPU and FPGA implementations, achieving up to a 9x speedup with negligible loss of recovery quality.},
  author       = {Gürel, Nezihe Merve and Kara, Kaan and Stojanov, Alen and Smith, Tyler and Lemmin, Thomas and Alistarh, Dan-Adrian and Püschel, Markus and Zhang, Ce},
  issn         = {1941-0476},
  journal      = {IEEE Transactions on Signal Processing},
  pages        = {4268--4282},
  publisher    = {IEEE},
  title        = {{Compressive sensing using iterative hard thresholding with low precision data representation: Theory and applications}},
  doi          = {10.1109/TSP.2020.3010355},
  volume       = {68},
  year         = {2020},
}

@inproceedings{8383,
  abstract     = {We introduce extension-based proofs, a class of impossibility proofs that includes valency arguments. They are modelled as an interaction between a prover and a protocol. Using proofs based on combinatorial topology, it has been shown that it is impossible to deterministically solve k-set agreement among n > k ≥ 2 processes in a wait-free manner. However, it was unknown whether proofs based on simpler techniques were possible. We explain why this impossibility result cannot be obtained by an extension-based proof and, hence, extension-based proofs are limited in power.},
  author       = {Alistarh, Dan-Adrian and Aspnes, James and Ellen, Faith and Gelashvili, Rati and Zhu, Leqi},
  booktitle    = {Proceedings of the 39th Symposium on Principles of Distributed Computing},
  isbn         = {9781450375825},
  location     = {Virtual, Italy},
  pages        = {54--56},
  publisher    = {Association for Computing Machinery},
  title        = {{Brief Announcement: Why Extension-Based Proofs Fail}},
  doi          = {10.1145/3382734.3405743},
  year         = {2020},
}

@inproceedings{8722,
  abstract     = {Load imbalance pervasively exists in distributed deep learning training systems, either caused by the inherent imbalance in learned tasks or by the system itself. Traditional synchronous Stochastic Gradient Descent (SGD) achieves good accuracy for a wide variety of tasks, but relies on global synchronization to accumulate the gradients at every training step. In this paper, we propose eager-SGD, which relaxes the global synchronization for decentralized accumulation. To implement eager-SGD, we propose to use two partial collectives: solo and majority. With solo allreduce, the faster processes contribute their gradients eagerly without waiting for the slower processes, whereas with majority allreduce, at least half of the participants must contribute gradients before continuing, all without using a central parameter server. We theoretically prove the convergence of the algorithms and describe the partial collectives in detail. Experimental results on load-imbalanced environments (CIFAR-10, ImageNet, and UCF101 datasets) show that eager-SGD achieves a 1.27x speedup over the state-of-the-art synchronous SGD, without losing accuracy.},
  author       = {Li, Shigang and Ben-Nun, Tal and Di Girolamo, Salvatore and Alistarh, Dan-Adrian and Hoefler, Torsten},
  booktitle    = {Proceedings of the 25th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming},
  location     = {San Diego, CA, United States},
  pages        = {45--61},
  publisher    = {Association for Computing Machinery},
  title        = {{Taming unbalanced training workloads in deep learning with partial collective operations}},
  doi          = {10.1145/3332466.3374528},
  year         = {2020},
}

@inproceedings{8724,
  abstract     = {We study the problem of learning from multiple untrusted data sources, a scenario of increasing practical relevance given the recent emergence of crowdsourcing and collaborative learning paradigms. Specifically, we analyze the situation in which a learning system obtains datasets from multiple sources, some of which might be biased or even adversarially perturbed. It is known that in the single-source case, an adversary with the power to corrupt a fixed fraction of the training data can prevent PAC-learnability, that is, even in the limit of infinitely much training data, no learning system can approach the optimal test error. In this work we show that, surprisingly, the same is not true in the multi-source setting, where the adversary can arbitrarily corrupt a fixed fraction of the data sources. Our main results are a generalization bound that provides finite-sample guarantees for this learning setting, as well as corresponding lower bounds. Besides establishing PAC-learnability, our results also show that in a cooperative learning setting sharing data with other parties has provable benefits, even if some participants are malicious.},
  author       = {Konstantinov, Nikola H and Frantar, Elias and Alistarh, Dan-Adrian and Lampert, Christoph},
  booktitle    = {Proceedings of the 37th International Conference on Machine Learning},
  issn         = {2640-3498},
  location     = {Online},
  pages        = {5416--5425},
  publisher    = {ML Research Press},
  title        = {{On the sample complexity of adversarial multi-source PAC learning}},
  volume       = {119},
  year         = {2020},
}

@inproceedings{8725,
  abstract     = {The design and implementation of efficient concurrent data structures have seen significant attention. However, most of this work has focused on concurrent data structures providing good \emph{worst-case} guarantees. In real workloads, objects are often accessed at different rates, since access distributions may be non-uniform. Efficient distribution-adaptive data structures are known in the sequential case, e.g., splay-trees; however, they are often hard to translate efficiently to the concurrent case.
In this paper, we investigate distribution-adaptive concurrent data structures and propose a new design called the splay-list. At a high level, the splay-list is similar to a standard skip-list, with the key distinction that the height of each element adapts dynamically to its access rate: popular elements ``move up,'' whereas rarely-accessed elements decrease in height. We show that the splay-list provides order-optimal amortized complexity bounds for a subset of operations while being amenable to efficient concurrent implementation. Experimental results show that the splay-list can leverage distribution-adaptivity to improve on the performance of classic concurrent designs, and can outperform the only previously-known distribution-adaptive design in certain settings.},
  author       = {Aksenov, Vitaly and Alistarh, Dan-Adrian and Drozdova, Alexandra and Mohtashami, Amirkeivan},
  booktitle    = {34th International Symposium on Distributed Computing},
  isbn         = {9783959771689},
  issn         = {1868-8969},
  location     = {Freiburg, Germany},
  pages        = {3:1--3:18},
  publisher    = {Schloss Dagstuhl - Leibniz-Zentrum für Informatik},
  title        = {{The splay-list: A distribution-adaptive concurrent skip-list}},
  doi          = {10.4230/LIPIcs.DISC.2020.3},
  volume       = {179},
  year         = {2020},
}

@inproceedings{7213,
  abstract     = {Persistent homology is a powerful tool in Topological Data Analysis (TDA) to capture the topological properties of data succinctly at different spatial resolutions. For graphical data, the shape and structure of the neighborhood of individual data items (nodes) are an essential means of characterizing their properties. We propose the use of persistent homology methods to capture structural and topological properties of graphs and use them to address the problem of link prediction. We achieve encouraging results on nine different real-world datasets that attest to the potential of persistent homology-based methods for network analysis.},
  author       = {Bhatia, Sumit and Chatterjee, Bapi and Nathani, Deepak and Kaul, Manohar},
  booktitle    = {Complex Networks and Their Applications VIII},
  isbn         = {9783030366865},
  issn         = {1860-9503},
  location     = {Lisbon, Portugal},
  pages        = {27--39},
  publisher    = {Springer Nature},
  title        = {{A persistent homology perspective to the link prediction problem}},
  doi          = {10.1007/978-3-030-36687-2_3},
  volume       = {881},
  year         = {2020},
}

@article{7224,
  abstract     = {Habitat loss is one of the key drivers of the ongoing decline of biodiversity. However, ecologists still argue about how fragmentation of habitat (independent of habitat loss) affects species richness. The recently proposed habitat amount hypothesis posits that species richness only depends on the total amount of habitat in a local landscape. In contrast, empirical studies report contrasting patterns: some find positive and others negative effects of fragmentation per se on species richness. To explain this apparent disparity, we devise a stochastic, spatially explicit model of competitive species communities in heterogeneous habitats. The model shows that habitat loss and fragmentation have complex effects on species diversity in competitive communities. When the total amount of habitat is large, fragmentation per se tends to increase species diversity, but if the total amount of habitat is small, the situation is reversed: fragmentation per se decreases species diversity.},
  author       = {Rybicki, Joel and Abrego, Nerea and Ovaskainen, Otso},
  issn         = {1461-0248},
  journal      = {Ecology Letters},
  number       = {3},
  pages        = {506--517},
  publisher    = {Wiley},
  title        = {{Habitat fragmentation and species diversity in competitive communities}},
  doi          = {10.1111/ele.13450},
  volume       = {23},
  year         = {2020},
}

@inproceedings{7272,
  abstract     = {Many systems rely on optimistic concurrent search trees for multi-core scalability. In principle, optimistic trees have a simple performance story: searches are read-only and so run in parallel, with writes to shared memory occurring only when modifying the data structure. However, this paper shows that in practice, obtaining the full performance benefits of optimistic search trees is not so simple.

We focus on optimistic binary search trees (BSTs) and perform a detailed performance analysis of 10 state-of-the-art BSTs on large scale x86-64 hardware, using both microbenchmarks and an in-memory database system. We find and explain significant unexpected performance differences between BSTs with similar tree structure and search implementations, which we trace to subtle performance-degrading interactions of BSTs with systems software and hardware subsystems. We further derive a prescriptive approach to avoid this performance degradation, as well as algorithmic insights on optimistic BST design. Our work underlines the gap between the theory and practice of multi-core performance, and calls for further research to help bridge this gap.},
  author       = {Arbel-Raviv, Maya and Brown, Trevor A and Morrison, Adam},
  booktitle    = {Proceedings of the 2018 USENIX Annual Technical Conference},
  isbn         = {9781939133021},
  location     = {Boston, MA, United States},
  pages        = {295--306},
  publisher    = {USENIX Association},
  title        = {{Getting to the root of concurrent binary search tree performance}},
  year         = {2018},
}

@inproceedings{7605,
  abstract     = {Union-Find (or Disjoint-Set Union) is one of the fundamental problems in computer science; it has been well-studied from both theoretical and practical perspectives in the sequential case. Recently, there has been mounting interest in analyzing this problem in the concurrent scenario, and several asymptotically-efficient algorithms have been proposed. Yet, to date, there is very little known about the practical performance of concurrent Union-Find. This work addresses this gap. We evaluate and analyze the performance of several concurrent Union-Find algorithms and optimization strategies across a wide range of platforms (Intel, AMD, and ARM) and workloads (social, random, and road networks, as well as integrations into more complex algorithms). We first observe that, due to the limited computational cost, the number of induced cache misses is the critical determining factor for the performance of existing algorithms. We introduce new techniques to reduce this cost by storing node priorities implicitly and by using plain reads and writes in a way that does not affect the correctness of the algorithms. Finally, we show that Union-Find implementations are an interesting application for Transactional Memory (TM): one of the fastest algorithm variants we discovered is a sequential one that uses coarse-grained locking with the lock elision optimization to reduce synchronization cost and increase scalability. },
  author       = {Alistarh, Dan-Adrian and Fedorov, Alexander and Koval, Nikita},
  booktitle    = {23rd International Conference on Principles of Distributed Systems},
  isbn         = {9783959771337},
  issn         = {1868-8969},
  location     = {Neuchâtel, Switzerland},
  pages        = {15:1--15:16},
  publisher    = {Schloss Dagstuhl - Leibniz-Zentrum für Informatik},
  title        = {{In search of the fastest concurrent union-find algorithm}},
  doi          = {10.4230/LIPIcs.OPODIS.2019.15},
  volume       = {153},
  year         = {2020},
}

@inproceedings{7635,
  abstract     = {Concurrent programming can be notoriously complex and error-prone. Programming bugs can arise from a variety of sources, such as operation reordering or an incomplete understanding of the memory model. A variety of formal and model checking methods have been developed to address this fundamental difficulty. While technically interesting, existing academic methods are still hard to apply to the large codebases typical of industrial deployments, which limits their practical impact.},
  author       = {Koval, Nikita and Sokolova, Mariia and Fedorov, Alexander and Alistarh, Dan-Adrian and Tsitelov, Dmitry},
  booktitle    = {Proceedings of the 25th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming},
  isbn         = {9781450368186},
  location     = {San Diego, CA, United States},
  pages        = {423--424},
  publisher    = {Association for Computing Machinery},
  title        = {{Testing concurrency on the JVM with Lincheck}},
  doi          = {10.1145/3332466.3374503},
  year         = {2020},
}

@inproceedings{7636,
  abstract     = {Balanced search trees typically use key comparisons to guide their operations, and achieve logarithmic running time. By relying on numerical properties of the keys, interpolation search achieves lower search complexity and better performance. Although interpolation-based data structures were investigated in the past, their non-blocking concurrent variants have received very little attention so far.
In this paper, we propose the first non-blocking implementation of the classic interpolation search tree (IST) data structure. For arbitrary key distributions, the data structure ensures worst-case O(log n + p) amortized time for search, insertion and deletion traversals. When the input key distributions are smooth, lookups run in expected O(log log n + p) time, and insertion and deletion run in expected amortized O(log log n + p) time, where p is a bound on the number of threads. To improve the scalability of concurrent insertion and deletion, we propose a novel parallel rebuilding technique, which should be of independent interest.
We evaluate whether the theoretical improvements translate to practice by implementing the concurrent interpolation search tree, and benchmarking it on uniform and nonuniform key distributions, for dataset sizes in the millions to billions of keys. Relative to the state-of-the-art concurrent data structures, the concurrent interpolation search tree achieves performance improvements of up to 15% under high update rates, and of up to 50% under moderate update rates. Further, ISTs exhibit up to 2X fewer cache misses, and consume 1.2--2.6X less memory compared to the next best alternative on typical dataset sizes. We find that the results are surprisingly robust to distributional skew, which suggests that our data structure can be a promising alternative to classic concurrent search structures.},
  author       = {Brown, Trevor A and Prokopec, Aleksandar and Alistarh, Dan-Adrian},
  booktitle    = {Proceedings of the 25th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming},
  isbn         = {9781450368186},
  location     = {San Diego, CA, United States},
  pages        = {276--291},
  publisher    = {Association for Computing Machinery},
  title        = {{Non-blocking interpolation search trees with doubly-logarithmic running time}},
  doi          = {10.1145/3332466.3374542},
  year         = {2020},
}

@inproceedings{15074,
  abstract     = {We introduce a new graph problem, the token dropping game, and we show how to solve it efficiently in a distributed setting. We use the token dropping game as a tool to design an efficient distributed algorithm for the stable orientation problem, which is a special case of the more general locally optimal semi-matching problem. The prior work by Czygrinow et al. (DISC 2012) finds a locally optimal semi-matching in O(Δ⁵) rounds in graphs of maximum degree Δ, which directly implies an algorithm with the same runtime for stable orientations. We improve the runtime to O(Δ⁴) for stable orientations and prove a lower bound of Ω(Δ) rounds.},
  author       = {Brandt, Sebastian and Keller, Barbara and Rybicki, Joel and Suomela, Jukka and Uitto, Jara},
  booktitle    = {34th International Symposium on Distributed Computing},
  location     = {Virtual},
  publisher    = {Schloss Dagstuhl - Leibniz-Zentrum für Informatik},
  title        = {{Brief announcement: Efficient load-balancing through distributed token dropping}},
  doi          = {10.4230/LIPIcs.DISC.2020.40},
  volume       = {179},
  year         = {2020},
}

@inproceedings{15077,
  abstract     = {We consider the following dynamic load-balancing process: given an underlying graph G with n nodes, in each step t≥ 0, one unit of load is created, and placed at a randomly chosen graph node. In the same step, the chosen node picks a random neighbor, and the two nodes balance their loads by averaging them. We are interested in the expected gap between the minimum and maximum loads at nodes as the process progresses, and its dependence on n and on the graph structure. Variants of the above graphical balanced allocation process have been studied previously by Peres, Talwar, and Wieder [Peres et al., 2015], and by Sauerwald and Sun [Sauerwald and Sun, 2015]. These authors left as open the question of characterizing the gap in the case of cycle graphs in the dynamic case, where weights are created during the algorithm’s execution. For this case, the only known upper bound is of 𝒪(n log n), following from a majorization argument due to [Peres et al., 2015], which analyzes a related graphical allocation process. In this paper, we provide an upper bound of 𝒪 (√n log n) on the expected gap of the above process for cycles of length n. We introduce a new potential analysis technique, which enables us to bound the difference in load between k-hop neighbors on the cycle, for any k ≤ n/2. We complement this with a "gap covering" argument, which bounds the maximum value of the gap by bounding its value across all possible subsets of a certain structure, and recursively bounding the gaps within each subset. We provide analytical and experimental evidence that our upper bound on the gap is tight up to a logarithmic factor.},
  author       = {Alistarh, Dan-Adrian and Nadiradze, Giorgi and Sabour, Amirmojtaba},
  booktitle    = {47th International Colloquium on Automata, Languages, and Programming},
  location     = {Saarbrücken, Germany, Virtual},
  publisher    = {Schloss Dagstuhl - Leibniz-Zentrum für Informatik},
  title        = {{Dynamic averaging load balancing on cycles}},
  doi          = {10.4230/LIPIcs.ICALP.2020.7},
  volume       = {168},
  year         = {2020},
}

@inproceedings{9415,
  abstract     = {Optimizing convolutional neural networks for fast inference has recently become an extremely active area of research. One of the go-to solutions in this context is weight pruning, which aims to reduce computational and memory footprint by removing large subsets of the connections in a neural network. Surprisingly, much less attention has been given to exploiting sparsity in the activation maps, which tend to be naturally sparse in many settings thanks to the structure of rectified linear (ReLU) activation functions. In this paper, we present an in-depth analysis of methods for maximizing the sparsity of the activations in a trained neural network, and show that, when coupled with an efficient sparse-input convolution algorithm, we can leverage this sparsity for significant performance gains. To induce highly sparse activation maps without accuracy loss, we introduce a new regularization technique, coupled with a new threshold-based sparsification method based on a parameterized activation function called Forced-Activation-Threshold Rectified Linear Unit (FATReLU). We examine the impact of our methods on popular image classification models, showing that most architectures can adapt to significantly sparser activation maps without any accuracy loss. Our second contribution is showing that these compression gains can be translated into inference speedups: we provide a new algorithm to enable fast convolution operations over networks with sparse activations, and show that it can enable significant speedups for end-to-end inference on a range of popular models on the large-scale ImageNet image classification task on modern Intel CPUs, with little or no retraining cost.},
  author       = {Kurtz, Mark and Kopinsky, Justin and Gelashvili, Rati and Matveev, Alexander and Carr, John and Goin, Michael and Leiserson, William and Moore, Sage and Nell, Bill and Shavit, Nir and Alistarh, Dan-Adrian},
  booktitle    = {Proceedings of the 37th International Conference on Machine Learning},
  issn         = {2640-3498},
  location     = {Online},
  pages        = {5533--5543},
  title        = {{Inducing and exploiting activation sparsity for fast neural network inference}},
  volume       = {119},
  year         = {2020},
}

@inproceedings{9631,
  abstract     = {The ability to leverage large-scale hardware parallelism has been one of the key enablers of the accelerated recent progress in machine learning. Consequently, there has been considerable effort invested into developing efficient parallel variants of classic machine learning algorithms. However, despite the wealth of knowledge on parallelization, some classic machine learning algorithms often prove hard to parallelize efficiently while maintaining convergence. In this paper, we focus on efficient parallel algorithms for the key machine learning task of inference on graphical models, in particular on the fundamental belief propagation algorithm. We address the challenge of efficiently parallelizing this classic paradigm by showing how to leverage scalable relaxed schedulers in this context. We present an extensive empirical study, showing that our approach outperforms previous parallel belief propagation implementations both in terms of scalability and in terms of wall-clock convergence time, on a range of practical applications.},
  author       = {Aksenov, Vitaly and Alistarh, Dan-Adrian and Korhonen, Janne},
  booktitle    = {Advances in Neural Information Processing Systems},
  isbn         = {9781713829546},
  issn         = {1049-5258},
  location     = {Vancouver, Canada},
  pages        = {22361--22372},
  publisher    = {Curran Associates},
  title        = {{Scalable belief propagation via relaxed scheduling}},
  volume       = {33},
  year         = {2020},
}

@inproceedings{9632,
  abstract     = {Second-order information, in the form of Hessian- or Inverse-Hessian-vector products, is a fundamental tool for solving optimization problems. Recently, there has been significant interest in utilizing this information in the context of deep neural networks; however, relatively little is known about the quality of existing approximations in this context. Our work examines this question, identifies issues with existing approaches, and proposes a method called WoodFisher to compute a faithful and efficient estimate of the inverse Hessian. Our main application is to neural network compression, where we build on the classic Optimal Brain Damage/Surgeon framework. We demonstrate that WoodFisher significantly outperforms popular state-of-the-art methods for one-shot pruning. Further, even when iterative, gradual pruning is allowed, our method results in a gain in test accuracy over the state-of-the-art approaches, for standard image classification datasets such as ImageNet ILSVRC. We examine how our method can be extended to take into account first-order information, as well as illustrate its ability to automatically set layer-wise pruning thresholds and perform compression in the limited-data regime. The code is available at the following link, https://github.com/IST-DASLab/WoodFisher.},
  author       = {Singh, Sidak Pal and Alistarh, Dan-Adrian},
  booktitle    = {Advances in Neural Information Processing Systems},
  isbn         = {9781713829546},
  issn         = {1049-5258},
  location     = {Vancouver, Canada},
  pages        = {18098--18109},
  publisher    = {Curran Associates},
  title        = {{WoodFisher: Efficient second-order approximation for neural network compression}},
  volume       = {33},
  year         = {2020},
}

@inproceedings{6673,
  abstract     = {Several classic problems in graph processing and computational geometry are solved via incremental algorithms, which split computation into a series of small tasks acting on shared state, which gets updated progressively. While the sequential variant of such algorithms usually specifies a fixed (but sometimes random) order in which the tasks should be performed, a standard approach to parallelizing such algorithms is to relax this constraint to allow for out-of-order parallel execution. This is the case for parallel implementations of Dijkstra's single-source shortest-paths (SSSP) algorithm, and for parallel Delaunay mesh triangulation. While many software frameworks parallelize incremental computation in this way, it is still not well understood whether this relaxed ordering approach can still provide any complexity guarantees. In this paper, we address this problem, and analyze the efficiency guarantees provided by a range of incremental algorithms when parallelized via relaxed schedulers. We show that, for algorithms such as Delaunay mesh triangulation and sorting by insertion, schedulers with a maximum relaxation factor of k in terms of the maximum priority inversion allowed will introduce a maximum amount of wasted work of O(log n poly(k)), where n is the number of tasks to be executed. For SSSP, we show that the additional work is O(poly(k) dmax / wmin), where dmax is the maximum distance between two nodes, and wmin is the minimum such distance. In practical settings where n >> k, this suggests that the overheads of relaxation will be outweighed by the improved scalability of the relaxed scheduler. On the negative side, we provide lower bounds showing that certain algorithms will inherently incur a non-trivial amount of wasted work due to scheduler relaxation, even for relatively benign relaxed schedulers.},
  author       = {Alistarh, Dan-Adrian and Nadiradze, Giorgi and Koval, Nikita},
  booktitle    = {31st ACM Symposium on Parallelism in Algorithms and Architectures},
  isbn         = {9781450361842},
  location     = {Phoenix, AZ, United States},
  pages        = {145--154},
  publisher    = {ACM Press},
  title        = {{Efficiency guarantees for parallel incremental algorithms under relaxed schedulers}},
  doi          = {10.1145/3323165.3323201},
  year         = {2019},
}

