@inproceedings{6676,
  abstract     = {It is impossible to deterministically solve wait-free consensus in an asynchronous system. The classic proof uses a valency argument, which constructs an infinite execution by repeatedly extending a finite execution. We introduce extension-based proofs, a class of impossibility proofs that are modelled as an interaction between a prover and a protocol and that include valency arguments.

Using proofs based on combinatorial topology, it has been shown that it is impossible to deterministically solve k-set agreement among n > k ≥ 2 processes in a wait-free manner. However, it was unknown whether proofs based on simpler techniques were possible. We show that this impossibility result cannot be obtained by an extension-based proof and, hence, extension-based proofs are limited in power.},
  author       = {Alistarh, Dan-Adrian and Aspnes, James and Ellen, Faith and Gelashvili, Rati and Zhu, Leqi},
  booktitle    = {Proceedings of the 51st Annual ACM SIGACT Symposium on Theory of Computing},
  isbn         = {9781450367059},
  location     = {Phoenix, AZ, United States},
  pages        = {986--996},
  publisher    = {ACM Press},
  title        = {{Why extension-based proofs fail}},
  doi          = {10.1145/3313276.3316407},
  year         = {2019},
}

@article{6759,
  abstract     = {We consider the graph class Grounded-L corresponding to graphs that admit an intersection representation by L-shaped curves, where additionally the topmost points of each curve are assumed to belong to a common horizontal line. We prove that Grounded-L graphs admit an equivalent characterisation in terms of vertex ordering with forbidden patterns. 
We also compare this class to related intersection classes, such as the grounded segment graphs, the monotone L-graphs (a.k.a. max point-tolerance graphs), or the outer-1-string graphs. We give constructions showing that these classes are all distinct and satisfy only trivial or previously known inclusions.},
  author       = {Jelínek, Vít and Töpfer, Martin},
  issn         = {1077-8926},
  journal      = {Electronic Journal of Combinatorics},
  number       = {3},
  publisher    = {Electronic Journal of Combinatorics},
  title        = {{On grounded L-graphs and their relatives}},
  doi          = {10.37236/8096},
  volume       = {26},
  year         = {2019},
}

@inproceedings{6931,
  abstract     = {Consider a distributed system with n processors out of which f can be Byzantine faulty. In the
approximate agreement task, each processor i receives an input value xi and has to decide on an
output value yi such that
1. the output values are in the convex hull of the non-faulty processors’ input values,
2. the output values are within distance d of each other.


Classically, the values are assumed to be from an m-dimensional Euclidean space, where m ≥ 1.
In this work, we study the task in a discrete setting, where the input values have some structure
expressible as a graph. Namely, the input values are vertices of a finite graph G and the goal is to
output vertices that are within distance d of each other in G, but still remain in the graph-induced
convex hull of the input values. For d = 0, the task reduces to consensus and cannot be solved with
a deterministic algorithm in an asynchronous system even with a single crash fault. For any d ≥ 1,
we show that the task is solvable in asynchronous systems when G is chordal and n > (ω + 1)f,
where ω is the clique number of G. In addition, we give the first Byzantine-tolerant algorithm for a
variant of lattice agreement. For synchronous systems, we show tight resilience bounds for the exact
variants of these and related tasks over a large class of combinatorial structures.},
  author       = {Nowak, Thomas and Rybicki, Joel},
  booktitle    = {33rd International Symposium on Distributed Computing},
  keywords     = {consensus, approximate agreement, Byzantine faults, chordal graphs, lattice agreement},
  location     = {Budapest, Hungary},
  pages        = {29:1--29:17},
  publisher    = {Schloss Dagstuhl - Leibniz-Zentrum für Informatik},
  title        = {{Byzantine approximate agreement on graphs}},
  doi          = {10.4230/LIPICS.DISC.2019.29},
  volume       = {146},
  year         = {2019},
}

@inproceedings{6933,
  abstract     = {We design fast deterministic algorithms for distance computation in the CONGESTED CLIQUE model. Our key contributions include:

 - A (2+ε)-approximation for all-pairs shortest paths problem in O(log²n / ε) rounds on unweighted undirected graphs. With a small additional additive factor, this also applies for weighted graphs. This is the first sub-polynomial constant-factor approximation for APSP in this model.
 - A (1+ε)-approximation for multi-source shortest paths problem from O(√n) sources in O(log² n / ε) rounds on weighted undirected graphs. This is the first sub-polynomial algorithm obtaining this approximation for a set of sources of polynomial size.

Our main techniques are new distance tools that are obtained via improved algorithms for sparse matrix multiplication, which we leverage to construct efficient hopsets and shortest paths. Furthermore, our techniques extend to additional distance problems for which we improve upon the state-of-the-art, including diameter approximation, and an exact single-source shortest paths algorithm for weighted undirected graphs in Õ(n^{1/6}) rounds.},
  author       = {Censor-Hillel, Keren and Dory, Michal and Korhonen, Janne and Leitersdorf, Dean},
  booktitle    = {Proceedings of the 2019 ACM Symposium on Principles of Distributed Computing},
  isbn         = {9781450362177},
  location     = {Toronto, ON, Canada},
  pages        = {74--83},
  publisher    = {ACM},
  title        = {{Fast approximate shortest paths in the congested clique}},
  doi          = {10.1145/3293611.3331633},
  year         = {2019},
}

@inproceedings{6935,
  abstract     = {This paper investigates the power of preprocessing in the CONGEST model. Schmid and Suomela (ACM HotSDN 2013) introduced the SUPPORTED CONGEST model to study the application of distributed algorithms in Software-Defined Networks (SDNs). In this paper, we show that a large class of lower bounds in the CONGEST model still hold in the SUPPORTED model, highlighting the robustness of these bounds. This also raises the question of how much preprocessing helps in the CONGEST model.},
  author       = {Foerster, Klaus-Tycho and Korhonen, Janne and Rybicki, Joel and Schmid, Stefan},
  booktitle    = {Proceedings of the 2019 ACM Symposium on Principles of Distributed Computing},
  isbn         = {9781450362177},
  location     = {Toronto, ON, Canada},
  pages        = {259--261},
  publisher    = {ACM},
  title        = {{Does preprocessing help under congestion?}},
  doi          = {10.1145/3293611.3331581},
  year         = {2019},
}

@article{6936,
  abstract     = {A key challenge for community ecology is to understand to what extent observational data can be used to infer the underlying community assembly processes. As different processes can lead to similar or even identical patterns, statistical analyses of non‐manipulative observational data never yield undisputable causal inference on the underlying processes. Still, most empirical studies in community ecology are based on observational data, and hence understanding under which circumstances such data can shed light on assembly processes is a central concern for community ecologists. We simulated a spatial agent‐based model that generates variation in metacommunity dynamics across multiple axes, including the four classic metacommunity paradigms as special cases. We further simulated a virtual ecologist who analysed snapshot data sampled from the simulations using eighteen output metrics derived from beta‐diversity and habitat variation indices, variation partitioning and joint species distribution modelling. Our results indicated two main axes of variation in the output metrics. The first axis of variation described whether the landscape has patchy or continuous variation, and thus was essentially independent of the properties of the species community. The second axis of variation related to the level of predictability of the metacommunity. The most predictable communities were niche‐based metacommunities inhabiting static landscapes with marked environmental heterogeneity, such as metacommunities following the species sorting paradigm or the mass effects paradigm. The most unpredictable communities were neutral‐based metacommunities inhabiting dynamics landscapes with little spatial heterogeneity, such as metacommunities following the neutral or patch sorting paradigms. The output metrics from joint species distribution modelling yielded generally the highest resolution to disentangle among the simulated scenarios. 
Yet, the different types of statistical approaches utilized in this study carried complementary information, and thus our results suggest that the most comprehensive evaluation of metacommunity structure can be obtained by combining them.
},
  author       = {Ovaskainen, Otso and Rybicki, Joel and Abrego, Nerea},
  issn         = {1600-0587},
  journal      = {Ecography},
  number       = {11},
  pages        = {1877--1886},
  publisher    = {Wiley},
  title        = {{What can observational data reveal about metacommunity processes?}},
  doi          = {10.1111/ecog.04444},
  volume       = {42},
  year         = {2019},
}

@article{6972,
  abstract     = {We give fault-tolerant algorithms for establishing synchrony in distributed systems in which each of the n nodes has its own clock. Our algorithms operate in a very strong fault model: we require self-stabilisation, i.e., the initial state of the system may be arbitrary, and there can be up to f < n/3 ongoing Byzantine faults, i.e., nodes that deviate from the protocol in an arbitrary manner. Furthermore, we assume that the local clocks of the nodes may progress at different speeds (clock drift) and communication has bounded delay. In this model, we study the pulse synchronisation problem, where the task is to guarantee that eventually all correct nodes generate well-separated local pulse events (i.e., unlabelled logical clock ticks) in a synchronised manner. Compared to prior work, we achieve exponential improvements in stabilisation time and the number of communicated bits, and give the first sublinear-time algorithm for the problem: • In the deterministic setting, the state-of-the-art solutions stabilise in time Θ(f) and have each node broadcast Θ(f log f) bits per time unit. We exponentially reduce the number of bits broadcasted per time unit to Θ(log f) while retaining the same stabilisation time. • In the randomised setting, the state-of-the-art solutions stabilise in time Θ(f) and have each node broadcast O(1) bits per time unit. We exponentially reduce the stabilisation time to polylog f while each node broadcasts polylog f bits per time unit. These results are obtained by means of a recursive approach reducing the above task of self-stabilising pulse synchronisation in the bounded-delay model to non-self-stabilising binary consensus in the synchronous model. In general, our approach introduces at most logarithmic overheads in terms of stabilisation time and broadcasted bits over the underlying consensus routine.},
  author       = {Lenzen, Christoph and Rybicki, Joel},
  issn         = {0004-5411},
  journal      = {Journal of the ACM},
  number       = {5},
  publisher    = {ACM},
  title        = {{Self-stabilising Byzantine clock synchronisation is almost as easy as consensus}},
  doi          = {10.1145/3339471},
  volume       = {66},
  year         = {2019},
}

@inproceedings{7122,
  abstract     = {Data-rich applications in machine-learning and control have motivated an intense research on large-scale optimization. Novel algorithms have been proposed and shown to have optimal convergence rates in terms of iteration counts. However, their practical performance is severely degraded by the cost of exchanging high-dimensional gradient vectors between computing nodes. Several gradient compression heuristics have recently been proposed to reduce communications, but few theoretical results exist that quantify how they impact algorithm convergence. This paper establishes and strengthens the convergence guarantees for gradient descent under a family of gradient compression techniques. For convex optimization problems, we derive admissible step sizes and quantify both the number of iterations and the number of bits that need to be exchanged to reach a target accuracy. Finally, we validate the performance of different gradient compression techniques in simulations. The numerical results highlight the properties of different gradient compression algorithms and confirm that fast convergence with limited information exchange is possible.},
  author       = {Khirirat, Sarit and Johansson, Mikael and Alistarh, Dan-Adrian},
  booktitle    = {2018 IEEE Conference on Decision and Control},
  isbn         = {9781538613955},
  issn         = {0743-1546},
  location     = {Miami Beach, FL, United States},
  publisher    = {IEEE},
  title        = {{Gradient compression for communication-limited convex optimization}},
  doi          = {10.1109/cdc.2018.8619625},
  year         = {2019},
}

@inproceedings{7201,
  abstract     = {Applying machine learning techniques to the quickly growing data in science and industry requires highly-scalable algorithms. Large datasets are most commonly processed "data parallel" distributed across many nodes. Each node's contribution to the overall gradient is summed using a global allreduce. This allreduce is the single communication and thus scalability bottleneck for most machine learning workloads. We observe that frequently, many gradient values are (close to) zero, leading to sparse or sparsifyable communications. To exploit this insight, we analyze, design, and implement a set of communication-efficient protocols for sparse input data, in conjunction with efficient machine learning algorithms which can leverage these primitives. Our communication protocols generalize standard collective operations, by allowing processes to contribute arbitrary sparse input data vectors. Our generic communication library, SparCML, extends MPI to support additional features, such as non-blocking (asynchronous) operations and low-precision data representations. As such, SparCML and its techniques will form the basis of future highly-scalable machine learning frameworks.},
  author       = {Renggli, Cedric and Ashkboos, Saleh and Aghagolzadeh, Mehdi and Alistarh, Dan-Adrian and Hoefler, Torsten},
  booktitle    = {International Conference for High Performance Computing, Networking, Storage and Analysis, SC},
  isbn         = {9781450362290},
  issn         = {2167-4337},
  location     = {Denver, CO, United States},
  publisher    = {ACM},
  title        = {{SparCML: High-performance sparse communication for machine learning}},
  doi          = {10.1145/3295500.3356222},
  year         = {2019},
}

@article{7214,
  abstract     = {Background: Many cancer genomes are extensively rearranged with highly aberrant chromosomal karyotypes. Structural and copy number variations in cancer genomes can be determined via abnormal mapping of sequenced reads to the reference genome. Recently it became possible to reconcile both of these types of large-scale variations into a karyotype graph representation of the rearranged cancer genomes. Such a representation, however, does not directly describe the linear and/or circular structure of the underlying rearranged cancer chromosomes, thus limiting possible analysis of cancer genomes somatic evolutionary process as well as functional genomic changes brought by the large-scale genome rearrangements.

Results: Here we address the aforementioned limitation by introducing a novel methodological framework for recovering rearranged cancer chromosomes from karyotype graphs. For a cancer karyotype graph we formulate an Eulerian Decomposition Problem (EDP) of finding a collection of linear and/or circular rearranged cancer chromosomes that are determined by the graph. We derive and prove computational complexities for several variations of the EDP. We then demonstrate that Eulerian decomposition of the cancer karyotype graphs is not always unique and present the Consistent Contig Covering Problem (CCCP) of recovering unambiguous cancer contigs from the cancer karyotype graph, and describe a novel algorithm CCR capable of solving CCCP in polynomial time. We apply CCR on a prostate cancer dataset and demonstrate that it is capable of consistently recovering large cancer contigs even when underlying cancer genomes are highly rearranged.

Conclusions: CCR can recover rearranged cancer contigs from karyotype graphs thereby addressing existing limitation in inferring chromosomal structures of rearranged cancer genomes and advancing our understanding of both patient/cancer-specific as well as the overall genetic instability in cancer.},
  author       = {Aganezov, Sergey and Zban, Ilya and Aksenov, Vitalii and Alexeev, Nikita and Schatz, Michael C.},
  issn         = {1471-2105},
  journal      = {BMC Bioinformatics},
  publisher    = {BMC},
  title        = {{Recovering rearranged cancer chromosomes from karyotype graphs}},
  doi          = {10.1186/s12859-019-3208-4},
  volume       = {20},
  year         = {2019},
}

@inproceedings{7228,
  abstract     = {Traditional concurrent programming involves manipulating shared mutable state. Alternatives to this programming style are communicating sequential processes (CSP) and actor models, which share data via explicit communication. These models have been known for almost half a century, and have recently started to gain significant traction among modern programming languages. The common abstraction for communication between several processes is the channel. Although channels are similar to producer-consumer data structures, they have different semantics and support additional operations, such as the select expression. Despite their growing popularity, most known implementations of channels use lock-based data structures and can be rather inefficient.

In this paper, we present the first efficient lock-free algorithm for implementing a communication channel for CSP programming. We provide implementations and experimental results in the Kotlin and Go programming languages. Our new algorithm outperforms existing implementations on many workloads, while providing non-blocking progress guarantee. Our design can serve as an example of how to construct general communication data structures for CSP and actor models. },
  author       = {Koval, Nikita and Alistarh, Dan-Adrian and Elizarov, Roman},
  booktitle    = {Euro-Par 2019: Parallel Processing},
  isbn         = {978-3-030-29399-4},
  issn         = {1611-3349},
  location     = {Göttingen, Germany},
  pages        = {317--333},
  publisher    = {Springer Nature},
  title        = {{Scalable FIFO channels for programming via communicating sequential processes}},
  doi          = {10.1007/978-3-030-29400-7_23},
  volume       = {11725},
  year         = {2019},
}

@inproceedings{7437,
  abstract     = {Most of today's distributed machine learning systems assume reliable networks: whenever two machines exchange information (e.g., gradients or models), the network should guarantee the delivery of the message. At the same time, recent work exhibits the impressive tolerance of machine learning algorithms to errors or noise arising from relaxed communication or synchronization. In this paper, we connect these two trends, and consider the following question: Can we design machine learning systems that are tolerant to network unreliability during training? With this motivation, we focus on a theoretical problem of independent interest-given a standard distributed parameter server architecture, if every communication between the worker and the server has a non-zero probability p of being dropped, does there exist an algorithm that still converges, and at what speed? The technical contribution of this paper is a novel theoretical analysis proving that distributed learning over unreliable network can achieve comparable convergence rate to centralized or distributed learning over reliable networks. Further, we prove that the influence of the packet drop rate diminishes with the growth of the number of parameter servers. We map this theoretical result onto a real-world scenario, training deep neural networks over an unreliable network layer, and conduct network simulation to validate the system improvement by allowing the networks to be unreliable.},
  author       = {Yu, Chen and Tang, Hanlin and Renggli, Cedric and Kassing, Simon and Singla, Ankit and Alistarh, Dan-Adrian and Zhang, Ce and Liu, Ji},
  booktitle    = {36th International Conference on Machine Learning, ICML 2019},
  isbn         = {9781510886988},
  location     = {Long Beach, CA, United States},
  pages        = {12481--12512},
  publisher    = {IMLS},
  title        = {{Distributed learning over unreliable networks}},
  volume       = {2019-June},
  year         = {2019},
}

@inproceedings{7542,
  abstract     = {We present a novel class of convolutional neural networks (CNNs) for set functions,i.e., data indexed with the powerset of a finite set. The convolutions are derivedas linear, shift-equivariant functions for various notions of shifts on set functions.The framework is fundamentally different from graph convolutions based on theLaplacian, as it provides not one but several basic shifts, one for each element inthe ground set. Prototypical experiments with several set function classificationtasks on synthetic datasets and on datasets derived from real-world hypergraphsdemonstrate the potential of our new powerset CNNs.},
  author       = {Wendler, Chris and Alistarh, Dan-Adrian and Püschel, Markus},
  booktitle    = {Advances in Neural Information Processing Systems},
  issn         = {1049-5258},
  location     = {Vancouver, Canada},
  pages        = {927--938},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{Powerset convolutional neural networks}},
  volume       = {32},
  year         = {2019},
}

@inproceedings{5947,
  abstract     = {Graph algorithms applied in many applications, including social networks, communication networks, VLSI design, graphics, and several others, require dynamic modifications - addition and removal of vertices and/or edges - in the graph. This paper presents a novel concurrent non-blocking algorithm to implement a dynamic unbounded directed graph in a shared-memory machine. The addition and removal operations of vertices and edges are lock-free. For a finite sized graph, the lookup operations are wait-free. Most significant component of the presented algorithm is the reachability query in a concurrent graph. The reachability queries in our algorithm are obstruction-free and thus impose minimal additional synchronization cost over other operations. We prove that each of the data structure operations are linearizable. We extensively evaluate a sample C/C++ implementation of the algorithm through a number of micro-benchmarks. The experimental results show that the proposed algorithm scales well with the number of threads and on an average provides 5 to 7x performance improvement over a concurrent graph implementation using coarse-grained locking.},
  author       = {Chatterjee, Bapi and Peri, Sathya and Sa, Muktikanta and Singhal, Nandini},
  booktitle    = {ACM International Conference Proceeding Series},
  isbn         = {978-1-4503-6094-4},
  location     = {Bangalore, India},
  pages        = {168--177},
  publisher    = {ACM},
  title        = {{A simple and practical concurrent non-blocking unbounded graph with linearizable reachability queries}},
  doi          = {10.1145/3288599.3288617},
  year         = {2019},
}

@inproceedings{6485,
  abstract     = {Traditional concurrent programming involves manipulating shared mutable state. Alternatives to this programming style are communicating sequential processes (CSP) [1] and actor [2] models, which share data via explicit communication. Rendezvous channel is the common abstraction for communication between several processes, where senders and receivers perform a rendezvous handshake as a part of their protocol (senders wait for receivers and vice versa). In addition to this, channels support the select expression. In this work, we present the first efficient lock-free channel algorithm, and compare it against Go [3] and Kotlin [4] baseline implementations.},
  author       = {Koval, Nikita and Alistarh, Dan-Adrian and Elizarov, Roman},
  booktitle    = {Proceedings of the 24th Symposium on Principles and Practice of Parallel Programming},
  isbn         = {9781450362252},
  location     = {Washington, DC, United States},
  pages        = {417--418},
  publisher    = {ACM Press},
  title        = {{Lock-free channels for programming via communicating sequential processes}},
  doi          = {10.1145/3293883.3297000},
  year         = {2019},
}

@inproceedings{7812,
  abstract     = {Deep neural networks (DNNs) continue to make significant advances, solving tasks from image classification to translation or reinforcement learning. One aspect of the field receiving considerable attention is efficiently executing deep models in resource-constrained environments, such as mobile or embedded devices. This paper focuses on this problem, and proposes two new compression methods, which jointly leverage weight quantization and distillation of larger teacher networks into smaller student networks. The first method we propose is called quantized distillation and leverages distillation during the training process, by incorporating distillation loss, expressed with respect to the teacher, into the training of a student network whose weights are quantized to a limited set of levels. The second method,  differentiable quantization, optimizes the location of quantization points through stochastic gradient descent, to better fit the behavior of the teacher model.  We validate both methods through experiments on convolutional and recurrent architectures. We show that quantized shallow students can reach similar accuracy levels to full-precision teacher models, while providing order of magnitude compression, and inference speedup that is linear in the depth reduction. In sum, our results enable DNNs for resource-constrained environments to leverage architecture and accuracy advances developed on more powerful devices.},
  author       = {Polino, Antonio and Pascanu, Razvan and Alistarh, Dan-Adrian},
  booktitle    = {6th International Conference on Learning Representations},
  location     = {Vancouver, Canada},
  title        = {{Model compression via distillation and quantization}},
  year         = {2018},
}

@inproceedings{85,
  abstract     = {Concurrent accesses to shared data structures must be synchronized to avoid data races. Coarse-grained synchronization, which locks the entire data structure, is easy to implement but does not scale. Fine-grained synchronization can scale well, but can be hard to reason about. Hand-over-hand locking, in which operations are pipelined as they traverse the data structure, combines fine-grained synchronization with ease of use. However, the traditional implementation suffers from inherent overheads. This paper introduces snapshot-based synchronization (SBS), a novel hand-over-hand locking mechanism. SBS decouples the synchronization state from the data, significantly improving cache utilization. Further, it relies on guarantees provided by pipelining to minimize synchronization that requires cross-thread communication. Snapshot-based synchronization thus scales much better than traditional hand-over-hand locking, while maintaining the same ease of use.},
  author       = {Gilad, Eran and Brown, Trevor A and Oskin, Mark and Etsion, Yoav},
  booktitle    = {Euro-Par 2018: Parallel Processing},
  issn         = {0302-9743},
  location     = {Turin, Italy},
  pages        = {465--479},
  publisher    = {Springer},
  title        = {{Snapshot based synchronization: A fast replacement for Hand-over-Hand locking}},
  doi          = {10.1007/978-3-319-96983-1_33},
  volume       = {11014},
  year         = {2018},
}

@inproceedings{7116,
  abstract     = {Training deep learning models has received tremendous research interest recently. In particular, there has been intensive research on reducing the communication cost of training when using multiple computational devices, through reducing the precision of the underlying data representation. Naturally, such methods induce system trade-offs—lowering communication precision could de-crease communication overheads and improve scalability; but, on the other hand, it can also reduce the accuracy of training. In this paper, we study this trade-off space, and ask:Can low-precision communication consistently improve the end-to-end performance of training modern neural networks, with no accuracy loss?From the performance point of view, the answer to this question may appear deceptively easy: compressing communication through low precision should help when the ratio between communication and computation is high. However, this answer is less straightforward when we try to generalize this principle across various neural network architectures (e.g., AlexNet vs. ResNet),number of GPUs (e.g., 2 vs. 8 GPUs), machine configurations(e.g., EC2 instances vs. NVIDIA DGX-1), communication primitives (e.g., MPI vs. NCCL), and even different GPU architectures(e.g., Kepler vs. Pascal). Currently, it is not clear how a realistic realization of all these factors maps to the speed up provided by low-precision communication. In this paper, we conduct an empirical study to answer this question and report the insights.},
  author       = {Grubic, Demjan and Tam, Leo and Alistarh, Dan-Adrian and Zhang, Ce},
  booktitle    = {Proceedings of the 21st International Conference on Extending Database Technology},
  isbn         = {9783893180783},
  issn         = {2367-2005},
  location     = {Vienna, Austria},
  pages        = {145--156},
  publisher    = {OpenProceedings},
  title        = {{Synchronous multi-GPU training for deep learning with low-precision communications: An empirical study}},
  doi          = {10.5441/002/EDBT.2018.14},
  year         = {2018},
}

@inproceedings{7123,
  abstract     = {Population protocols are a popular model of distributed computing, in which n agents with limited local state interact randomly, and cooperate to collectively compute global predicates. Inspired by recent developments in DNA programming, an extensive series of papers, across different communities, has examined the computability and complexity characteristics of this model. Majority, or consensus, is a central task in this model, in which agents need to collectively reach a decision as to which one of two states A or B had a higher initial count. Two metrics are important: the time that a protocol requires to stabilize to an output decision, and the state space size that each agent requires to do so. It is known that majority requires Ω(log log n) states per agent to allow for fast (poly-logarithmic time) stabilization, and that O(log2 n) states are sufficient. Thus, there is an exponential gap between the space upper and lower bounds for this problem. This paper addresses this question.

On the negative side, we provide a new lower bound of Ω(log n) states for any protocol which stabilizes in O(n1–c) expected time, for any constant c > 0. This result is conditional on monotonicity and output assumptions, satisfied by all known protocols. Technically, it represents a departure from previous lower bounds, in that it does not rely on the existence of dense configurations. Instead, we introduce a new generalized surgery technique to prove the existence of incorrect executions for any algorithm which would contradict the lower bound. Subsequently, our lower bound also applies to general initial configurations, including ones with a leader. On the positive side, we give a new algorithm for majority which uses O(log n) states, and stabilizes in O(log2 n) expected time. Central to the algorithm is a new leaderless phase clock technique, which allows agents to synchronize in phases of Θ(n log n) consecutive interactions using O(log n) states per agent, exploiting a new connection between population protocols and power-of-two-choices load balancing mechanisms. We also employ our phase clock to build a leader election algorithm with a state space of size O(log n), which stabilizes in O(log2 n) expected time.},
  author       = {Alistarh, Dan-Adrian and Aspnes, James and Gelashvili, Rati},
  booktitle    = {Proceedings of the 29th Annual ACM-SIAM Symposium on Discrete Algorithms},
  isbn         = {9781611975031},
  location     = {New Orleans, LA, United States},
  pages        = {2221--2239},
  publisher    = {ACM},
  title        = {{Space-optimal majority in population protocols}},
  doi          = {10.1137/1.9781611975031.144},
  year         = {2018},
}

@article{76,
  abstract     = {Consider a fully-connected synchronous distributed system consisting of n nodes, where up to f nodes may be faulty and every node starts in an arbitrary initial state. In the synchronous C-counting problem, all nodes need to eventually agree on a counter that is increased by one modulo C in each round for given C > 1. In the self-stabilising firing squad problem, the task is to eventually guarantee that all non-faulty nodes have simultaneous responses to external inputs: if a subset of the correct nodes receive an external "go" signal as input, then all correct nodes should agree on a round (in the not-too-distant future) in which to jointly output a "fire" signal. Moreover, no node should generate a "fire" signal without some correct node having previously received a "go" signal as input. We present a framework reducing both tasks to binary consensus at very small cost. For example, we obtain a deterministic algorithm for self-stabilising Byzantine firing squads with optimal resilience f < n/3, asymptotically optimal stabilisation and response time O(f), and message size O(log f). As our framework does not restrict the type of consensus routines used, we also obtain efficient randomised solutions.},
  author       = {Lenzen, Christoph and Rybicki, Joel},
  journal      = {Distributed Computing},
  publisher    = {Springer},
  title        = {{Near-optimal self-stabilising counting and firing squads}},
  doi          = {10.1007/s00446-018-0342-6},
  year         = {2018},
}

