@article{536,
  abstract     = {We consider the problem of consensus in the challenging classic model. In this model, the adversary is adaptive; it can choose which processors crash at any point during the course of the algorithm. Further, communication is via asynchronous message passing: there is no known upper bound on the time to send a message from one processor to another, and all messages and coin flips are seen by the adversary. We describe a new randomized consensus protocol with expected message complexity O(n² log² n) when fewer than n / 2 processes may fail by crashing. This is an almost-linear improvement over the best previously known protocol, and within logarithmic factors of a known Ω(n²) message lower bound. The protocol further ensures that no process sends more than O(n log³ n) messages in expectation, which is again within logarithmic factors of optimal. We also present a generalization of the algorithm to an arbitrary number of failures t, which uses expected O(nt + t² log² t) total messages. Our approach is to build a message-efficient, resilient mechanism for aggregating individual processor votes, implementing the message-passing equivalent of a weak shared coin. Roughly, in our protocol, a processor first announces its votes to small groups, then propagates them to increasingly larger groups as it generates more and more votes. To bound the number of messages that an individual process might have to send or receive, the protocol progressively increases the weight of generated votes. The main technical challenge is bounding the impact of votes that are still “in flight” (generated, but not fully propagated) on the final outcome of the shared coin, especially since such votes might have different weights. We achieve this by leveraging the structure of the algorithm, and a technical argument based on martingale concentration bounds.
Overall, we show that it is possible to build an efficient message-passing implementation of a shared coin, and in the process (almost-optimally) solve the classic consensus problem in the asynchronous message-passing model.},
  author       = {Alistarh, Dan-Adrian and Aspnes, James and King, Valerie and Saia, Jared},
  issn         = {0178-2770},
  journal      = {Distributed Computing},
  number       = {6},
  pages        = {489--501},
  publisher    = {Springer},
  title        = {{Communication-efficient randomized consensus}},
  doi          = {10.1007/s00446-017-0315-1},
  volume       = {31},
  year         = {2018},
}

@inproceedings{5961,
  abstract     = {The area of machine learning has made considerable progress over the past decade, enabled by the widespread availability of large datasets, as well as by improved algorithms and models. Given the large computational demands of machine learning workloads, parallelism, implemented either through single-node concurrency or through multi-node distribution, has been a third key ingredient to advances in machine learning.
The goal of this tutorial is to provide the audience with an overview of standard distribution techniques in machine learning, with an eye towards the intriguing trade-offs between synchronization and communication costs of distributed machine learning algorithms, on the one hand, and their convergence, on the other.The tutorial will focus on parallelization strategies for the fundamental stochastic gradient descent (SGD) algorithm, which is a key tool when training machine learning models, from classical instances such as linear regression, to state-of-the-art neural network architectures.
The tutorial will describe the guarantees provided by this algorithm in the sequential case, and then move on to cover both shared-memory and message-passing parallelization strategies, together with the guarantees they provide, and corresponding trade-offs. The presentation will conclude with a broad overview of ongoing research in distributed and concurrent machine learning. The tutorial will assume no prior knowledge beyond familiarity with basic concepts in algebra and analysis.
},
  author       = {Alistarh, Dan-Adrian},
  booktitle    = {Proceedings of the 2018 ACM Symposium on Principles of Distributed Computing  - PODC '18},
  isbn         = {9781450357951},
  location     = {Egham, United Kingdom},
  pages        = {487--488},
  publisher    = {ACM Press},
  title        = {{A brief tutorial on distributed and concurrent machine learning}},
  doi          = {10.1145/3212734.3212798},
  year         = {2018},
}

@inproceedings{5962,
  abstract     = {Stochastic Gradient Descent (SGD) is a fundamental algorithm in machine learning, representing the optimization backbone for training several classic models, from regression to neural networks. Given the recent practical focus on distributed machine learning, significant work has been dedicated to the convergence properties of this algorithm under the inconsistent and noisy updates arising from execution in a distributed environment. However, surprisingly, the convergence properties of this classic algorithm in the standard shared-memory model are still not well-understood. In this work, we address this gap, and provide new convergence bounds for lock-free concurrent stochastic gradient descent, executing in the classic asynchronous shared memory model, against a strong adaptive adversary. Our results give improved upper and lower bounds on the "price of asynchrony'' when executing the fundamental SGD algorithm in a concurrent setting. They show that this classic optimization tool can converge faster and with a wider range of parameters than previously known under asynchronous iterations. At the same time, we exhibit a fundamental trade-off between the maximum delay in the system and the rate at which SGD can converge, which governs the set of parameters under which this algorithm can still work efficiently.},
  author       = {Alistarh, Dan-Adrian and De Sa, Christopher and Konstantinov, Nikola H},
  booktitle    = {Proceedings of the 2018 ACM Symposium on Principles of Distributed Computing  - PODC '18},
  isbn         = {9781450357951},
  location     = {Egham, United Kingdom},
  pages        = {169--178},
  publisher    = {ACM Press},
  title        = {{The convergence of stochastic gradient descent in asynchronous shared memory}},
  doi          = {10.1145/3212734.3212763},
  year         = {2018},
}

@inproceedings{5963,
  abstract     = {There has been significant progress in understanding the parallelism inherent to iterative sequential algorithms: for many classic algorithms, the depth of the dependence structure is now well understood, and scheduling techniques have been developed to exploit this shallow dependence structure for efficient parallel implementations. A related, applied research strand has studied methods by which certain iterative task-based algorithms can be efficiently parallelized via relaxed concurrent priority schedulers. These allow for high concurrency when inserting and removing tasks, at the cost of executing superfluous work due to the relaxed semantics of the scheduler. In this work, we take a step towards unifying these two research directions, by showing that there exists a family of relaxed priority schedulers that can efficiently and deterministically execute classic iterative algorithms such as greedy maximal independent set (MIS) and matching. Our primary result shows that, given a randomized scheduler with an expected relaxation factor of k in terms of the maximum allowed priority inversions on a task, and any graph on n vertices, the scheduler is able to execute greedy MIS with only an additive factor of poly(k) expected additional iterations compared to an exact (but not scalable) scheduler. This counter-intuitive result demonstrates that the overhead of relaxation when computing MIS is not dependent on the input size or structure of the input graph. Experimental results show that this overhead can be clearly offset by the gain in performance due to the highly scalable scheduler. In sum, we present an efficient method to deterministically parallelize iterative sequential algorithms, with provable runtime guarantees in terms of the number of executed tasks to completion.},
  author       = {Alistarh, Dan-Adrian and Brown, Trevor A and Kopinsky, Justin and Nadiradze, Giorgi},
  booktitle    = {Proceedings of the 2018 ACM Symposium on Principles of Distributed Computing  - PODC '18},
  isbn         = {9781450357951},
  location     = {Egham, United Kingdom},
  pages        = {377--386},
  publisher    = {ACM Press},
  title        = {{Relaxed schedulers can efficiently parallelize iterative algorithms}},
  doi          = {10.1145/3212734.3212756},
  year         = {2018},
}

@inproceedings{5964,
  abstract     = {A standard design pattern found in many concurrent data structures, such as hash tables or ordered containers, is an alternation of parallelizable sections that incur no data conflicts and critical sections that must run sequentially and are protected with locks. A lock can be viewed as a queue that arbitrates the order in which the critical sections are executed, and a natural question is whether we can use stochastic analysis to predict the resulting throughput. As a preliminary evidence to the affirmative, we describe a simple model that can be used to predict the throughput of coarse-grained lock-based algorithms. We show that our model works well for CLH lock, and we expect it to work for other popular lock designs such as TTAS, MCS, etc.},
  author       = {Aksenov, Vitaly and Alistarh, Dan-Adrian and Kuznetsov, Petr},
  booktitle    = {Proceedings of the 2018 ACM Symposium on Principles of Distributed Computing  - PODC '18},
  isbn         = {9781450357951},
  location     = {Egham, United Kingdom},
  pages        = {411--413},
  publisher    = {ACM Press},
  title        = {{Brief Announcement: Performance prediction for coarse-grained locking}},
  doi          = {10.1145/3212734.3212785},
  year         = {2018},
}

@inproceedings{5965,
  abstract     = {Relaxed concurrent data structures have become increasingly popular, due to their scalability in graph processing and machine learning applications (\cite{Nguyen13, gonzalez2012powergraph}). Despite considerable interest, there exist families of natural, high performing randomized relaxed concurrent data structures, such as the popular MultiQueue~\cite{MQ} pattern for implementing relaxed priority queue data structures, for which no guarantees are known in the concurrent setting~\cite{AKLN17}. Our main contribution is in showing for the first time that, under a set of analytic assumptions, a family of relaxed concurrent data structures, including variants of MultiQueues, but also a new approximate counting algorithm we call the MultiCounter, provides strong probabilistic guarantees on the degree of relaxation with respect to the sequential specification, in arbitrary concurrent executions. We formalize these guarantees via a new correctness condition called distributional linearizability, tailored to concurrent implementations with randomized relaxations. Our result is based on a new analysis of an asynchronous variant of the classic power-of-two-choices load balancing algorithm, in which placement choices can be based on inconsistent, outdated information (this result may be of independent interest). We validate our results empirically, showing that the MultiCounter algorithm can implement scalable relaxed timestamps.},
  author       = {Alistarh, Dan-Adrian and Brown, Trevor A and Kopinsky, Justin and Li, Jerry Z. and Nadiradze, Giorgi},
  booktitle    = {Proceedings of the 30th ACM Symposium on Parallelism in Algorithms and Architectures - SPAA '18},
  isbn         = {9781450357999},
  location     = {Vienna, Austria},
  pages        = {133--142},
  publisher    = {ACM Press},
  title        = {{Distributionally linearizable data structures}},
  doi          = {10.1145/3210377.3210411},
  year         = {2018},
}

@inproceedings{5966,
  abstract     = {The transactional conflict problem arises in transactional systems whenever two or more concurrent transactions clash on a data item. While the standard solution to such conflicts is to immediately abort one of the transactions, some practical systems consider the alternative of delaying conflict resolution for a short interval, which may allow one of the transactions to commit. The challenge in the transactional conflict problem is to choose the optimal length of this delay interval so as to minimize the overall running time penalty for the conflicting transactions. In this paper, we propose a family of optimal online algorithms for the transactional conflict problem. Specifically, we consider variants of this problem which arise in different implementations of transactional systems, namely "requestor wins'' and "requestor aborts'' implementations: in the former, the recipient of a coherence request is aborted, whereas in the latter, it is the requestor which has to abort. Both strategies are implemented by real systems. We show that the requestor aborts case can be reduced to a classic instance of the ski rental problem, while the requestor wins case leads to a new version of this classical problem, for which we derive optimal deterministic and randomized algorithms. Moreover, we prove that, under a simplified adversarial model, our algorithms are constant-competitive with the offline optimum in terms of throughput. We validate our algorithmic results empirically through a hardware simulation of hardware transactional memory (HTM), showing that our algorithms can lead to non-trivial performance improvements for classic concurrent data structures.},
  author       = {Alistarh, Dan-Adrian and Haider, Syed Kamran and Kübler, Raphael and Nadiradze, Giorgi},
  booktitle    = {Proceedings of the 30th ACM Symposium on Parallelism in Algorithms and Architectures - SPAA '18},
  isbn         = {9781450357999},
  location     = {Vienna, Austria},
  pages        = {383--392},
  publisher    = {ACM Press},
  title        = {{The transactional conflict problem}},
  doi          = {10.1145/3210377.3210406},
  year         = {2018},
}

@article{6001,
  abstract     = {The concurrent memory reclamation problem is that of devising a way for a deallocating thread to verify that no other concurrent threads hold references to a memory block being deallocated. To date, in the absence of automatic garbage collection, there is no satisfactory solution to this problem; existing tracking methods like hazard pointers, reference counters, or epoch-based techniques like RCU are either prohibitively expensive or require significant programming expertise to the extent that implementing them efficiently can be worthy of a publication. None of the existing techniques are automatic or even semi-automated.
In this article, we take a new approach to concurrent memory reclamation. Instead of manually tracking access to memory locations as done in techniques like hazard pointers, or restricting shared accesses to specific epoch boundaries as in RCU, our algorithm, called ThreadScan, leverages operating system signaling to automatically detect which memory locations are being accessed by concurrent threads.
Initial empirical evidence shows that ThreadScan scales surprisingly well and requires negligible programming effort beyond the standard use of Malloc and Free.},
  author       = {Alistarh, Dan-Adrian and Leiserson, William and Matveev, Alexander and Shavit, Nir},
  issn         = {2329-4949},
  journal      = {ACM Transactions on Parallel Computing},
  number       = {4},
  publisher    = {Association for Computing Machinery},
  title        = {{ThreadScan: Automatic and scalable memory reclamation}},
  doi          = {10.1145/3201897},
  volume       = {4},
  year         = {2018},
}

@inproceedings{6031,
  abstract     = {We introduce Clover, a new library for efficient computation using low-precision data, providing mathematical routines required by fundamental methods in optimization and sparse recovery. Our library faithfully implements variants of stochastic quantization that guarantee convergence at low precision, and supports data formats from 4-bit quantized to 32-bit IEEE-754 on current Intel processors. In particular, we show that 4-bit can be implemented efficiently using Intel AVX despite the lack of native support for this data format. Experimental results with dot product, matrix-vector multiplication (MVM), gradient descent (GD), and iterative hard thresholding (IHT) demonstrate that the attainable speedups are in many cases close to linear with respect to the reduction of precision due to reduced data movement. Finally, for GD and IHT, we show examples of absolute speedup achieved by 4-bit versus 32-bit, by iterating until a given target error is achieved.},
  author       = {Stojanov, Alen and Smith, Tyler Michael and Alistarh, Dan-Adrian and Püschel, Markus},
  booktitle    = {2018 IEEE International Workshop on Signal Processing Systems},
  location     = {Cape Town, South Africa},
  publisher    = {IEEE},
  title        = {{Fast quantized arithmetic on x86: Trading compute for data movement}},
  doi          = {10.1109/SiPS.2018.8598402},
  volume       = {2018-October},
  year         = {2018},
}

@inproceedings{6558,
  abstract     = {This paper studies the problem of distributed stochastic optimization in an adversarial setting where, out of m machines which allegedly compute stochastic gradients every iteration, an α-fraction are Byzantine, and may behave adversarially. Our main result is a variant of stochastic gradient descent (SGD) which finds ε-approximate minimizers of convex functions in T=Õ(1/ε²m+α²/ε²) iterations. In contrast, traditional mini-batch SGD needs T=O(1/ε²m) iterations, but cannot tolerate Byzantine failures. Further, we provide a lower bound showing that, up to logarithmic factors, our algorithm is information-theoretically optimal both in terms of sample complexity and time complexity.},
  author       = {Alistarh, Dan-Adrian and Allen-Zhu, Zeyuan and Li, Jerry},
  booktitle    = {Advances in Neural Information Processing Systems},
  location     = {Montreal, Canada},
  pages        = {4613--4623},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{Byzantine stochastic gradient descent}},
  volume       = {2018},
  year         = {2018},
}

@inproceedings{6589,
  abstract     = {Distributed training of massive machine learning models, in particular deep neural networks, via Stochastic Gradient Descent (SGD) is becoming commonplace. Several families of communication-reduction methods, such as quantization, large-batch methods, and gradient sparsification, have been proposed. To date, gradient sparsification methods--where each node sorts gradients by magnitude, and only communicates a subset of the components, accumulating the rest locally--are known to yield some of the largest practical gains. Such methods can reduce the amount of communication per step by up to \emph{three orders of magnitude}, while preserving model accuracy. Yet, this family of methods currently has no theoretical justification. This is the question we address in this paper. We prove that, under analytic assumptions, sparsifying gradients by magnitude with local error correction provides convergence guarantees, for both convex and non-convex smooth objectives, for data-parallel SGD. The main insight is that sparsification methods implicitly maintain bounds on the maximum impact of stale updates, thanks to selection by magnitude. Our analysis and empirical validation also reveal that these methods do require analytical conditions to converge well, justifying existing heuristics.},
  author       = {Alistarh, Dan-Adrian and Hoefler, Torsten and Johansson, Mikael and Konstantinov, Nikola H and Khirirat, Sarit and Renggli, Cedric},
  booktitle    = {Advances in Neural Information Processing Systems 31},
  location     = {Montreal, Canada},
  pages        = {5973--5983},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{The convergence of sparsified gradient methods}},
  volume       = {2018},
  year         = {2018},
}

@inproceedings{397,
  abstract     = {Concurrent sets with range query operations are highly desirable in applications such as in-memory databases. However, few set implementations offer range queries. Known techniques for augmenting data structures with range queries (or operations that can be used to build range queries) have numerous problems that limit their usefulness. For example, they impose high overhead or rely heavily on garbage collection. In this work, we show how to augment data structures with highly efficient range queries, without relying on garbage collection. We identify a property of epoch-based memory reclamation algorithms that makes them ideal for implementing range queries, and produce three algorithms, which use locks, transactional memory and lock-free techniques, respectively. Our algorithms are applicable to more data structures than previous work, and are shown to be highly efficient on a large scale Intel system. },
  author       = {Arbel-Raviv, Maya and Brown, Trevor A},
  booktitle    = {Proceedings of the 23rd ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming - PPoPP '18},
  isbn         = {978-1-4503-4982-6},
  location     = {Vienna, Austria},
  number       = {1},
  pages        = {14--27},
  publisher    = {ACM},
  title        = {{Harnessing epoch-based reclamation for efficient range queries}},
  doi          = {10.1145/3178487.3178489},
  volume       = {53},
  year         = {2018},
}

@article{43,
  abstract     = {The initial amount of pathogens required to start an infection within a susceptible host is called the infective dose and is known to vary to a large extent between different pathogen species. We investigate the hypothesis that the differences in infective doses are explained by the mode of action in the underlying mechanism of pathogenesis: Pathogens with locally acting mechanisms tend to have smaller infective doses than pathogens with distantly acting mechanisms. While empirical evidence tends to support the hypothesis, a formal theoretical explanation has been lacking. We give simple analytical models to gain insight into this phenomenon and also investigate a stochastic, spatially explicit, mechanistic within-host model for toxin-dependent bacterial infections. The model shows that pathogens secreting locally acting toxins have smaller infective doses than pathogens secreting diffusive toxins, as hypothesized. While local pathogenetic mechanisms require smaller infective doses, pathogens with distantly acting toxins tend to spread faster and may cause more damage to the host. The proposed model can serve as a basis for the spatially explicit analysis of various virulence factors also in the context of other problems in infection dynamics.},
  author       = {Rybicki, Joel and Kisdi, Eva and Anttila, Jani},
  journal      = {Proceedings of the National Academy of Sciences},
  number       = {42},
  pages        = {10690--10695},
  publisher    = {National Academy of Sciences},
  title        = {{Model of bacterial toxin-dependent pathogenesis explains infective dose}},
  doi          = {10.1073/pnas.1721061115},
  volume       = {115},
  year         = {2018},
}

@inproceedings{791,
  abstract     = {Consider the following random process: we are given n queues, into which elements of increasing labels are inserted uniformly at random. To remove an element, we pick two queues at random, and remove the element of lower label (higher priority) among the two. The cost of a removal is the rank of the label removed, among labels still present in any of the queues, that is, the distance from the optimal choice at each step. Variants of this strategy are prevalent in state-of-the-art concurrent priority queue implementations. Nonetheless, it is not known whether such implementations provide any rank guarantees, even in a sequential model. We answer this question, showing that this strategy provides surprisingly strong guarantees: Although the single-choice process, where we always insert and remove from a single randomly chosen queue, has degrading cost, going to infinity as we increase the number of steps, in the two choice process, the expected rank of a removed element is O(n) while the expected worst-case cost is O(n log n). These bounds are tight, and hold irrespective of the number of steps for which we run the process. The argument is based on a new technical connection between ``heavily loaded'' balls-into-bins processes and priority scheduling. Our analytic results inspire a new concurrent priority queue implementation, which improves upon the state of the art in terms of practical performance.},
  author       = {Alistarh, Dan-Adrian and Kopinsky, Justin and Li, Jerry and Nadiradze, Giorgi},
  booktitle    = {Proceedings of the ACM Symposium on Principles of Distributed Computing},
  isbn         = {978-145034992-5},
  location     = {Washington, DC, USA},
  pages        = {283--292},
  publisher    = {ACM},
  title        = {{The power of choice in priority scheduling}},
  doi          = {10.1145/3087801.3087810},
  volume       = {Part F129314},
  year         = {2017},
}

@inproceedings{487,
  abstract     = {In this paper we study network architecture for unlicensed cellular networking for outdoor coverage in TV white spaces. The main technology proposed for TV white spaces is 802.11af, a Wi-Fi variant adapted for TV frequencies. However, 802.11af is originally designed for improved indoor propagation. We show that long links, typical for outdoor use, exacerbate known Wi-Fi issues, such as hidden and exposed terminal, and significantly reduce its efficiency. Instead, we propose CellFi, an alternative architecture based on LTE. LTE is designed for long-range coverage and throughput efficiency, but it is also designed to operate in tightly controlled and centrally managed networks. CellFi overcomes these problems by designing an LTE-compatible spectrum database component, mandatory for TV white space networking, and introducing an interference management component for distributed coordination. CellFi interference management is compatible with existing LTE mechanisms, requires no explicit communication between base stations, and is more efficient than CSMA for long links. We evaluate our design through extensive real world evaluation on off-the-shelf LTE equipment and simulations. We show that, compared to 802.11af, it increases coverage by 40% and reduces median flow completion times by 2.3x.},
  author       = {Baig, Ghufran and Radunovic, Bozidar and Alistarh, Dan-Adrian and Balkwill, Matthew and Karagiannis, Thomas and Qiu, Lili},
  booktitle    = {Proceedings of the 2017 13th International Conference on emerging Networking EXperiments and Technologies},
  isbn         = {978-145035422-6},
  location     = {Incheon, South Korea},
  pages        = {2--14},
  publisher    = {ACM},
  title        = {{Towards unlicensed cellular networks in TV white spaces}},
  doi          = {10.1145/3143361.3143367},
  year         = {2017},
}

@inproceedings{431,
  abstract     = {Parallel implementations of stochastic gradient descent (SGD) have received significant research attention, thanks to its excellent scalability properties. A fundamental barrier when parallelizing SGD is the high bandwidth cost of communicating gradient updates between nodes; consequently, several lossy compression heuristics have been proposed, by which nodes only communicate quantized gradients. Although effective in practice, these heuristics do not always converge. In this paper, we propose Quantized SGD (QSGD), a family of compression schemes with convergence guarantees and good practical performance. QSGD allows the user to smoothly trade off communication bandwidth and convergence time: nodes can adjust the number of bits sent per iteration, at the cost of possibly higher variance. We show that this trade-off is inherent, in the sense that improving it past some threshold would violate information-theoretic lower bounds. QSGD guarantees convergence for convex and non-convex objectives, under asynchrony, and can be extended to stochastic variance-reduced techniques. When applied to training deep neural networks for image classification and automated speech recognition, QSGD leads to significant reductions in end-to-end training time. For instance, on 16 GPUs, we can train the ResNet-152 network to full accuracy on ImageNet 1.8 × faster than the full-precision variant.},
  author       = {Alistarh, Dan-Adrian and Grubic, Demjan and Li, Jerry and Tomioka, Ryota and Vojnović, Milan},
  issn         = {10495258},
  location     = {Long Beach, CA, United States},
  pages        = {1710--1721},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{QSGD: Communication-efficient SGD via gradient quantization and encoding}},
  volume       = {2017},
  year         = {2017},
}

@inproceedings{432,
  abstract     = {Recently there has been significant interest in training machine-learning models at low precision: by reducing precision, one can reduce computation and communication by one order of magnitude. We examine training at reduced precision, both from a theoretical and practical perspective, and ask: is it possible to train models at end-to-end low precision with provable guarantees? Can this lead to consistent order-of-magnitude speedups? We mainly focus on linear models, and the answer is yes for linear models. We develop a simple framework called ZipML based on one simple but novel strategy called double sampling. Our ZipML framework is able to execute training at low precision with no bias, guaranteeing convergence, whereas naive quantization would introduce significant bias. We validate our framework across a range of applications, and show that it enables an FPGA prototype that is up to 6.5 × faster than an implementation using full 32-bit precision. We further develop a variance-optimal stochastic quantization strategy and show that it can make a significant difference in a variety of settings. When applied to linear models together with double sampling, we save up to another 1.7 × in data movement compared with uniform quantization. When training deep networks with quantized models, we achieve higher accuracy than the state-of-the-art XNOR-Net.},
  author       = {Zhang, Hantian and Li, Jerry and Kara, Kaan and Alistarh, Dan-Adrian and Liu, Ji and Zhang, Ce},
  booktitle    = {Proceedings of Machine Learning Research},
  isbn         = {978-151085514-4},
  location     = {Sydney, Australia},
  pages        = {4035--4043},
  publisher    = {ML Research Press},
  title        = {{ZipML: Training linear models with end-to-end low precision, and a little bit of deep learning}},
  volume       = {70},
  year         = {2017},
}

