@inproceedings{15011,
  abstract     = {Pruning large language models (LLMs) from the BERT family has emerged as a standard compression benchmark, and several pruning methods have been proposed for this task. The recent “Sparsity May Cry” (SMC) benchmark put into question the validity of all existing methods, exhibiting a more complex setup where many known pruning methods appear to fail. We revisit the question of accurate BERT-pruning during fine-tuning on downstream datasets, and propose a set of general guidelines for successful pruning, even on the challenging SMC benchmark. First, we perform a cost-vs-benefits analysis of pruning model components, such as the embeddings and the classification head; second, we provide a simple-yet-general way of scaling training, sparsification and learning rate schedules relative to the desired target sparsity; finally, we investigate the importance of proper parametrization for Knowledge Distillation in the context of LLMs. Our simple insights lead to state-of-the-art results, both on classic BERT-pruning benchmarks, as well as on the SMC benchmark, showing that even classic gradual magnitude pruning (GMP) can yield competitive results, with the right approach.},
  author       = {Kurtic, Eldar and Hoefler, Torsten and Alistarh, Dan-Adrian},
  booktitle    = {Proceedings of Machine Learning Research},
  issn         = {2640-3498},
  location     = {Hongkong, China},
  pages        = {542--553},
  publisher    = {ML Research Press},
  title        = {{How to prune your language model: Recovering accuracy on the "Sparsity May Cry" benchmark}},
  volume       = {234},
  year         = {2024},
}

@inproceedings{14458,
  abstract     = {We show for the first time that large-scale generative pretrained transformer (GPT) family models can be pruned to at least 50% sparsity in one-shot, without any retraining, at minimal loss of accuracy. This is achieved via a new pruning method called SparseGPT, specifically designed to work efficiently and accurately on massive GPT-family models. We can execute SparseGPT on the largest available open-source models, OPT-175B and BLOOM-176B, in under 4.5 hours, and can reach 60% unstructured sparsity with negligible increase in perplexity: remarkably, more than 100 billion weights from these models can be ignored at inference time. SparseGPT generalizes to semi-structured (2:4 and 4:8) patterns, and is compatible with weight quantization approaches. The code is available at: https://github.com/IST-DASLab/sparsegpt.},
  author       = {Frantar, Elias and Alistarh, Dan-Adrian},
  booktitle    = {Proceedings of the 40th International Conference on Machine Learning},
  issn         = {2640-3498},
  location     = {Honolulu, Hawaii, HI, United States},
  pages        = {10323--10337},
  publisher    = {ML Research Press},
  title        = {{SparseGPT: Massive language models can be accurately pruned in one-shot}},
  volume       = {202},
  year         = {2023},
}

@inproceedings{14459,
  abstract     = {Autoencoders are a popular model in many branches of machine learning and lossy data compression. However, their fundamental limits, the performance of gradient methods and the features learnt during optimization remain poorly understood, even in the two-layer setting. In fact, earlier work has considered either linear autoencoders or specific training regimes (leading to vanishing or diverging compression rates). Our paper addresses this gap by focusing on non-linear two-layer autoencoders trained in the challenging proportional regime in which the input dimension scales linearly with the size of the representation. Our results characterize the minimizers of the population risk, and show that such minimizers are achieved by gradient methods; their structure is also unveiled, thus leading to a concise description of the features obtained via training. For the special case of a sign activation function, our analysis establishes the fundamental limits for the lossy compression of Gaussian sources via (shallow) autoencoders. Finally, while the results are proved for Gaussian data, numerical simulations on standard datasets display the universality of the theoretical predictions.},
  author       = {Shevchenko, Aleksandr and Kögler, Kevin and Hassani, Hamed and Mondelli, Marco},
  booktitle    = {Proceedings of the 40th International Conference on Machine Learning},
  issn         = {2640-3498},
  location     = {Honolulu, Hawaii, HI, United States},
  pages        = {31151--31209},
  publisher    = {ML Research Press},
  title        = {{Fundamental limits of two-layer autoencoders, and achieving them with gradient methods}},
  volume       = {202},
  year         = {2023},
}

@inproceedings{14460,
  abstract     = {We provide an efficient implementation of the backpropagation algorithm, specialized to the case where the weights of the neural network being trained are sparse. Our algorithm is general, as it applies to arbitrary (unstructured) sparsity and common layer types (e.g., convolutional or linear). We provide a fast vectorized implementation on commodity CPUs, and show that it can yield speedups in end-to-end runtime experiments, both in transfer learning using already-sparsified networks, and in training sparse networks from scratch. Thus, our results provide the first support for sparse training on commodity hardware.},
  author       = {Nikdan, Mahdi and Pegolotti, Tommaso and Iofinova, Eugenia B and Kurtic, Eldar and Alistarh, Dan-Adrian},
  booktitle    = {Proceedings of the 40th International Conference on Machine Learning},
  issn         = {2640-3498},
  location     = {Honolulu, Hawaii, HI, United States},
  pages        = {26215--26227},
  publisher    = {ML Research Press},
  title        = {{SparseProp: Efficient sparse backpropagation for faster training of neural networks at the edge}},
  volume       = {202},
  year         = {2023},
}

@inproceedings{14461,
  abstract     = {Communication-reduction techniques are a popular way to improve scalability in data-parallel training of deep neural networks (DNNs). The recent emergence of large language models such as GPT has created the need for new approaches to exploit data-parallelism. Among these, fully-sharded data parallel (FSDP) training is highly popular, yet it still encounters scalability bottlenecks. One reason is that applying compression techniques to FSDP is challenging: as the vast majority of the communication involves the model’s weights, direct compression alters convergence and leads to accuracy loss. We present QSDP, a variant of FSDP which supports both gradient and weight quantization with theoretical guarantees, is simple to implement and has essentially no overheads. To derive QSDP we prove that a natural modification of SGD achieves convergence even when we only maintain quantized weights, and thus the domain over which we train consists of quantized points and is, therefore, highly non-convex. We validate this approach by training GPT-family models with up to 1.3 billion parameters on a multi-node cluster. Experiments show that QSDP preserves model accuracy, while completely removing the communication bottlenecks of FSDP, providing end-to-end speedups of up to 2.2x.},
  author       = {Markov, Ilia and Vladu, Adrian and Guo, Qi and Alistarh, Dan-Adrian},
  booktitle    = {Proceedings of the 40th International Conference on Machine Learning},
  issn         = {2640-3498},
  location     = {Honolulu, Hawaii, HI, United States},
  pages        = {24020--24044},
  publisher    = {ML Research Press},
  title        = {{Quantized distributed training of large models with convergence guarantees}},
  volume       = {202},
  year         = {2023},
}

@inproceedings{14462,
  abstract     = {We study fine-grained error bounds for differentially private algorithms for counting under continual observation. Our main insight is that the matrix mechanism when using lower-triangular matrices can be used in the continual observation model. More specifically, we give an explicit factorization for the counting matrix Mcount and upper bound the error explicitly. We also give a fine-grained analysis, specifying the exact constant in the upper bound. Our analysis is based on upper and lower bounds of the completely bounded norm (cb-norm) of Mcount
. Along the way, we improve the best-known bound of 28 years by Mathias (SIAM Journal on Matrix Analysis and Applications, 1993) on the cb-norm of Mcount for a large range of the dimension of Mcount. Furthermore, we are the first to give concrete error bounds for various problems under continual observation such as binary counting, maintaining a histogram, releasing an approximately cut-preserving synthetic graph, many graph-based statistics, and substring and episode counting. Finally, we note that our result can be used to get a fine-grained error bound for non-interactive local learning and the first lower bounds on the additive error for (ϵ,δ)-differentially-private counting under continual observation. Subsequent to this work, Henzinger et al. (SODA, 2023) showed that our factorization also achieves fine-grained mean-squared error.},
  author       = {Fichtenberger, Hendrik and Henzinger, Monika H and Upadhyay, Jalaj},
  booktitle    = {Proceedings of the 40th International Conference on Machine Learning},
  issn         = {2640-3498},
  location     = {Honolulu, Hawaii, HI, United States},
  pages        = {10072--10092},
  publisher    = {ML Research Press},
  title        = {{Constant matters: Fine-grained error bound on differentially private continual observation}},
  volume       = {202},
  year         = {2023},
}

@inproceedings{13239,
  abstract     = {Brains are thought to engage in predictive learning - learning to predict upcoming stimuli - to construct an internal model of their environment. This is especially notable for spatial navigation, as first described by Tolman’s latent learning tasks. However, predictive learning has also been observed in sensory cortex, in settings unrelated to spatial navigation. Apart from normative frameworks such as active inference or efficient coding, what could be the utility of learning to predict the patterns of occurrence of correlated stimuli? Here we show that prediction, and thereby the construction of an internal model of sequential stimuli, can bootstrap the learning process of a working memory task in a recurrent neural network. We implemented predictive learning alongside working memory match-tasks, and networks emerged to solve the prediction task first by encoding information across time to predict upcoming stimuli, and then eavesdropped on this solution to solve the matching task. Eavesdropping was most beneficial when neural resources were limited. Hence, predictive learning acts as a general neural mechanism to learn to store sensory information that can later be essential for working memory tasks.},
  author       = {Van Der Plas, Thijs L. and Vogels, Tim P and Manohar, Sanjay G.},
  booktitle    = {Proceedings of Machine Learning Research},
  issn         = {2640-3498},
  pages        = {518--531},
  publisher    = {ML Research Press},
  title        = {{Predictive learning enables neural networks to learn complex working memory tasks}},
  volume       = {199},
  year         = {2022},
}

@inproceedings{13241,
  abstract     = {Addressing fairness concerns about machine learning models is a crucial step towards their long-term adoption in real-world automated systems. Many approaches for training fair models from data have been developed and an implicit assumption about such algorithms is that they are able to recover a fair model, despite potential historical biases in the data. In this work we show a number of impossibility results that indicate that there is no learning algorithm that can recover a fair model when a proportion of the dataset is subject to arbitrary manipulations. Specifically, we prove that there are situations in which an adversary can force any learner to return a biased classifier, with or without degrading accuracy, and that the strength of this bias increases for learning problems with underrepresented protected groups in the data. Our results emphasize on the importance of studying further data corruption models of various strength and of establishing stricter data collection practices for fairness-aware learning.},
  author       = {Konstantinov, Nikola H and Lampert, Christoph},
  booktitle    = {Proceedings of Machine Learning Research},
  issn         = {2640-3498},
  pages        = {59--83},
  publisher    = {ML Research Press},
  title        = {{On the impossibility of fairness-aware learning from corrupted data}},
  volume       = {171},
  year         = {2022},
}

@inproceedings{13146,
  abstract     = {A recent line of work has analyzed the theoretical properties of deep neural networks via the Neural Tangent Kernel (NTK). In particular, the smallest eigenvalue of the NTK has been related to the memorization capacity, the global convergence of gradient descent algorithms and the generalization of deep nets. However, existing results either provide bounds in the two-layer setting or assume that the spectrum of the NTK matrices is bounded away from 0 for multi-layer networks. In this paper, we provide tight bounds on the smallest eigenvalue of NTK matrices for deep ReLU nets, both in the limiting case of infinite widths and for finite widths. In the finite-width setting, the network architectures we consider are fairly general: we require the existence of a wide layer with roughly order of N neurons, N being the number of data samples; and the scaling of the remaining layer widths is arbitrary (up to logarithmic factors). To obtain our results, we analyze various quantities of independent interest: we give lower bounds on the smallest singular value of hidden feature matrices, and upper bounds on the Lipschitz constant of input-output feature maps.},
  author       = {Nguyen, Quynh and Mondelli, Marco and Montufar, Guido},
  booktitle    = {Proceedings of the 38th International Conference on Machine Learning},
  isbn         = {9781713845065},
  issn         = {2640-3498},
  location     = {Virtual},
  pages        = {8119--8129},
  publisher    = {ML Research Press},
  title        = {{Tight bounds on the smallest Eigenvalue of the neural tangent kernel for deep ReLU networks}},
  volume       = {139},
  year         = {2021},
}

@inproceedings{13147,
  abstract     = {We investigate fast and communication-efficient algorithms for the classic problem of minimizing a sum of strongly convex and smooth functions that are distributed among n
 different nodes, which can communicate using a limited number of bits. Most previous communication-efficient approaches for this problem are limited to first-order optimization, and therefore have \emph{linear} dependence on the condition number in their communication complexity. We show that this dependence is not inherent: communication-efficient methods can in fact have sublinear dependence on the condition number. For this, we design and analyze the first communication-efficient distributed variants of preconditioned gradient descent for Generalized Linear Models, and for Newton’s method. Our results rely on a new technique for quantizing both the preconditioner and the descent direction at each step of the algorithms, while controlling their convergence rate. We also validate our findings experimentally, showing faster convergence and reduced communication relative to previous methods.},
  author       = {Alimisis, Foivos and Davies, Peter and Alistarh, Dan-Adrian},
  booktitle    = {Proceedings of the 38th International Conference on Machine Learning},
  isbn         = {9781713845065},
  issn         = {2640-3498},
  location     = {Virtual},
  pages        = {196--206},
  publisher    = {ML Research Press},
  title        = {{Communication-efficient distributed optimization with quantized preconditioners}},
  volume       = {139},
  year         = {2021},
}

@inproceedings{11651,
  abstract     = {Diffusions and related random walk procedures are of central importance in many areas of machine learning, data analysis, and applied mathematics. Because they spread mass agnostically at each step in an iterative manner, they can sometimes spread mass “too aggressively,” thereby failing to find the “right” clusters. We introduce a novel Capacity Releasing Diffusion (CRD) Process, which is both faster and stays more local than the classical spectral diffusion process. As an application, we use our CRD Process to develop an improved local algorithm for graph clustering. Our local graph clustering method can find local clusters in a model of clustering where one begins the CRD Process in a cluster whose vertices are connected better internally than externally by an O(log2n) factor, where n is the number of nodes in the cluster. Thus, our CRD Process is the first local graph clustering algorithm that is not subject to the well-known quadratic Cheeger barrier. Our result requires a certain smoothness condition, which we expect to be an artifact of our analysis. Our empirical evaluation demonstrates improved results, in particular for realistic social graphs where there are moderately good—but not very good—clusters.},
  author       = {Wang, Di and Fountoulakis, Kimon and Henzinger, Monika H and Mahoney, Michael W. and Rao ,  Satish},
  booktitle    = {Proceedings of the 34th International Conference on Machine Learning},
  issn         = {2640-3498},
  location     = {Sydney, Australia},
  pages        = {3598--3607},
  publisher    = {ML Research Press},
  title        = {{Capacity releasing diffusion for speed and locality}},
  volume       = {70},
  year         = {2017},
}

