[{"isi":1,"year":"2023","related_material":{"link":[{"url":"https://github.com/IST-DASLab/pruned-vision-model-bias","relation":"software"}]},"external_id":{"isi":["001062531308068"],"arxiv":["2304.12622"]},"ec_funded":1,"date_published":"2023-08-22T00:00:00Z","conference":{"location":"Vancouver, BC, Canada","name":"CVPR: Conference on Computer Vision and Pattern Recognition","end_date":"2023-06-24","start_date":"2023-06-17"},"acknowledgement":"The authors would like to sincerely thank Sara Hooker for her feedback during the development of this work. EI was supported in part by the FWF DK VGSCO, grant agreement number W1260-N35. AP and DA acknowledge generous ERC support, via Starting Grant 805223 ScaleML.","publication":"2023 IEEE/CVF Conference on Computer Vision and Pattern Recognition","status":"public","project":[{"_id":"9B9290DE-BA93-11EA-9121-9846C619BF3A","name":"Vienna Graduate School on Computational Optimization","grant_number":" W1260-N35"},{"call_identifier":"H2020","grant_number":"805223","name":"Elastic Coordination for Scalable Machine Learning","_id":"268A44D6-B435-11E9-9278-68D0E5697425"}],"_id":"14771","date_updated":"2024-01-10T08:59:26Z","type":"conference","article_processing_charge":"No","doi":"10.1109/cvpr52729.2023.02334","publisher":"IEEE","main_file_link":[{"open_access":"1","url":"https://doi.org/10.48550/arXiv.2304.12622"}],"quality_controlled":"1","page":"24364-24373","department":[{"_id":"DaAl"},{"_id":"ChLa"}],"month":"08","arxiv":1,"citation":{"chicago":"Iofinova, Eugenia B, Elena-Alexandra Peste, and Dan-Adrian Alistarh. “Bias in Pruned Vision Models: In-Depth Analysis and Countermeasures.” In <i>2023 IEEE/CVF Conference on Computer Vision and Pattern Recognition</i>, 24364–73. IEEE, 2023. <a href=\"https://doi.org/10.1109/cvpr52729.2023.02334\">https://doi.org/10.1109/cvpr52729.2023.02334</a>.","ista":"Iofinova EB, Peste E-A, Alistarh D-A. 2023. Bias in pruned vision models: In-depth analysis and countermeasures. 
2023 IEEE/CVF Conference on Computer Vision and Pattern Recognition. CVPR: Conference on Computer Vision and Pattern Recognition, 24364–24373.","mla":"Iofinova, Eugenia B., et al. “Bias in Pruned Vision Models: In-Depth Analysis and Countermeasures.” <i>2023 IEEE/CVF Conference on Computer Vision and Pattern Recognition</i>, IEEE, 2023, pp. 24364–73, doi:<a href=\"https://doi.org/10.1109/cvpr52729.2023.02334\">10.1109/cvpr52729.2023.02334</a>.","apa":"Iofinova, E. B., Peste, E.-A., &#38; Alistarh, D.-A. (2023). Bias in pruned vision models: In-depth analysis and countermeasures. In <i>2023 IEEE/CVF Conference on Computer Vision and Pattern Recognition</i> (pp. 24364–24373). Vancouver, BC, Canada: IEEE. <a href=\"https://doi.org/10.1109/cvpr52729.2023.02334\">https://doi.org/10.1109/cvpr52729.2023.02334</a>","ama":"Iofinova EB, Peste E-A, Alistarh D-A. Bias in pruned vision models: In-depth analysis and countermeasures. In: <i>2023 IEEE/CVF Conference on Computer Vision and Pattern Recognition</i>. IEEE; 2023:24364-24373. doi:<a href=\"https://doi.org/10.1109/cvpr52729.2023.02334\">10.1109/cvpr52729.2023.02334</a>","short":"E.B. Iofinova, E.-A. Peste, D.-A. Alistarh, in:, 2023 IEEE/CVF Conference on Computer Vision and Pattern Recognition, IEEE, 2023, pp. 24364–24373.","ieee":"E. B. Iofinova, E.-A. Peste, and D.-A. Alistarh, “Bias in pruned vision models: In-depth analysis and countermeasures,” in <i>2023 IEEE/CVF Conference on Computer Vision and Pattern Recognition</i>, Vancouver, BC, Canada, 2023, pp. 
24364–24373."},"user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","oa":1,"language":[{"iso":"eng"}],"date_created":"2024-01-10T08:42:40Z","day":"22","author":[{"first_name":"Eugenia B","orcid":"0000-0002-7778-3221","last_name":"Iofinova","full_name":"Iofinova, Eugenia B","id":"f9a17499-f6e0-11ea-865d-fdf9a3f77117"},{"id":"32D78294-F248-11E8-B48F-1D18A9856A87","full_name":"Peste, Elena-Alexandra","last_name":"Peste","first_name":"Elena-Alexandra"},{"first_name":"Dan-Adrian","orcid":"0000-0003-3650-940X","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","full_name":"Alistarh, Dan-Adrian","last_name":"Alistarh"}],"oa_version":"Preprint","title":"Bias in pruned vision models: In-depth analysis and countermeasures","publication_identifier":{"eisbn":["9798350301298"],"eissn":["2575-7075"]},"publication_status":"published","abstract":[{"text":"Pruning—that is, setting a significant subset of the parameters of a neural network to zero—is one of the most popular methods of model compression. Yet, several recent works have raised the issue that pruning may induce or exacerbate bias in the output of the compressed model. Despite existing evidence for this phenomenon, the relationship between neural network pruning and induced bias is not well-understood. In this work, we systematically investigate and characterize this phenomenon in Convolutional Neural Networks for computer vision. First, we show that it is in fact possible to obtain highly-sparse models, e.g. with less than 10% remaining weights, which do not decrease in accuracy nor substantially increase in bias when compared to dense models. At the same time, we also find that, at higher sparsities, pruned models exhibit higher uncertainty in their outputs, as well as increased correlations, which we directly link to increased bias. 
We propose easy-to-use criteria which, based only on the uncompressed model, establish whether bias will increase with pruning, and identify the samples most susceptible to biased predictions post-compression. Our code can be found at https://github.com/IST-DASLab/pruned-vision-model-bias.","lang":"eng"}]},{"main_file_link":[{"open_access":"1","url":"https://openreview.net/pdf?id=_eTZBs-yedr"}],"quality_controlled":"1","publication_status":"accepted","abstract":[{"lang":"eng","text":"Deep neural networks (DNNs) often have to be compressed, via pruning and/or quantization, before they can be deployed in practical settings. In this work we propose a new compression-aware minimizer dubbed CrAM that modifies the optimization step in a principled way, in order to produce models whose local loss behavior is stable under compression operations such as pruning. Thus, dense models trained via CrAM should be compressible post-training, in a single step, without significant accuracy loss. Experimental results on standard benchmarks, such as residual networks for ImageNet classification and BERT models for language modelling, show that CrAM produces dense models that can be more accurate than the standard SGD/Adam-based baselines, but which are stable under weight pruning: specifically, we can prune models in one-shot to 70-80% sparsity with almost no accuracy loss, and to 90% with reasonable (∼1%) accuracy loss, which is competitive with gradual compression methods. Additionally, CrAM can produce sparse models which perform well for transfer learning, and it also works for semi-structured 2:4 pruning patterns supported by GPU hardware. 
The code for reproducing the results is available at this https URL ."}],"acknowledged_ssus":[{"_id":"ScienComp"}],"date_updated":"2023-06-01T12:54:45Z","_id":"13053","type":"conference","date_created":"2023-05-23T11:36:18Z","author":[{"last_name":"Peste","id":"32D78294-F248-11E8-B48F-1D18A9856A87","full_name":"Peste, Elena-Alexandra","first_name":"Elena-Alexandra"},{"last_name":"Vladu","full_name":"Vladu, Adrian","first_name":"Adrian"},{"last_name":"Kurtic","full_name":"Kurtic, Eldar","id":"47beb3a5-07b5-11eb-9b87-b108ec578218","first_name":"Eldar"},{"last_name":"Lampert","id":"40C20FD2-F248-11E8-B48F-1D18A9856A87","full_name":"Lampert, Christoph","orcid":"0000-0001-8622-7887","first_name":"Christoph"},{"last_name":"Alistarh","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","full_name":"Alistarh, Dan-Adrian","first_name":"Dan-Adrian","orcid":"0000-0003-3650-940X"}],"article_processing_charge":"No","title":"CrAM: A Compression-Aware Minimizer","oa_version":"Preprint","citation":{"ieee":"E.-A. Peste, A. Vladu, E. Kurtic, C. Lampert, and D.-A. Alistarh, “CrAM: A Compression-Aware Minimizer,” in <i>11th International Conference on Learning Representations </i>, Kigali, Rwanda .","short":"E.-A. Peste, A. Vladu, E. Kurtic, C. Lampert, D.-A. Alistarh, in:, 11th International Conference on Learning Representations , n.d.","ama":"Peste E-A, Vladu A, Kurtic E, Lampert C, Alistarh D-A. CrAM: A Compression-Aware Minimizer. In: <i>11th International Conference on Learning Representations </i>.","apa":"Peste, E.-A., Vladu, A., Kurtic, E., Lampert, C., &#38; Alistarh, D.-A. (n.d.). CrAM: A Compression-Aware Minimizer. In <i>11th International Conference on Learning Representations </i>. Kigali, Rwanda .","mla":"Peste, Elena-Alexandra, et al. “CrAM: A Compression-Aware Minimizer.” <i>11th International Conference on Learning Representations </i>.","chicago":"Peste, Elena-Alexandra, Adrian Vladu, Eldar Kurtic, Christoph Lampert, and Dan-Adrian Alistarh. 
“CrAM: A Compression-Aware Minimizer.” In <i>11th International Conference on Learning Representations </i>, n.d.","ista":"Peste E-A, Vladu A, Kurtic E, Lampert C, Alistarh D-A. CrAM: A Compression-Aware Minimizer. 11th International Conference on Learning Representations . ICLR: International Conference on Learning Representations."},"ec_funded":1,"conference":{"end_date":"2023-05-05","start_date":"2023-05-01","name":"ICLR: International Conference on Learning Representations","location":"Kigali, Rwanda "},"date_published":"2023-05-01T00:00:00Z","acknowledgement":"AP, EK, DA received funding from the European Research Council (ERC) under the European\r\nUnion’s Horizon 2020 research and innovation programme (grant agreement No 805223 ScaleML). AV acknowledges the support of the French Agence Nationale de la Recherche (ANR), under grant ANR-21-CE48-0016 (project COMCOPT). We further acknowledge the support from the Scientific Service Units (SSU) of ISTA through resources provided by Scientific Computing (SciComp).","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","oa":1,"project":[{"call_identifier":"H2020","grant_number":"805223","name":"Elastic Coordination for Scalable Machine Learning","_id":"268A44D6-B435-11E9-9278-68D0E5697425"}],"publication":"11th International Conference on Learning Representations ","language":[{"iso":"eng"}],"status":"public","department":[{"_id":"GradSch"},{"_id":"DaAl"},{"_id":"ChLa"}],"year":"2023","external_id":{"arxiv":["2207.14200"]},"related_material":{"record":[{"status":"public","relation":"dissertation_contains","id":"13074"}]},"arxiv":1,"month":"05"},{"abstract":[{"lang":"eng","text":"Deep learning has become an integral part of a large number of important applications, and many of the recent breakthroughs have been enabled by the ability to train very large models, capable to capture complex patterns and relationships from the data. 
At the same time, the massive sizes of modern deep learning models have made their deployment to smaller devices more challenging; this is particularly important, as in many applications the users rely on accurate deep learning predictions, but they only have access to devices with limited memory and compute power. One solution to this problem is to prune neural networks, by setting as many of their parameters as possible to zero, to obtain accurate sparse models with lower memory footprint. Despite the great research progress in obtaining sparse models that preserve accuracy, while satisfying memory and computational constraints, there are still many challenges associated with efficiently training sparse models, as well as understanding their generalization properties.\r\n\r\nThe focus of this thesis is to investigate how the training process of sparse models can be made more efficient, and to understand the differences between sparse and dense models in terms of how well they can generalize to changes in the data distribution. We first study a method for co-training sparse and dense models, at a lower cost compared to regular training. With our method we can obtain very accurate sparse networks, and dense models that can recover the baseline accuracy. Furthermore, we are able to more easily analyze the differences, at prediction level, between the sparse-dense model pairs. Next, we investigate the generalization properties of sparse neural networks in more detail, by studying how well different sparse models trained on a larger task can adapt to smaller, more specialized tasks, in a transfer learning scenario. Our analysis across multiple pruning methods and sparsity levels reveals that sparse models provide features that can transfer similarly to or better than the dense baseline. 
However, the choice of the pruning method plays an important role, and can influence the results when the features are fixed (linear finetuning), or when they are allowed to adapt to the new task (full finetuning). Using sparse models with fixed masks for finetuning on new tasks has an important practical advantage, as it enables training neural networks on smaller devices. However, one drawback of current pruning methods is that the entire training cycle has to be repeated to obtain the initial sparse model, for every sparsity target; in consequence, the entire training process is costly and also multiple models need to be stored. In the last part of the thesis we propose a method that can train accurate dense models that are compressible in a single step, to multiple sparsity levels, without additional finetuning. Our method results in sparse models that can be competitive with existing pruning methods, and which can also successfully generalize to new tasks."}],"acknowledged_ssus":[{"_id":"ScienComp"}],"has_accepted_license":"1","publication_status":"published","publication_identifier":{"issn":["2663-337X"]},"file_date_updated":"2023-05-24T16:12:59Z","oa_version":"Published Version","title":"Efficiency and generalization of sparse neural networks","author":[{"first_name":"Elena-Alexandra","last_name":"Peste","full_name":"Peste, Elena-Alexandra","id":"32D78294-F248-11E8-B48F-1D18A9856A87"}],"day":"23","date_created":"2023-05-23T17:07:53Z","language":[{"iso":"eng"}],"oa":1,"user_id":"8b945eb4-e2f2-11eb-945a-df72226e66a9","citation":{"ama":"Peste E-A. Efficiency and generalization of sparse neural networks. 2023. doi:<a href=\"https://doi.org/10.15479/at:ista:13074\">10.15479/at:ista:13074</a>","short":"E.-A. Peste, Efficiency and Generalization of Sparse Neural Networks, Institute of Science and Technology Austria, 2023.","ieee":"E.-A. 
Peste, “Efficiency and generalization of sparse neural networks,” Institute of Science and Technology Austria, 2023.","chicago":"Peste, Elena-Alexandra. “Efficiency and Generalization of Sparse Neural Networks.” Institute of Science and Technology Austria, 2023. <a href=\"https://doi.org/10.15479/at:ista:13074\">https://doi.org/10.15479/at:ista:13074</a>.","ista":"Peste E-A. 2023. Efficiency and generalization of sparse neural networks. Institute of Science and Technology Austria.","mla":"Peste, Elena-Alexandra. <i>Efficiency and Generalization of Sparse Neural Networks</i>. Institute of Science and Technology Austria, 2023, doi:<a href=\"https://doi.org/10.15479/at:ista:13074\">10.15479/at:ista:13074</a>.","apa":"Peste, E.-A. (2023). <i>Efficiency and generalization of sparse neural networks</i>. Institute of Science and Technology Austria. <a href=\"https://doi.org/10.15479/at:ista:13074\">https://doi.org/10.15479/at:ista:13074</a>"},"month":"05","supervisor":[{"last_name":"Lampert","id":"40C20FD2-F248-11E8-B48F-1D18A9856A87","full_name":"Lampert, Christoph","orcid":"0000-0001-8622-7887","first_name":"Christoph"},{"last_name":"Alistarh","full_name":"Alistarh, 
Dan-Adrian","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","orcid":"0000-0003-3650-940X","first_name":"Dan-Adrian"}],"file":[{"content_type":"application/pdf","access_level":"open_access","file_name":"PhD_Thesis_Alexandra_Peste_final.pdf","success":1,"checksum":"6b3354968403cb9d48cc5a83611fb571","relation":"main_file","date_updated":"2023-05-24T16:11:16Z","creator":"epeste","date_created":"2023-05-24T16:11:16Z","file_size":2152072,"file_id":"13087"},{"file_id":"13088","date_updated":"2023-05-24T16:12:59Z","creator":"epeste","file_size":1658293,"date_created":"2023-05-24T16:12:59Z","checksum":"8d0df94bbcf4db72c991f22503b3fd60","relation":"source_file","access_level":"closed","content_type":"application/zip","file_name":"PhD_Thesis_APeste.zip"}],"department":[{"_id":"GradSch"},{"_id":"DaAl"},{"_id":"ChLa"}],"ddc":["000"],"page":"147","publisher":"Institute of Science and Technology Austria","doi":"10.15479/at:ista:13074","article_processing_charge":"No","alternative_title":["ISTA Thesis"],"type":"dissertation","date_updated":"2023-08-04T10:33:27Z","_id":"13074","project":[{"name":"International IST Doctoral Program","grant_number":"665385","call_identifier":"H2020","_id":"2564DBCA-B435-11E9-9278-68D0E5697425"},{"name":"Elastic Coordination for Scalable Machine Learning","grant_number":"805223","call_identifier":"H2020","_id":"268A44D6-B435-11E9-9278-68D0E5697425"}],"status":"public","degree_awarded":"PhD","date_published":"2023-05-23T00:00:00Z","ec_funded":1,"related_material":{"record":[{"relation":"part_of_dissertation","status":"public","id":"11458"},{"id":"13053","relation":"part_of_dissertation","status":"public"},{"id":"12299","relation":"part_of_dissertation","status":"public"}]},"year":"2023"},{"user_id":"4359f0d1-fa6c-11eb-b949-802e58b17ae8","citation":{"apa":"Iofinova, E. B., Peste, E.-A., Kurtz, M., &#38; Alistarh, D.-A. (2022). How well do sparse ImageNet models transfer? In <i>2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition</i> (pp. 
12256–12266). New Orleans, LA, United States: Institute of Electrical and Electronics Engineers. <a href=\"https://doi.org/10.1109/cvpr52688.2022.01195\">https://doi.org/10.1109/cvpr52688.2022.01195</a>","mla":"Iofinova, Eugenia B., et al. “How Well Do Sparse ImageNet Models Transfer?” <i>2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition</i>, Institute of Electrical and Electronics Engineers, 2022, pp. 12256–66, doi:<a href=\"https://doi.org/10.1109/cvpr52688.2022.01195\">10.1109/cvpr52688.2022.01195</a>.","ista":"Iofinova EB, Peste E-A, Kurtz M, Alistarh D-A. 2022. How well do sparse ImageNet models transfer? 2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition. CVPR: Computer Vision and Pattern Recognition, 12256–12266.","chicago":"Iofinova, Eugenia B, Elena-Alexandra Peste, Mark Kurtz, and Dan-Adrian Alistarh. “How Well Do Sparse ImageNet Models Transfer?” In <i>2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition</i>, 12256–66. Institute of Electrical and Electronics Engineers, 2022. <a href=\"https://doi.org/10.1109/cvpr52688.2022.01195\">https://doi.org/10.1109/cvpr52688.2022.01195</a>.","ieee":"E. B. Iofinova, E.-A. Peste, M. Kurtz, and D.-A. Alistarh, “How well do sparse ImageNet models transfer?,” in <i>2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition</i>, New Orleans, LA, United States, 2022, pp. 12256–12266.","short":"E.B. Iofinova, E.-A. Peste, M. Kurtz, D.-A. Alistarh, in:, 2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition, Institute of Electrical and Electronics Engineers, 2022, pp. 12256–12266.","ama":"Iofinova EB, Peste E-A, Kurtz M, Alistarh D-A. How well do sparse ImageNet models transfer? In: <i>2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition</i>. Institute of Electrical and Electronics Engineers; 2022:12256-12266. 
doi:<a href=\"https://doi.org/10.1109/cvpr52688.2022.01195\">10.1109/cvpr52688.2022.01195</a>"},"language":[{"iso":"eng"}],"oa":1,"department":[{"_id":"DaAl"},{"_id":"ChLa"}],"arxiv":1,"month":"09","publication_identifier":{"eissn":["2575-7075"]},"publication_status":"published","abstract":[{"text":"Transfer learning is a classic paradigm by which models pretrained on large “upstream” datasets are adapted to yield good results on “downstream” specialized datasets. Generally, more accurate models on the “upstream” dataset tend to provide better transfer accuracy “downstream”. In this work, we perform an in-depth investigation of this phenomenon in the context of convolutional neural networks (CNNs) trained on the ImageNet dataset, which have been pruned-that is, compressed by sparsifiying their connections. We consider transfer using unstructured pruned models obtained by applying several state-of-the-art pruning methods, including magnitude-based, second-order, regrowth, lottery-ticket, and regularization approaches, in the context of twelve standard transfer tasks. In a nutshell, our study shows that sparse models can match or even outperform the transfer performance of dense models, even at high sparsities, and, while doing so, can lead to significant inference and even training speedups. At the same time, we observe and analyze significant differences in the behaviour of different pruning methods. 
The code is available at: https://github.com/IST-DASLab/sparse-imagenet-transfer.","lang":"eng"}],"date_created":"2023-01-16T10:06:00Z","oa_version":"Preprint","title":"How well do sparse ImageNet models transfer?","day":"27","scopus_import":"1","author":[{"id":"f9a17499-f6e0-11ea-865d-fdf9a3f77117","full_name":"Iofinova, Eugenia B","last_name":"Iofinova","orcid":"0000-0002-7778-3221","first_name":"Eugenia B"},{"full_name":"Peste, Elena-Alexandra","id":"32D78294-F248-11E8-B48F-1D18A9856A87","last_name":"Peste","first_name":"Elena-Alexandra"},{"full_name":"Kurtz, Mark","last_name":"Kurtz","first_name":"Mark"},{"id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","full_name":"Alistarh, Dan-Adrian","last_name":"Alistarh","first_name":"Dan-Adrian","orcid":"0000-0003-3650-940X"}],"date_published":"2022-09-27T00:00:00Z","acknowledgement":"The authors would like to sincerely thank Christoph Lampert and Nir Shavit for fruitful discussions during the development of this work, and Eldar Kurtic for experimental support. 
EI was supported in part by the FWF DK VGSCO, grant agreement number W1260-N35, while AP and DA acknowledge generous support by the ERC, via Starting Grant 805223 ScaleML.","conference":{"name":"CVPR: Computer Vision and Pattern Recognition","start_date":"2022-06-18","end_date":"2022-06-24","location":"New Orleans, LA, United States"},"ec_funded":1,"publication":"2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition","status":"public","project":[{"_id":"9B9290DE-BA93-11EA-9121-9846C619BF3A","grant_number":" W1260-N35","name":"Vienna Graduate School on Computational Optimization"},{"name":"Elastic Coordination for Scalable Machine Learning","grant_number":"805223","call_identifier":"H2020","_id":"268A44D6-B435-11E9-9278-68D0E5697425"}],"related_material":{"record":[{"id":"13074","status":"public","relation":"dissertation_contains"}]},"external_id":{"isi":["000870759105034"],"arxiv":["2111.13445"]},"isi":1,"year":"2022","quality_controlled":"1","main_file_link":[{"url":"https://doi.org/10.48550/arXiv.2111.13445","open_access":"1"}],"page":"12256-12266","type":"conference","_id":"12299","date_updated":"2023-08-04T10:33:28Z","publisher":"Institute of Electrical and Electronics Engineers","article_processing_charge":"No","doi":"10.1109/cvpr52688.2022.01195"},{"quality_controlled":"1","main_file_link":[{"url":"https://proceedings.neurips.cc/paper/2021/file/48000647b315f6f00f913caa757a70b3-Paper.pdf","open_access":"1"}],"page":"8557-8570","type":"conference","date_updated":"2023-06-01T12:54:45Z","_id":"11458","publisher":"Curran Associates","article_processing_charge":"No","conference":{"name":"NeurIPS: Neural Information Processing Systems","start_date":"2021-12-06","end_date":"2021-12-14","location":"Virtual, Online"},"acknowledgement":"This project has received funding from the European Research Council (ERC) under the European Union’s Horizon 2020 research and innovation programme (grant agreement No 805223 ScaleML), and a CNRS PEPS grant. 
This research was supported by the Scientific Service Units (SSU) of IST Austria through resources provided by Scientific Computing (SciComp). We would also like to thank Christoph Lampert for his feedback on an earlier version of this work, as well as for providing hardware for the Transformer-XL experiments.","date_published":"2021-12-06T00:00:00Z","ec_funded":1,"project":[{"_id":"268A44D6-B435-11E9-9278-68D0E5697425","call_identifier":"H2020","grant_number":"805223","name":"Elastic Coordination for Scalable Machine Learning"}],"status":"public","publication":"35th Conference on Neural Information Processing Systems","external_id":{"arxiv":["2106.12379"]},"related_material":{"record":[{"id":"13074","relation":"dissertation_contains","status":"public"}]},"year":"2021","publication_status":"published","publication_identifier":{"isbn":["9781713845393"],"issn":["1049-5258"]},"abstract":[{"lang":"eng","text":"The increasing computational requirements of deep neural networks (DNNs) have led to significant interest in obtaining DNN models that are sparse, yet accurate. Recent work has investigated the even harder case of sparse training, where the DNN weights are, for as much as possible, already sparse to reduce computational costs during training. Existing sparse training methods are often empirical and can have lower accuracy relative to the dense baseline. In this paper, we present a general approach called Alternating Compressed/DeCompressed (AC/DC) training of DNNs, demonstrate convergence for a variant of the algorithm, and show that AC/DC outperforms existing sparse training methods in accuracy at similar computational budgets; at high sparsity levels, AC/DC even outperforms existing methods that rely on accurate pre-trained dense models. An important property of AC/DC is that it allows co-training of dense and sparse models, yielding accurate sparse–dense model pairs at the end of the training process. 
This is useful in practice, where compressed variants may be desirable for deployment in resource-constrained settings without re-doing the entire training flow, and also provides us with insights into the accuracy gap between dense and compressed models. The code is available at: https://github.com/IST-DASLab/ACDC."}],"intvolume":"        34","acknowledged_ssus":[{"_id":"ScienComp"}],"date_created":"2022-06-20T12:11:53Z","volume":34,"oa_version":"Published Version","title":"AC/DC: Alternating Compressed/DeCompressed training of deep neural networks","author":[{"first_name":"Elena-Alexandra","last_name":"Peste","full_name":"Peste, Elena-Alexandra","id":"32D78294-F248-11E8-B48F-1D18A9856A87"},{"first_name":"Eugenia B","orcid":"0000-0002-7778-3221","last_name":"Iofinova","id":"f9a17499-f6e0-11ea-865d-fdf9a3f77117","full_name":"Iofinova, Eugenia B"},{"first_name":"Adrian","last_name":"Vladu","full_name":"Vladu, Adrian"},{"full_name":"Alistarh, Dan-Adrian","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","last_name":"Alistarh","orcid":"0000-0003-3650-940X","first_name":"Dan-Adrian"}],"day":"6","scopus_import":"1","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","citation":{"apa":"Peste, E.-A., Iofinova, E. B., Vladu, A., &#38; Alistarh, D.-A. (2021). AC/DC: Alternating Compressed/DeCompressed training of deep neural networks. In <i>35th Conference on Neural Information Processing Systems</i> (Vol. 34, pp. 8557–8570). Virtual, Online: Curran Associates.","mla":"Peste, Elena-Alexandra, et al. “AC/DC: Alternating Compressed/DeCompressed Training of Deep Neural Networks.” <i>35th Conference on Neural Information Processing Systems</i>, vol. 34, Curran Associates, 2021, pp. 8557–70.","chicago":"Peste, Elena-Alexandra, Eugenia B Iofinova, Adrian Vladu, and Dan-Adrian Alistarh. “AC/DC: Alternating Compressed/DeCompressed Training of Deep Neural Networks.” In <i>35th Conference on Neural Information Processing Systems</i>, 34:8557–70. 
Curran Associates, 2021.","ista":"Peste E-A, Iofinova EB, Vladu A, Alistarh D-A. 2021. AC/DC: Alternating Compressed/DeCompressed training of deep neural networks. 35th Conference on Neural Information Processing Systems. NeurIPS: Neural Information Processing Systems vol. 34, 8557–8570.","ieee":"E.-A. Peste, E. B. Iofinova, A. Vladu, and D.-A. Alistarh, “AC/DC: Alternating Compressed/DeCompressed training of deep neural networks,” in <i>35th Conference on Neural Information Processing Systems</i>, Virtual, Online, 2021, vol. 34, pp. 8557–8570.","short":"E.-A. Peste, E.B. Iofinova, A. Vladu, D.-A. Alistarh, in:, 35th Conference on Neural Information Processing Systems, Curran Associates, 2021, pp. 8557–8570.","ama":"Peste E-A, Iofinova EB, Vladu A, Alistarh D-A. AC/DC: Alternating Compressed/DeCompressed training of deep neural networks. In: <i>35th Conference on Neural Information Processing Systems</i>. Vol 34. Curran Associates; 2021:8557-8570."},"language":[{"iso":"eng"}],"oa":1,"department":[{"_id":"GradSch"},{"_id":"DaAl"}],"arxiv":1,"month":"12"},{"acknowledgement":"We thank Doug Burger, Steve Scott, Marco Heddes, and the respective teams at Microsoft for inspiring discussions on the topic. We thank Angelika Steger for uplifting debates about the connections to biological brains, Sidak Pal Singh for his support regarding experimental results, and Utku Evci as well as Xin Wang for comments on previous versions of this\r\nwork. 
Special thanks go to Bernhard Schölkopf, our JMLR editor Samy Bengio, and the three anonymous reviewers who provided excellent comprehensive, pointed, and deep review comments that improved the quality of our manuscript significantly.","date_published":"2021-09-01T00:00:00Z","status":"public","publication":"Journal of Machine Learning Research","year":"2021","external_id":{"arxiv":["2102.00554"]},"main_file_link":[{"url":"https://www.jmlr.org/papers/v22/21-0366.html","open_access":"1"}],"quality_controlled":"1","page":"1-124","ddc":["000"],"date_updated":"2022-05-13T09:36:08Z","_id":"10180","type":"journal_article","article_processing_charge":"No","publisher":"Journal of Machine Learning Research","issue":"241","citation":{"chicago":"Hoefler, Torsten, Dan-Adrian Alistarh, Tal Ben-Nun, Nikoli Dryden, and Elena-Alexandra Peste. “Sparsity in Deep Learning: Pruning and Growth for Efficient Inference and Training in Neural Networks.” <i>Journal of Machine Learning Research</i>. Journal of Machine Learning Research, 2021.","ista":"Hoefler T, Alistarh D-A, Ben-Nun T, Dryden N, Peste E-A. 2021. Sparsity in deep learning: Pruning and growth for efficient inference and training in neural networks. Journal of Machine Learning Research. 22(241), 1–124.","apa":"Hoefler, T., Alistarh, D.-A., Ben-Nun, T., Dryden, N., &#38; Peste, E.-A. (2021). Sparsity in deep learning: Pruning and growth for efficient inference and training in neural networks. <i>Journal of Machine Learning Research</i>. Journal of Machine Learning Research.","mla":"Hoefler, Torsten, et al. “Sparsity in Deep Learning: Pruning and Growth for Efficient Inference and Training in Neural Networks.” <i>Journal of Machine Learning Research</i>, vol. 22, no. 241, Journal of Machine Learning Research, 2021, pp. 1–124.","ama":"Hoefler T, Alistarh D-A, Ben-Nun T, Dryden N, Peste E-A. Sparsity in deep learning: Pruning and growth for efficient inference and training in neural networks. 
<i>Journal of Machine Learning Research</i>. 2021;22(241):1-124.","ieee":"T. Hoefler, D.-A. Alistarh, T. Ben-Nun, N. Dryden, and E.-A. Peste, “Sparsity in deep learning: Pruning and growth for efficient inference and training in neural networks,” <i>Journal of Machine Learning Research</i>, vol. 22, no. 241. Journal of Machine Learning Research, pp. 1–124, 2021.","short":"T. Hoefler, D.-A. Alistarh, T. Ben-Nun, N. Dryden, E.-A. Peste, Journal of Machine Learning Research 22 (2021) 1–124."},"user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","oa":1,"language":[{"iso":"eng"}],"department":[{"_id":"DaAl"}],"file":[{"checksum":"3389d9d01fc58f8fb4c1a53e14a8abbf","relation":"main_file","content_type":"application/pdf","access_level":"open_access","file_name":"2021_JMachLearnRes_Hoefler.pdf","success":1,"file_id":"10192","creator":"cziletti","date_updated":"2021-10-27T15:34:18Z","date_created":"2021-10-27T15:34:18Z","file_size":3527521}],"arxiv":1,"month":"09","publication_status":"published","publication_identifier":{"eissn":["1533-7928"],"issn":["1532-4435"]},"file_date_updated":"2021-10-27T15:34:18Z","has_accepted_license":"1","tmp":{"name":"Creative Commons Attribution 4.0 International Public License (CC-BY 4.0)","legal_code_url":"https://creativecommons.org/licenses/by/4.0/legalcode","image":"/images/cc_by.png","short":"CC BY (4.0)"},"license":"https://creativecommons.org/licenses/by/4.0/","abstract":[{"text":"The growing energy and performance costs of deep learning have driven the community to reduce the size of neural networks by selectively pruning components. Similarly to their biological counterparts, sparse networks generalize just as well, sometimes even better than, the original dense networks. Sparsity promises to reduce the memory footprint of regular networks to fit mobile devices, as well as shorten training time for ever growing networks. 
In this paper, we survey prior work on sparsity in deep learning and provide an extensive tutorial of sparsification for both inference and training. We describe approaches to remove and add elements of neural networks, different training strategies to achieve model sparsity, and mechanisms to exploit sparsity in practice. Our work distills ideas from more than 300 research papers and provides guidance to practitioners who wish to utilize sparsity today, as well as to researchers whose goal is to push the frontier forward. We include the necessary background on mathematical methods in sparsification, describe phenomena such as early structure adaptation, the intricate relations between sparsity and the training process, and show techniques for achieving acceleration on real hardware. We also define a metric of pruned parameter efficiency that could serve as a baseline for comparison of different sparse networks. We close by speculating on how sparsity can improve future workloads and outline major open problems in the field.","lang":"eng"}],"intvolume":"        22","volume":22,"article_type":"original","date_created":"2021-10-24T22:01:34Z","author":[{"last_name":"Hoefler","full_name":"Hoefler, Torsten","first_name":"Torsten"},{"last_name":"Alistarh","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","full_name":"Alistarh, Dan-Adrian","orcid":"0000-0003-3650-940X","first_name":"Dan-Adrian"},{"full_name":"Ben-Nun, Tal","last_name":"Ben-Nun","first_name":"Tal"},{"last_name":"Dryden","full_name":"Dryden, Nikoli","first_name":"Nikoli"},{"last_name":"Peste","full_name":"Peste, Elena-Alexandra","id":"32D78294-F248-11E8-B48F-1D18A9856A87","first_name":"Elena-Alexandra"}],"day":"01","scopus_import":"1","oa_version":"Published Version","title":"Sparsity in deep learning: Pruning and growth for efficient inference and training in neural networks"}]
