@inproceedings{14459,
  abstract     = {Autoencoders are a popular model in many branches of machine learning and lossy data compression. However, their fundamental limits, the performance of gradient methods and the features learnt during optimization remain poorly understood, even in the two-layer setting. In fact, earlier work has considered either linear autoencoders or specific training regimes (leading to vanishing or diverging compression rates). Our paper addresses this gap by focusing on non-linear two-layer autoencoders trained in the challenging proportional regime in which the input dimension scales linearly with the size of the representation. Our results characterize the minimizers of the population risk, and show that such minimizers are achieved by gradient methods; their structure is also unveiled, thus leading to a concise description of the features obtained via training. For the special case of a sign activation function, our analysis establishes the fundamental limits for the lossy compression of Gaussian sources via (shallow) autoencoders. Finally, while the results are proved for Gaussian data, numerical simulations on standard datasets display the universality of the theoretical predictions.},
  author       = {Shevchenko, Aleksandr and Kögler, Kevin and Hassani, Hamed and Mondelli, Marco},
  booktitle    = {Proceedings of the 40th International Conference on Machine Learning},
  issn         = {2640-3498},
  location     = {Honolulu, HI, United States},
  pages        = {31151--31209},
  publisher    = {ML Research Press},
  title        = {{Fundamental limits of two-layer autoencoders, and achieving them with gradient methods}},
  volume       = {202},
  year         = {2023},
}

@inproceedings{14921,
  abstract     = {Neural collapse (NC) refers to the surprising structure of the last layer of deep neural networks in the terminal phase of gradient descent training. Recently, an increasing amount of experimental evidence has pointed to the propagation of NC to earlier layers of neural networks. However, while the NC in the last layer is well studied theoretically, much less is known about its multi-layered counterpart - deep neural collapse (DNC). In particular, existing work focuses either on linear layers or only on the last two layers at the price of an extra assumption. Our paper fills this gap by generalizing the established analytical framework for NC - the unconstrained features model - to multiple non-linear layers. Our key technical contribution is to show that, in a deep unconstrained features model, the unique global optimum for binary classification exhibits all the properties typical of DNC. This explains the existing experimental evidence of DNC. We also empirically show that (i) by optimizing deep unconstrained features models via gradient descent, the resulting solution agrees well with our theory, and (ii) trained networks recover the unconstrained features suitable for the occurrence of DNC, thus supporting the validity of this modeling principle.},
  author       = {Súkeník, Peter and Mondelli, Marco and Lampert, Christoph},
  booktitle    = {37th Annual Conference on Neural Information Processing Systems},
  location     = {New Orleans, LA, United States},
  title        = {{Deep neural collapse is provably optimal for the deep unconstrained features model}},
  year         = {2023},
}

@inproceedings{14922,
  abstract     = {We propose a novel approach to concentration for non-independent random variables. The main idea is to ``pretend'' that the random variables are independent and pay a multiplicative price measuring how far they are from actually being independent. This price is encapsulated in the Hellinger integral between the joint and the product of the marginals, which is then upper bounded leveraging tensorisation properties. Our bounds represent a natural generalisation of concentration inequalities in the presence of dependence: we recover exactly the classical bounds (McDiarmid's inequality) when the random variables are independent. Furthermore, in a ``large deviations'' regime, we obtain the same decay in the probability as for the independent case, even when the random variables display non-trivial dependencies. To show this, we consider a number of applications of interest. First, we provide a bound for Markov chains with finite state space. Then, we consider the Simple Symmetric Random Walk, which is a non-contracting Markov chain, and a non-Markovian setting in which the stochastic process depends on its entire past. To conclude, we propose an application to Markov Chain Monte Carlo methods, where our approach leads to an improved lower bound on the minimum burn-in period required to reach a certain accuracy. In all of these settings, we provide a regime of parameters in which our bound fares better than what the state of the art can provide.},
  author       = {Esposito, Amedeo Roberto and Mondelli, Marco},
  booktitle    = {Proceedings of 2023 IEEE International Symposium on Information Theory},
  location     = {Taipei, Taiwan},
  publisher    = {Institute of Electrical and Electronics Engineers},
  title        = {{Concentration without independence via information measures}},
  doi          = {10.1109/isit54713.2023.10206899},
  year         = {2023},
}

@article{14924,
  abstract     = {The stochastic heavy ball method (SHB), also known as stochastic gradient descent (SGD) with Polyak's momentum, is widely used in training neural networks. However, despite the remarkable success of such an algorithm in practice, its theoretical characterization remains limited. In this paper, we focus on neural networks with two and three layers and provide a rigorous understanding of the properties of the solutions found by SHB: \emph{(i)} stability after dropping out part of the neurons, \emph{(ii)} connectivity along a low-loss path, and \emph{(iii)} convergence to the global optimum. To achieve this goal, we take a mean-field view and relate the SHB dynamics to a certain partial differential equation in the limit of large network widths. This mean-field perspective has inspired a recent line of work focusing on SGD while, in contrast, our paper considers an algorithm with momentum. More specifically, after proving existence and uniqueness of the limit differential equations, we show convergence to the global optimum and give a quantitative bound between the mean-field limit and the SHB dynamics of a finite-width network. Armed with this last bound, we are able to establish the dropout-stability and connectivity of SHB solutions.},
  author       = {Wu, Diyuan and Kungurtsev, Vyacheslav and Mondelli, Marco},
  journal      = {Transactions on Machine Learning Research},
  publisher    = {ML Research Press},
  title        = {{Mean-field analysis for heavy ball methods: Dropout-stability, connectivity, and global convergence}},
  year         = {2023},
}

@article{13315,
  abstract     = {How do statistical dependencies in measurement noise influence high-dimensional inference? To answer this, we study the paradigmatic spiked matrix model of principal components analysis (PCA), where a rank-one matrix is corrupted by additive noise. We go beyond the usual independence assumption on the noise entries, by drawing the noise from a low-order polynomial orthogonal matrix ensemble. The resulting noise correlations make the setting relevant for applications but analytically challenging. We provide a characterization of the Bayes-optimal limits of inference in this model. If the spike is rotation invariant, we show that standard spectral PCA is optimal. However, for more general priors, both PCA and the existing approximate message-passing algorithm (AMP) fall short of achieving the information-theoretic limits, which we compute using the replica method from statistical physics. We thus propose an AMP, inspired by the theory of adaptive Thouless–Anderson–Palmer equations, which is empirically observed to saturate the conjectured theoretical limit. This AMP comes with a rigorous state evolution analysis tracking its performance. Although we focus on specific noise distributions, our methodology can be generalized to a wide class of trace matrix ensembles at the cost of more involved expressions. Finally, despite the seemingly strong assumption of rotation-invariant noise, our theory empirically predicts algorithmic performance on real data, pointing at strong universality properties.},
  author       = {Barbier, Jean and Camilli, Francesco and Mondelli, Marco and Sáenz, Manuel},
  issn         = {1091-6490},
  journal      = {Proceedings of the National Academy of Sciences of the United States of America},
  number       = {30},
  publisher    = {National Academy of Sciences},
  title        = {{Fundamental limits in structured principal component analysis and how to reach them}},
  doi          = {10.1073/pnas.2302028120},
  volume       = {120},
  year         = {2023},
}

@inproceedings{13321,
  abstract     = {We consider the problem of reconstructing the signal and the hidden variables from observations coming from a multi-layer network with rotationally invariant weight matrices. The multi-layer structure models inference from deep generative priors, and the rotational invariance imposed on the weights generalizes the i.i.d. Gaussian assumption by allowing for a complex correlation structure, which is typical in applications. In this work, we present a new class of approximate message passing (AMP) algorithms and give a state evolution recursion which precisely characterizes their performance in the large system limit. In contrast with the existing multi-layer VAMP (ML-VAMP) approach, our proposed AMP – dubbed multilayer rotationally invariant generalized AMP (ML-RI-GAMP) – provides a natural generalization beyond Gaussian designs, in the sense that it recovers the existing Gaussian AMP as a special case. Furthermore, ML-RI-GAMP exhibits a significantly lower complexity than ML-VAMP, as the computationally intensive singular value decomposition is replaced by an estimation of the moments of the design matrices. Finally, our numerical results show that this complexity gain comes at little to no cost in the performance of the algorithm.},
  author       = {Xu, Yizhou and Hou, Tian Qi and Liang, Shan Suo and Mondelli, Marco},
  booktitle    = {2023 IEEE Information Theory Workshop},
  isbn         = {9798350301496},
  issn         = {2475-4218},
  location     = {Saint-Malo, France},
  pages        = {294--298},
  publisher    = {Institute of Electrical and Electronics Engineers},
  title        = {{Approximate message passing for multi-layer estimation in rotationally invariant models}},
  doi          = {10.1109/ITW55543.2023.10160238},
  year         = {2023},
}

@inproceedings{12859,
  abstract     = {Machine learning models are vulnerable to adversarial perturbations, and a thought-provoking paper by Bubeck and Sellke has analyzed this phenomenon through the lens of over-parameterization: smoothly interpolating the data requires significantly more parameters than simply memorizing it. However, this "universal" law provides only a necessary condition for robustness, and it is unable to discriminate between models. In this paper, we address these gaps by focusing on empirical risk minimization in two prototypical settings, namely, random features and the neural tangent kernel (NTK). We prove that, for random features, the model is not robust for any degree of over-parameterization, even when the necessary condition coming from the universal law of robustness is satisfied. In contrast, for even activations, the NTK model meets the universal lower bound, and it is robust as soon as the necessary condition on over-parameterization is fulfilled. This also addresses a conjecture in prior work by Bubeck, Li and Nagaraj. Our analysis decouples the effect of the kernel of the model from an "interaction matrix", which describes the interaction with the test data and captures the effect of the activation. Our theoretical results are corroborated by numerical evidence on both synthetic and standard datasets (MNIST, CIFAR-10).},
  author       = {Bombari, Simone and Kiyani, Shayan and Mondelli, Marco},
  booktitle    = {Proceedings of the 40th International Conference on Machine Learning},
  location     = {Honolulu, HI, United States},
  pages        = {2738--2776},
  publisher    = {ML Research Press},
  title        = {{Beyond the universal law of robustness: Sharper laws for random features and neural tangent kernels}},
  volume       = {202},
  year         = {2023},
}

@article{11420,
  abstract     = {Understanding the properties of neural networks trained via stochastic gradient descent (SGD) is at the heart of the theory of deep learning. In this work, we take a mean-field view, and consider a two-layer ReLU network trained via noisy-SGD for a univariate regularized regression problem. Our main result is that SGD with vanishingly small noise injected in the gradients is biased towards a simple solution: at convergence, the ReLU network implements a piecewise linear map of the inputs, and the number of “knot” points -- i.e., points where the tangent of the ReLU network estimator changes -- between two consecutive training inputs is at most three. In particular, as the number of neurons of the network grows, the SGD dynamics is captured by the solution of a gradient flow and, at convergence, the distribution of the weights approaches the unique minimizer of a related free energy, which has a Gibbs form. Our key technical contribution consists in the analysis of the estimator resulting from this minimizer: we show that its second derivative vanishes everywhere, except at some specific locations which represent the “knot” points. We also provide empirical evidence that knots at locations distinct from the data points might occur, as predicted by our theory.},
  author       = {Shevchenko, Aleksandr and Kungurtsev, Vyacheslav and Mondelli, Marco},
  issn         = {1533-7928},
  journal      = {Journal of Machine Learning Research},
  number       = {130},
  pages        = {1--55},
  publisher    = {Journal of Machine Learning Research},
  title        = {{Mean-field analysis of piecewise linear solutions for wide ReLU networks}},
  volume       = {23},
  year         = {2022},
}

@article{10364,
  abstract     = {This paper characterizes the latency of the simplified successive-cancellation (SSC) decoding scheme for polar codes under hardware resource constraints. In particular, when the number of processing elements $P$ that can perform SSC decoding operations in parallel is limited, as is the case in practice, the latency of SSC decoding is $O(N^{1-1/\mu} + \frac{N}{P} \log_2 \log_2 \frac{N}{P})$, where $N$ is the block length of the code and $\mu$ is the scaling exponent of the channel. Three direct consequences of this bound are presented. First, in a fully-parallel implementation where $P = N/2$, the latency of SSC decoding is $O(N^{1-1/\mu})$, which is sublinear in the block length. This recovers a result from our earlier work. Second, in a fully-serial implementation where $P = 1$, the latency of SSC decoding scales as $O(N \log_2 \log_2 N)$. The multiplicative constant is also calculated: we show that the latency of SSC decoding when $P = 1$ is given by $(2 + o(1)) N \log_2 \log_2 N$. Third, in a semi-parallel implementation, the smallest $P$ that gives the same latency as that of the fully-parallel implementation is $P = N^{1/\mu}$. The tightness of our bound on SSC decoding latency and the applicability of the foregoing results are validated through extensive simulations.},
  author       = {Hashemi, Seyyed Ali and Mondelli, Marco and Fazeli, Arman and Vardy, Alexander and Cioffi, John and Goldsmith, Andrea},
  issn         = {1558-2248},
  journal      = {IEEE Transactions on Wireless Communications},
  number       = {6},
  pages        = {3909--3920},
  publisher    = {Institute of Electrical and Electronics Engineers},
  title        = {{Parallelism versus latency in simplified successive-cancellation decoding of polar codes}},
  doi          = {10.1109/TWC.2021.3125626},
  volume       = {21},
  year         = {2022},
}

@inproceedings{12016,
  abstract     = {We consider the problem of coded distributed computing using polar codes. The average execution time of a coded computing system is related to the error probability for transmission over the binary erasure channel in recent work by Soleymani, Jamali and Mahdavifar, where the performance of binary linear codes is investigated. In this paper, we focus on polar codes and unveil a connection between the average execution time and the scaling exponent $\mu$ of the family of codes. In the finite-length characterization of polar codes, the scaling exponent is a key object capturing the speed of convergence to capacity. In particular, we show that (i) the gap between the normalized average execution time of polar codes and that of optimal MDS codes is $O(n^{-1/\mu})$, and (ii) this upper bound can be improved to roughly $O(n^{-1/2})$ by considering polar codes with large kernels. We conjecture that these bounds could be improved to $O(n^{-2/\mu})$ and $O(n^{-1})$, respectively, and provide a heuristic argument as well as numerical evidence supporting this view.},
  author       = {Fathollahi, Dorsa and Mondelli, Marco},
  booktitle    = {2022 IEEE International Symposium on Information Theory},
  isbn         = {9781665421591},
  issn         = {2157-8095},
  location     = {Espoo, Finland},
  pages        = {2154--2159},
  publisher    = {Institute of Electrical and Electronics Engineers},
  title        = {{Polar coded computing: The role of the scaling exponent}},
  doi          = {10.1109/ISIT50566.2022.9834712},
  volume       = {2022},
  year         = {2022},
}

@article{12480,
  abstract     = {We consider the problem of estimating a signal from measurements obtained via a generalized linear model. We focus on estimators based on approximate message passing (AMP), a family of iterative algorithms with many appealing features: the performance of AMP in the high-dimensional limit can be succinctly characterized under suitable model assumptions; AMP can also be tailored to the empirical distribution of the signal entries, and for a wide class of estimation problems, AMP is conjectured to be optimal among all polynomial-time algorithms. However, a major issue of AMP is that in many models (such as phase retrieval), it requires an initialization correlated with the ground-truth signal and independent from the measurement matrix. Assuming that such an initialization is available is typically not realistic. In this paper, we solve this problem by proposing an AMP algorithm initialized with a spectral estimator. With such an initialization, the standard AMP analysis fails since the spectral estimator depends in a complicated way on the design matrix. Our main contribution is a rigorous characterization of the performance of AMP with spectral initialization in the high-dimensional limit. The key technical idea is to define and analyze a two-phase artificial AMP algorithm that first produces the spectral estimator, and then closely approximates the iterates of the true AMP. We also provide numerical results that demonstrate the validity of the proposed approach.},
  author       = {Mondelli, Marco and Venkataramanan, Ramji},
  issn         = {1742-5468},
  journal      = {Journal of Statistical Mechanics: Theory and Experiment},
  keywords     = {Statistics, Probability and Uncertainty, Statistics and Probability, Statistical and Nonlinear Physics},
  number       = {11},
  publisher    = {IOP Publishing},
  title        = {{Approximate message passing with spectral initialization for generalized linear models}},
  doi          = {10.1088/1742-5468/ac9828},
  volume       = {2022},
  year         = {2022},
}

@inproceedings{12537,
  abstract     = {The Neural Tangent Kernel (NTK) has emerged as a powerful tool to provide memorization, optimization and generalization guarantees in deep neural networks. A line of work has studied the NTK spectrum for two-layer and deep networks with at least a layer with $\Omega(N)$ neurons, $N$ being the number of training samples. Furthermore, there is increasing evidence suggesting that deep networks with sub-linear layer widths are powerful memorizers and optimizers, as long as the number of parameters exceeds the number of samples. Thus, a natural open question is whether the NTK is well conditioned in such a challenging sub-linear setup. In this paper, we answer this question in the affirmative. Our key technical contribution is a lower bound on the smallest NTK eigenvalue for deep networks with the minimum possible over-parameterization: the number of parameters is roughly $\Omega(N)$ and, hence, the number of neurons is as little as $\Omega(\sqrt{N})$. To showcase the applicability of our NTK bounds, we provide two results concerning memorization capacity and optimization guarantees for gradient descent training.},
  author       = {Bombari, Simone and Amani, Mohammad Hossein and Mondelli, Marco},
  booktitle    = {36th Conference on Neural Information Processing Systems},
  isbn         = {9781713871088},
  pages        = {7628--7640},
  publisher    = {Curran Associates},
  title        = {{Memorization and optimization in deep neural networks with minimum over-parameterization}},
  volume       = {35},
  year         = {2022},
}

@inproceedings{12540,
  abstract     = {We consider the problem of signal estimation in generalized linear models defined via rotationally invariant design matrices. Since these matrices can have an arbitrary spectral distribution, this model is well suited for capturing complex correlation structures which often arise in applications. We propose a novel family of approximate message passing (AMP) algorithms for signal estimation, and rigorously characterize their performance in the high-dimensional limit via a state evolution recursion. Our rotationally invariant AMP has complexity of the same order as the existing AMP derived under the restrictive assumption of a Gaussian design; our algorithm also recovers this existing AMP as a special case. Numerical results showcase a performance close to Vector AMP (which is conjectured to be Bayes-optimal in some settings), but obtained with a much lower complexity, as the proposed algorithm does not require a computationally expensive singular value decomposition.},
  author       = {Venkataramanan, Ramji and Kögler, Kevin and Mondelli, Marco},
  booktitle    = {Proceedings of the 39th International Conference on Machine Learning},
  location     = {Baltimore, MD, United States},
  publisher    = {ML Research Press},
  title        = {{Estimation in rotationally invariant generalized linear models via approximate message passing}},
  volume       = {162},
  year         = {2022},
}

@inproceedings{13146,
  abstract     = {A recent line of work has analyzed the theoretical properties of deep neural networks via the Neural Tangent Kernel (NTK). In particular, the smallest eigenvalue of the NTK has been related to the memorization capacity, the global convergence of gradient descent algorithms and the generalization of deep nets. However, existing results either provide bounds in the two-layer setting or assume that the spectrum of the NTK matrices is bounded away from 0 for multi-layer networks. In this paper, we provide tight bounds on the smallest eigenvalue of NTK matrices for deep ReLU nets, both in the limiting case of infinite widths and for finite widths. In the finite-width setting, the network architectures we consider are fairly general: we require the existence of a wide layer with roughly order of N neurons, N being the number of data samples; and the scaling of the remaining layer widths is arbitrary (up to logarithmic factors). To obtain our results, we analyze various quantities of independent interest: we give lower bounds on the smallest singular value of hidden feature matrices, and upper bounds on the Lipschitz constant of input-output feature maps.},
  author       = {Nguyen, Quynh and Mondelli, Marco and Montufar, Guido},
  booktitle    = {Proceedings of the 38th International Conference on Machine Learning},
  isbn         = {9781713845065},
  issn         = {2640-3498},
  location     = {Virtual},
  pages        = {8119--8129},
  publisher    = {ML Research Press},
  title        = {{Tight bounds on the smallest eigenvalue of the neural tangent kernel for deep ReLU networks}},
  volume       = {139},
  year         = {2021},
}

@inproceedings{10053,
  abstract     = {This paper characterizes the latency of the simplified successive-cancellation (SSC) decoding scheme for polar codes under hardware resource constraints. In particular, when the number of processing elements $P$ that can perform SSC decoding operations in parallel is limited, as is the case in practice, the latency of SSC decoding is $O(N^{1-1/\mu} + \frac{N}{P} \log_2 \log_2 \frac{N}{P})$, where $N$ is the block length of the code and $\mu$ is the scaling exponent of polar codes for the channel. Three direct consequences of this bound are presented. First, in a fully-parallel implementation where $P = N/2$, the latency of SSC decoding is $O(N^{1-1/\mu})$, which is sublinear in the block length. This recovers a result from an earlier work. Second, in a fully-serial implementation where $P = 1$, the latency of SSC decoding scales as $O(N \log_2 \log_2 N)$. The multiplicative constant is also calculated: we show that the latency of SSC decoding when $P = 1$ is given by $(2 + o(1)) N \log_2 \log_2 N$. Third, in a semi-parallel implementation, the smallest $P$ that gives the same latency as that of the fully-parallel implementation is $P = N^{1/\mu}$. The tightness of our bound on SSC decoding latency and the applicability of the foregoing results are validated through extensive simulations.},
  author       = {Hashemi, Seyyed Ali and Mondelli, Marco and Fazeli, Arman and Vardy, Alexander and Cioffi, John and Goldsmith, Andrea},
  booktitle    = {2021 IEEE International Symposium on Information Theory},
  isbn         = {978-1-5386-8210-4},
  issn         = {2157-8095},
  location     = {Melbourne, Australia},
  pages        = {2369--2374},
  publisher    = {Institute of Electrical and Electronics Engineers},
  title        = {{Parallelism versus latency in simplified successive-cancellation decoding of polar codes}},
  doi          = {10.1109/ISIT45174.2021.9518153},
  year         = {2021},
}

@inproceedings{10593,
  abstract     = {We study the problem of estimating a rank-$1$ signal in the presence of rotationally invariant noise, a class of perturbations more general than Gaussian noise. Principal Component Analysis (PCA) provides a natural estimator, and sharp results on its performance have been obtained in the high-dimensional regime. Recently, an Approximate Message Passing (AMP) algorithm has been proposed as an alternative estimator with the potential to improve the accuracy of PCA. However, the existing analysis of AMP requires an initialization that is both correlated with the signal and independent of the noise, which is often unrealistic in practice. In this work, we combine the two methods, and propose to initialize AMP with PCA. Our main result is a rigorous asymptotic characterization of the performance of this estimator. Both the AMP algorithm and its analysis differ from those previously derived in the Gaussian setting: at every iteration, our AMP algorithm requires a specific term to account for PCA initialization, while in the Gaussian case, PCA initialization affects only the first iteration of AMP. The proof is based on a two-phase artificial AMP that first approximates the PCA estimator and then mimics the true AMP. Our numerical simulations show an excellent agreement between AMP results and theoretical predictions, and suggest an interesting open direction on achieving Bayes-optimal performance.},
  author       = {Mondelli, Marco and Venkataramanan, Ramji},
  booktitle    = {35th Conference on Neural Information Processing Systems},
  isbn         = {9781713845393},
  issn         = {1049-5258},
  location     = {Virtual},
  pages        = {29616--29629},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{PCA initialization for approximate message passing in rotationally invariant models}},
  volume       = {35},
  year         = {2021},
}

@inproceedings{10594,
  abstract     = {The question of how and why the phenomenon of mode connectivity occurs in training deep neural networks has gained remarkable attention in the research community. From a theoretical perspective, two possible explanations have been proposed: (i) the loss function has connected sublevel sets, and (ii) the solutions found by stochastic gradient descent are dropout stable. While these explanations provide insights into the phenomenon, their assumptions are not always satisfied in practice. In particular, the first approach requires the network to have one layer with order of $N$ neurons ($N$ being the number of training samples), while the second one requires the loss to be almost invariant after removing half of the neurons at each layer (up to some rescaling of the remaining ones). In this work, we improve both conditions by exploiting the quality of the features at every intermediate layer together with a milder over-parameterization condition. More specifically, we show that: (i) under generic assumptions on the features of intermediate layers, it suffices that the last two hidden layers have order of $\sqrt{N}$ neurons, and (ii) if subsets of features at each layer are linearly separable, then no over-parameterization is needed to show the connectivity. Our experiments confirm that the proposed condition ensures the connectivity of solutions found by stochastic gradient descent, even in settings where the previous requirements do not hold.},
  author       = {Nguyen, Quynh and Bréchet, Pierre and Mondelli, Marco},
  booktitle    = {35th Conference on Neural Information Processing Systems},
  isbn         = {9781713845393},
  issn         = {1049-5258},
  location     = {Virtual},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{When are solutions connected in deep networks?}},
  volume       = {35},
  year         = {2021},
}

@inproceedings{10597,
  author       = {Fathollahi, Dorsa and Farsad, Nariman and Hashemi, Seyyed Ali and Mondelli, Marco},
  booktitle    = {2021 IEEE International Symposium on Information Theory},
  isbn         = {978-1-5386-8210-4},
  location     = {Virtual, Melbourne, Australia},
  pages        = {1082--1087},
  publisher    = {Institute of Electrical and Electronics Engineers},
  title        = {{Sparse multi-decoder recursive projection aggregation for Reed-Muller codes}},
  doi          = {10.1109/isit45174.2021.9517887},
  year         = {2021},
}

@inproceedings{10598,
  abstract     = { We consider the problem of estimating a signal from measurements obtained via a generalized linear model. We focus on estimators based on approximate message passing (AMP), a family of iterative algorithms with many appealing features: the performance of AMP in the high-dimensional limit can be succinctly characterized under suitable model assumptions; AMP can also be tailored to the empirical distribution of the signal entries, and for a wide class of estimation problems, AMP is conjectured to be optimal among all polynomial-time algorithms. However, a major issue of AMP is that in many models (such as phase retrieval), it requires an initialization correlated with the ground-truth signal and independent from the measurement matrix. Assuming that such an initialization is available is typically not realistic. In this paper, we solve this problem by proposing an AMP algorithm initialized with a spectral estimator. With such an initialization, the standard AMP analysis fails since the spectral estimator depends in a complicated way on the design matrix. Our main contribution is a rigorous characterization of the performance of AMP with spectral initialization in the high-dimensional limit. The key technical idea is to define and analyze a two-phase artificial AMP algorithm that first produces the spectral estimator, and then closely approximates the iterates of the true AMP. We also provide numerical results that demonstrate the validity of the proposed approach. },
  author       = {Mondelli, Marco and Venkataramanan, Ramji},
  booktitle    = {Proceedings of The 24th International Conference on Artificial Intelligence and Statistics},
  editor       = {Banerjee, Arindam and Fukumizu, Kenji},
  issn         = {2640-3498},
  location     = {Virtual, San Diego, CA, United States},
  pages        = {397--405},
  publisher    = {ML Research Press},
  title        = {{Approximate message passing with spectral initialization for generalized linear models}},
  volume       = {130},
  year         = {2021},
}

