@article{12838,
  abstract     = {We study the problem of high-dimensional multiple packing in Euclidean space. Multiple packing is a natural generalization of sphere packing and is defined as follows. Let N > 0 and L ∈ Z ≽2 . A multiple packing is a set C of points in R n such that any point in R n lies in the intersection of at most L – 1 balls of radius √ nN around points in C . Given a well-known connection with coding theory, multiple packings can be viewed as the Euclidean analog of list-decodable codes, which are well-studied for finite fields. In this paper, we derive the best known lower bounds on the optimal density of list-decodable infinite constellations for constant L under a stronger notion called average-radius multiple packing. To this end, we apply tools from high-dimensional geometry and large deviation theory.},
  author       = {Zhang, Yihan and Vatedka, Shashank},
  issn         = {1557-9654},
  journal      = {IEEE Transactions on Information Theory},
  number       = {7},
  pages        = {4513--4527},
  publisher    = {IEEE},
  title        = {{Multiple packing: Lower bounds via infinite constellations}},
  doi          = {10.1109/TIT.2023.3260950},
  volume       = {69},
  year         = {2023},
}

@inproceedings{12859,
  abstract     = {Machine learning models are vulnerable to adversarial perturbations, and a thought-provoking paper by Bubeck and Sellke has analyzed this phenomenon through the lens of over-parameterization: interpolating smoothly the data requires significantly more parameters than simply memorizing it. However, this "universal" law provides only a necessary condition for robustness, and it is unable to discriminate between models. In this paper, we address these gaps by focusing on empirical risk minimization in two prototypical settings, namely, random features and the neural tangent kernel (NTK). We prove that, for random features, the model is not robust for any degree of over-parameterization, even when the necessary condition coming from the universal law of robustness is satisfied. In contrast, for even activations, the NTK model meets the universal lower bound, and it is robust as soon as the necessary condition on over-parameterization is fulfilled. This also addresses a conjecture in prior work by Bubeck, Li and Nagaraj. Our analysis decouples the effect of the kernel of the model from an "interaction matrix", which describes the interaction with the test data and captures the effect of the activation. Our theoretical results are corroborated by numerical evidence on both synthetic and standard datasets (MNIST, CIFAR-10).},
  author       = {Bombari, Simone and Kiyani, Shayan and Mondelli, Marco},
  booktitle    = {Proceedings of the 40th International Conference on Machine Learning},
  location     = {Honolulu, HI, United States},
  pages        = {2738--2776},
  publisher    = {ML Research Press},
  title        = {{Beyond the universal law of robustness: Sharper laws for random features and neural tangent kernels}},
  volume       = {202},
  year         = {2023},
}

@article{13269,
  abstract     = {This paper is a collection of results on combinatorial properties of codes for the Z-channel . A Z-channel with error fraction τ takes as input a length- n binary codeword and injects in an adversarial manner up to n τ asymmetric errors, i.e., errors that only zero out bits but do not flip 0’s to 1’s. It is known that the largest ( L - 1)-list-decodable code for the Z-channel with error fraction τ has exponential size (in n ) if τ is less than a critical value that we call the ( L - 1)- list-decoding Plotkin point and has constant size if τ is larger than the threshold. The ( L -1)-list-decoding Plotkin point is known to be L -1/L-1 – L -L/ L-1 , which equals 1/4 for unique-decoding with L -1 = 1. In this paper, we derive various results for the size of the largest codes above and below the list-decoding Plotkin point. In particular, we show that the largest ( L -1)-list-decodable code ε-above the Plotkin point, for any given sufficiently small positive constant ε > 0, has size Θ L (ε -3/2 ) for any L - 1 ≥ 1. We also devise upper and lower bounds on the exponential size of codes below the list-decoding Plotkin point.},
  author       = {Polyanskii, Nikita and Zhang, Yihan},
  issn         = {1557-9654},
  journal      = {IEEE Transactions on Information Theory},
  number       = {10},
  pages        = {6340--6357},
  publisher    = {Institute of Electrical and Electronics Engineers},
  title        = {{Codes for the Z-channel}},
  doi          = {10.1109/TIT.2023.3292219},
  volume       = {69},
  year         = {2023},
}

@article{13315,
  abstract     = {How do statistical dependencies in measurement noise influence high-dimensional inference? To answer this, we study the paradigmatic spiked matrix model of principal components analysis (PCA), where a rank-one matrix is corrupted by additive noise. We go beyond the usual independence assumption on the noise entries, by drawing the noise from a low-order polynomial orthogonal matrix ensemble. The resulting noise correlations make the setting relevant for applications but analytically challenging. We provide characterization of the Bayes optimal limits of inference in this model. If the spike is rotation invariant, we show that standard spectral PCA is optimal. However, for more general priors, both PCA and the existing approximate message-passing algorithm (AMP) fall short of achieving the information-theoretic limits, which we compute using the replica method from statistical physics. We thus propose an AMP, inspired by the theory of adaptive Thouless–Anderson–Palmer equations, which is empirically observed to saturate the conjectured theoretical limit. This AMP comes with a rigorous state evolution analysis tracking its performance. Although we focus on specific noise distributions, our methodology can be generalized to a wide class of trace matrix ensembles at the cost of more involved expressions. Finally, despite the seemingly strong assumption of rotation-invariant noise, our theory empirically predicts algorithmic performance on real data, pointing at strong universality properties.},
  author       = {Barbier, Jean and Camilli, Francesco and Mondelli, Marco and Sáenz, Manuel},
  issn         = {1091-6490},
  journal      = {Proceedings of the National Academy of Sciences of the United States of America},
  number       = {30},
  publisher    = {National Academy of Sciences},
  title        = {{Fundamental limits in structured principal component analysis and how to reach them}},
  doi          = {10.1073/pnas.2302028120},
  volume       = {120},
  year         = {2023},
}

@inproceedings{13321,
  abstract     = {We consider the problem of reconstructing the signal and the hidden variables from observations coming from a multi-layer network with rotationally invariant weight matrices. The multi-layer structure models inference from deep generative priors, and the rotational invariance imposed on the weights generalizes the i.i.d. Gaussian assumption by allowing for a complex correlation structure, which is typical in applications. In this work, we present a new class of approximate message passing (AMP) algorithms and give a state evolution recursion which precisely characterizes their performance in the large system limit. In contrast with the existing multi-layer VAMP (ML-VAMP) approach, our proposed AMP – dubbed multilayer rotationally invariant generalized AMP (ML-RI-GAMP) – provides a natural generalization beyond Gaussian designs, in the sense that it recovers the existing Gaussian AMP as a special case. Furthermore, ML-RI-GAMP exhibits a significantly lower complexity than ML-VAMP, as the computationally intensive singular value decomposition is replaced by an estimation of the moments of the design matrices. Finally, our numerical results show that this complexity gain comes at little to no cost in the performance of the algorithm.},
  author       = {Xu, Yizhou and Hou, Tian Qi and Liang, Shan Suo and Mondelli, Marco},
  booktitle    = {2023 IEEE Information Theory Workshop},
  isbn         = {9798350301496},
  issn         = {2475-4218},
  location     = {Saint-Malo, France},
  pages        = {294--298},
  publisher    = {Institute of Electrical and Electronics Engineers},
  title        = {{Approximate message passing for multi-layer estimation in rotationally invariant models}},
  doi          = {10.1109/ITW55543.2023.10160238},
  year         = {2023},
}

@inproceedings{14921,
  abstract     = {Neural collapse (NC) refers to the surprising structure of the last layer of deep neural networks in the terminal phase of gradient descent training. Recently, an increasing amount of experimental evidence has pointed to the propagation of NC to earlier layers of neural networks. However, while the NC in the last layer is well studied theoretically, much less is known about its multi-layered counterpart - deep neural collapse (DNC). In particular, existing work focuses either on linear layers or only on the last two layers at the price of an extra assumption. Our paper fills this gap by generalizing the established analytical framework for NC - the unconstrained features model - to multiple non-linear layers. Our key technical contribution is to show that, in a deep unconstrained features model, the unique global optimum for binary classification exhibits all the properties typical of DNC. This explains the existing experimental evidence of DNC. We also empirically show that (i) by optimizing deep unconstrained features models via gradient descent, the resulting solution agrees well with our theory, and (ii) trained networks recover the unconstrained features suitable for the occurrence of DNC, thus supporting the validity of this modeling principle.},
  author       = {Súkeník, Peter and Mondelli, Marco and Lampert, Christoph},
  booktitle    = {37th Annual Conference on Neural Information Processing Systems},
  location     = {New Orleans, LA, United States},
  title        = {{Deep neural collapse is provably optimal for the deep unconstrained features model}},
  year         = {2023},
}

@inproceedings{14922,
  abstract     = {We propose a novel approach to concentration for non-independent random variables. The main idea is to ``pretend'' that the random variables are independent and pay a multiplicative price measuring how far they are from actually being independent. This price is encapsulated in the Hellinger integral between the joint and the product of the marginals, which is then upper bounded leveraging tensorisation properties. Our bounds represent a natural generalisation of concentration inequalities in the presence of dependence: we recover exactly the classical bounds (McDiarmid's inequality) when the random variables are independent. Furthermore, in a ``large deviations'' regime, we obtain the same decay in the probability as for the independent case, even when the random variables display non-trivial dependencies. To show this, we consider a number of applications of interest. First, we provide a bound for Markov chains with finite state space. Then, we consider the Simple Symmetric Random Walk, which is a non-contracting Markov chain, and a non-Markovian setting in which the stochastic process depends on its entire past. To conclude, we propose an application to Markov Chain Monte Carlo methods, where our approach leads to an improved lower bound on the minimum burn-in period required to reach a certain accuracy. In all of these settings, we provide a regime of parameters in which our bound fares better than what the state of the art can provide.},
  author       = {Esposito, Amedeo Roberto and Mondelli, Marco},
  booktitle    = {Proceedings of 2023 IEEE International Symposium on Information Theory},
  location     = {Taipei, Taiwan},
  publisher    = {IEEE},
  title        = {{Concentration without independence via information measures}},
  doi          = {10.1109/isit54713.2023.10206899},
  year         = {2023},
}

@inproceedings{14923,
  abstract     = {We study the performance of a Bayesian statistician who estimates a rank-one signal corrupted by non-symmetric rotationally invariant noise with a generic distribution of singular values. As the signal-to-noise ratio and the noise structure are unknown, a Gaussian setup is incorrectly assumed. We derive the exact analytic expression for the error of the mismatched Bayes estimator and also provide the analysis of an approximate message passing (AMP) algorithm. The first result exploits the asymptotic behavior of spherical integrals for rectangular matrices and of low-rank matrix perturbations; the second one relies on the design and analysis of an auxiliary AMP. The numerical experiments show that there is a performance gap between the AMP and Bayes estimators, which is due to the incorrect estimation of the signal norm.},
  author       = {Fu, Teng and Liu, YuHao and Barbier, Jean and Mondelli, Marco and Liang, ShanSuo and Hou, TianQi},
  booktitle    = {Proceedings of 2023 IEEE International Symposium on Information Theory},
  location     = {Taipei, Taiwan},
  publisher    = {IEEE},
  title        = {{Mismatched estimation of non-symmetric rank-one matrices corrupted by structured noise}},
  doi          = {10.1109/isit54713.2023.10206671},
  year         = {2023},
}

@inproceedings{14924,
  abstract     = {The stochastic heavy ball method (SHB), also known as stochastic gradient descent (SGD) with Polyak's momentum, is widely used in training neural networks. However, despite the remarkable success of such algorithm in practice, its theoretical characterization remains limited. In this paper, we focus on neural networks with two and three layers and provide a rigorous understanding of the properties of the solutions found by SHB: \emph{(i)} stability after dropping out part of the neurons, \emph{(ii)} connectivity along a low-loss path, and \emph{(iii)} convergence to the global optimum.
To achieve this goal, we take a mean-field view and relate the SHB dynamics to a certain partial differential equation in the limit of large network widths. This mean-field perspective has inspired a recent line of work focusing on SGD while, in contrast, our paper considers an algorithm with momentum. More specifically, after proving existence and uniqueness of the limit differential equations, we show convergence to the global optimum and give a quantitative bound between the mean-field limit and the SHB dynamics of a finite-width network. Armed with this last bound, we are able to establish the dropout-stability and connectivity of SHB solutions.},
  author       = {Wu, Diyuan and Kungurtsev, Vyacheslav and Mondelli, Marco},
  booktitle    = {Transactions on Machine Learning Research},
  publisher    = {ML Research Press},
  title        = {{Mean-field analysis for heavy ball methods: Dropout-stability, connectivity, and global convergence}},
  year         = {2023},
}

@inproceedings{14083,
  abstract     = {In this work we consider the list-decodability and list-recoverability of arbitrary q-ary codes, for all integer values of q ≥ 2. A code is called (p,L)_q-list-decodable if every radius pn Hamming ball contains less than L codewords; (p,𝓁,L)_q-list-recoverability is a generalization where we place radius pn Hamming balls on every point of a combinatorial rectangle with side length 𝓁 and again stipulate that there be less than L codewords.
Our main contribution is to precisely calculate the maximum value of p for which there exist infinite families of positive rate (p,𝓁,L)_q-list-recoverable codes, the quantity we call the zero-rate threshold. Denoting this value by p_*, we in fact show that codes correcting a p_*+ε fraction of errors must have size O_ε(1), i.e., independent of n. Such a result is typically referred to as a "Plotkin bound." To complement this, a standard random code with expurgation construction shows that there exist positive rate codes correcting a p_*-ε fraction of errors. We also follow a classical proof template (typically attributed to Elias and Bassalygo) to derive from the zero-rate threshold other tradeoffs between rate and decoding radius for list-decoding and list-recovery.
Technically, proving the Plotkin bound boils down to demonstrating the Schur convexity of a certain function defined on the q-simplex as well as the convexity of a univariate function derived from it. We remark that an earlier argument claimed similar results for q-ary list-decoding; however, we point out that this earlier proof is flawed.},
  author       = {Resch, Nicolas and Yuan, Chen and Zhang, Yihan},
  booktitle    = {50th International Colloquium on Automata, Languages, and Programming},
  isbn         = {9783959772785},
  issn         = {1868-8969},
  location     = {Paderborn, Germany},
  publisher    = {Schloss Dagstuhl - Leibniz-Zentrum für Informatik},
  title        = {{Zero-rate thresholds and new capacity bounds for list-decoding and list-recovery}},
  doi          = {10.4230/LIPIcs.ICALP.2023.99},
  volume       = {261},
  year         = {2023},
}

@inproceedings{14459,
  abstract     = {Autoencoders are a popular model in many branches of machine learning and lossy data compression. However, their fundamental limits, the performance of gradient methods and the features learnt during optimization remain poorly understood, even in the two-layer setting. In fact, earlier work has considered either linear autoencoders or specific training regimes (leading to vanishing or diverging compression rates). Our paper addresses this gap by focusing on non-linear two-layer autoencoders trained in the challenging proportional regime in which the input dimension scales linearly with the size of the representation. Our results characterize the minimizers of the population risk, and show that such minimizers are achieved by gradient methods; their structure is also unveiled, thus leading to a concise description of the features obtained via training. For the special case of a sign activation function, our analysis establishes the fundamental limits for the lossy compression of Gaussian sources via (shallow) autoencoders. Finally, while the results are proved for Gaussian data, numerical simulations on standard datasets display the universality of the theoretical predictions.},
  author       = {Shevchenko, Aleksandr and Kögler, Kevin and Hassani, Hamed and Mondelli, Marco},
  booktitle    = {Proceedings of the 40th International Conference on Machine Learning},
  issn         = {2640-3498},
  location     = {Honolulu, Hawaii, HI, United States},
  pages        = {31151--31209},
  publisher    = {ML Research Press},
  title        = {{Fundamental limits of two-layer autoencoders, and achieving them with gradient methods}},
  volume       = {202},
  year         = {2023},
}

@article{14665,
  abstract     = {We derive lower bounds on the maximal rates for multiple packings in high-dimensional Euclidean spaces. For any N > 0 and L ∈ Z ≥2 , a multiple packing is a set C of points in R n such that any point in R n lies in the intersection of at most L - 1 balls of radius √ nN around points in C . This is a natural generalization of the sphere packing problem. We study the multiple packing problem for both bounded point sets whose points have norm at most √ nP for some constant P > 0, and unbounded point sets whose points are allowed to be anywhere in R n . Given a well-known connection with coding theory, multiple packings can be viewed as the Euclidean analog of list-decodable codes, which are well-studied over finite fields. We derive the best known lower bounds on the optimal multiple packing density. This is accomplished by establishing an inequality which relates the list-decoding error exponent for additive white Gaussian noise channels, a quantity of average-case nature, to the list-decoding radius, a quantity of worst-case nature. We also derive novel bounds on the list-decoding error exponent for infinite constellations and closed-form expressions for the list-decoding error exponents for the power-constrained AWGN channel, which may be of independent interest beyond multiple packing.},
  author       = {Zhang, Yihan and Vatedka, Shashank},
  issn         = {1557-9654},
  journal      = {IEEE Transactions on Information Theory},
  publisher    = {IEEE},
  title        = {{Multiple packing: Lower bounds via error exponents}},
  doi          = {10.1109/TIT.2023.3334032},
  year         = {2023},
}

@article{14751,
  abstract     = {We consider zero-error communication over a two-transmitter deterministic adversarial multiple access channel (MAC) governed by an adversary who has access to the transmissions of both senders (hence called omniscient ) and aims to maliciously corrupt the communication. None of the encoders, jammer and decoder is allowed to randomize using private or public randomness. This enforces a combinatorial nature of the problem. Our model covers a large family of channels studied in the literature, including all deterministic discrete memoryless noisy or noiseless MACs. In this work, given an arbitrary two-transmitter deterministic omniscient adversarial MAC, we characterize when the capacity region: 1) has nonempty interior (in particular, is two-dimensional); 2) consists of two line segments (in particular, has empty interior); 3) consists of one line segment (in particular, is one-dimensional); 4) or only contains (0,0) (in particular, is zero-dimensional). This extends a recent result by Wang et al. (201 9) from the point-to-point setting to the multiple access setting. Indeed, our converse arguments build upon their generalized Plotkin bound and involve delicate case analysis. One of the technical challenges is to take care of both “joint confusability” and “marginal confusability”. In particular, the treatment of marginal confusability does not follow from the point-to-point results by Wang et al. Our achievability results follow from random coding with expurgation.},
  author       = {Zhang, Yihan},
  issn         = {1557-9654},
  journal      = {IEEE Transactions on Information Theory},
  keywords     = {Computer Science Applications, Information Systems},
  number       = {7},
  pages        = {4093--4127},
  publisher    = {Institute of Electrical and Electronics Engineers},
  title        = {{Zero-error communication over adversarial MACs}},
  doi          = {10.1109/tit.2023.3257239},
  volume       = {69},
  year         = {2023},
}

@article{12233,
  abstract     = {A novel recursive list decoding (RLD) algorithm for Reed-Muller (RM) codes based on successive permutations (SP) of the codeword is presented. A low-complexity SP scheme applied to a subset of the symmetry group of RM codes is first proposed to carefully select a good codeword permutation on the fly. Then, the proposed SP technique is integrated into an improved RLD algorithm that initializes different decoding paths with random codeword permutations, which are sampled from the full symmetry group of RM codes. Finally, efficient latency and complexity reduction schemes are introduced that virtually preserve the error-correction performance of the proposed decoder. Simulation results demonstrate that at the target frame error rate of 10−3 for the RM code of length 256 with 163 information bits, the proposed decoder reduces 6% of the computational complexity and 22% of the decoding latency of the state-of-the-art semi-parallel simplified successive-cancellation decoder with fast Hadamard transform (SSC-FHT) that uses 96 permutations from the full symmetry group of RM codes, while relatively maintaining the error-correction performance and memory consumption of the semi-parallel permuted SSC-FHT decoder.},
  author       = {Doan, Nghia and Hashemi, Seyyed Ali and Mondelli, Marco and Gross, Warren J.},
  issn         = {1558-0857},
  journal      = {IEEE Transactions on Communications},
  number       = {11},
  pages        = {7134--7145},
  publisher    = {Institute of Electrical and Electronics Engineers},
  title        = {{Decoding Reed-Muller codes with successive codeword permutations}},
  doi          = {10.1109/tcomm.2022.3211101},
  volume       = {70},
  year         = {2022},
}

@article{12273,
  abstract     = {We study communication in the presence of a jamming adversary where quadratic power constraints are imposed on the transmitter and the jammer. The jamming signal is allowed to be a function of the codebook, and a noncausal but noisy observation of the transmitted codeword. For a certain range of the noise-to-signal ratios (NSRs) of the transmitter and the jammer, we are able to characterize the capacity of this channel under deterministic encoding or stochastic encoding, i.e., with no common randomness between the encoder/decoder pair. For the remaining NSR regimes, we determine the capacity under the assumption of a small amount of common randomness (at most 2log(n) bits in one sub-regime, and at most Ω(n) bits in the other sub-regime) available to the encoder-decoder pair. Our proof techniques involve a novel myopic list-decoding result for achievability, and a Plotkin-type push attack for the converse in a subregion of the NSRs, both of which may be of independent interest. We also give bounds on the strong secrecy capacity of this channel assuming that the jammer is simultaneously eavesdropping.},
  author       = {Zhang, Yihan and Vatedka, Shashank and Jaggi, Sidharth and Sarwate, Anand D.},
  issn         = {1557-9654},
  journal      = {IEEE Transactions on Information Theory},
  number       = {8},
  pages        = {4901--4948},
  publisher    = {Institute of Electrical and Electronics Engineers},
  title        = {{Quadratically constrained myopic adversarial channels}},
  doi          = {10.1109/tit.2022.3167554},
  volume       = {68},
  year         = {2022},
}

@article{12480,
  abstract     = {We consider the problem of estimating a signal from measurements obtained via a generalized linear model. We focus on estimators based on approximate message passing (AMP), a family of iterative algorithms with many appealing features: the performance of AMP in the high-dimensional limit can be succinctly characterized under suitable model assumptions; AMP can also be tailored to the empirical distribution of the signal entries, and for a wide class of estimation problems, AMP is conjectured to be optimal among all polynomial-time algorithms. However, a major issue of AMP is that in many models (such as phase retrieval), it requires an initialization correlated with the ground-truth signal and independent from the measurement matrix. Assuming that such an initialization is available is typically not realistic. In this paper, we solve this problem by proposing an AMP algorithm initialized with a spectral estimator. With such an initialization, the standard AMP analysis fails since the spectral estimator depends in a complicated way on the design matrix. Our main contribution is a rigorous characterization of the performance of AMP with spectral initialization in the high-dimensional limit. The key technical idea is to define and analyze a two-phase artificial AMP algorithm that first produces the spectral estimator, and then closely approximates the iterates of the true AMP. We also provide numerical results that demonstrate the validity of the proposed approach.},
  author       = {Mondelli, Marco and Venkataramanan, Ramji},
  issn         = {1742-5468},
  journal      = {Journal of Statistical Mechanics: Theory and Experiment},
  keywords     = {Statistics, Probability and Uncertainty, Statistics and Probability, Statistical and Nonlinear Physics},
  number       = {11},
  publisher    = {IOP Publishing},
  title        = {{Approximate message passing with spectral initialization for generalized linear models}},
  doi          = {10.1088/1742-5468/ac9828},
  volume       = {2022},
  year         = {2022},
}

@unpublished{12536,
  abstract     = {We consider the problem of estimating a rank-1 signal corrupted by structured rotationally invariant noise, and address the following question: how well do inference algorithms perform when the noise statistics is unknown and hence Gaussian noise is assumed? While the matched Bayes-optimal setting with unstructured noise is well understood, the analysis of this mismatched problem is only at its premises. In this paper, we make a step towards understanding the effect of the strong source of mismatch which is the noise statistics. Our main technical contribution is the rigorous analysis of a Bayes estimator and of an approximate message passing (AMP) algorithm, both of which incorrectly assume a Gaussian setup. The first result exploits the theory of spherical integrals and of low-rank matrix perturbations; the idea behind the second one is to design and analyze an artificial AMP which, by taking advantage of the flexibility in the denoisers, is able to "correct" the mismatch. Armed with these sharp asymptotic characterizations, we unveil a rich and often unexpected phenomenology. For example, despite AMP is in principle designed to efficiently compute the Bayes estimator, the former is outperformed by the latter in terms of mean-square error. We show that this performance gap is due to an incorrect estimation of the signal norm. In fact, when the SNR is large enough, the overlaps of the AMP and the Bayes estimator coincide, and they even match those of optimal estimators taking into account the structure of the noise.},
  author       = {Barbier, Jean and Hou, TianQi and Mondelli, Marco and Saenz, Manuel},
  booktitle    = {arXiv},
  title        = {{The price of ignorance: How much does it cost to forget noise structure in low-rank matrix estimation?}},
  doi          = {10.48550/arXiv.2205.10009},
  year         = {2022},
}

@inproceedings{12537,
  abstract     = {The Neural Tangent Kernel (NTK) has emerged as a powerful tool to provide memorization, optimization and generalization guarantees in deep neural networks. A line of work has studied the NTK spectrum for two-layer and deep networks with at least a layer with Ω(N) neurons, N being the number of training samples. Furthermore, there is increasing evidence suggesting that deep networks with sub-linear layer widths are powerful memorizers and optimizers, as long as the number of parameters exceeds the number of samples. Thus, a natural open question is whether the NTK is well conditioned in such a challenging sub-linear setup. In this paper, we answer this question in the affirmative. Our key technical contribution is a lower bound on the smallest NTK eigenvalue for deep networks with the minimum possible over-parameterization: the number of parameters is roughly Ω(N) and, hence, the number of neurons is as little as Ω(N−−√). To showcase the applicability of our NTK bounds, we provide two results concerning memorization capacity and optimization guarantees for gradient descent training.},
  author       = {Bombari, Simone and Amani, Mohammad Hossein and Mondelli, Marco},
  booktitle    = {36th Conference on Neural Information Processing Systems},
  isbn         = {9781713871088},
  pages        = {7628--7640},
  publisher    = {Curran Associates},
  title        = {{Memorization and optimization in deep neural networks with minimum over-parameterization}},
  volume       = {35},
  year         = {2022},
}

@article{12538,
  abstract     = {In this paper, we study the compression of a target two-layer neural network with N nodes into a compressed network with M<N nodes. More precisely, we consider the setting in which the weights of the target network are i.i.d. sub-Gaussian, and we minimize the population L_2 loss between the outputs of the target and of the compressed network, under the assumption of Gaussian inputs. By using tools from high-dimensional probability, we show that this non-convex problem can be simplified when the target network is sufficiently over-parameterized, and provide the error rate of this approximation as a function of the input dimension and N. In this mean-field limit, the simplified objective, as well as the optimal weights of the compressed network, does not depend on the realization of the target network, but only on expected scaling factors. Furthermore, for networks with ReLU activation, we conjecture that the optimum of the simplified optimization problem is achieved by taking weights on the Equiangular Tight Frame (ETF), while the scaling of the weights and the orientation of the ETF depend on the parameters of the target network. Numerical evidence is provided to support this conjecture.},
  author       = {Amani, Mohammad Hossein and Bombari, Simone and Mondelli, Marco and Pukdee, Rattana and Rini, Stefano},
  isbn         = {9781665483414},
  journal      = {IEEE Information Theory Workshop},
  location     = {Mumbai, India},
  pages        = {588--593},
  publisher    = {IEEE},
  title        = {{Sharp asymptotics on the compression of two-layer neural networks}},
  doi          = {10.1109/ITW54588.2022.9965870},
  year         = {2022},
}

@inproceedings{12540,
  abstract     = {We consider the problem of signal estimation in generalized linear models defined via rotationally invariant design matrices. Since these matrices can have an arbitrary spectral distribution, this model is well suited for capturing complex correlation structures which often arise in applications. We propose a novel family of approximate message passing (AMP) algorithms for signal estimation, and rigorously characterize their performance in the high-dimensional limit via a state evolution recursion. Our rotationally invariant AMP has complexity of the same order as the existing AMP derived under the restrictive assumption of a Gaussian design; our algorithm also recovers this existing AMP as a special case. Numerical results showcase a performance close to Vector AMP (which is conjectured to be Bayes-optimal in some settings), but obtained with a much lower complexity, as the proposed algorithm does not require a computationally expensive singular value decomposition.},
  author       = {Venkataramanan, Ramji and Kögler, Kevin and Mondelli, Marco},
  booktitle    = {Proceedings of the 39th International Conference on Machine Learning},
  location     = {Baltimore, MD, United States},
  publisher    = {ML Research Press},
  title        = {{Estimation in rotationally invariant generalized linear models via approximate message passing}},
  volume       = {162},
  year         = {2022},
}

