@inproceedings{12015,
  abstract     = {We study the problem of high-dimensional multiple packing in Euclidean space. Multiple packing is a natural generalization of sphere packing and is defined as follows. Let $P, N > 0$ and $L \in \mathbb{Z}_{\geq 2}$. A multiple packing is a set $\mathcal{C}$ of points in $\mathcal{B}^n(\underline{0}, \sqrt{nP})$ such that any point in $\mathbb{R}^n$ lies in the intersection of at most $L - 1$ balls of radius $\sqrt{nN}$ around points in $\mathcal{C}$. In this paper, we derive two lower bounds on the largest possible density of a multiple packing. These bounds are obtained through a stronger notion called average-radius multiple packing. Specifically, we exactly pin down the asymptotics of (expurgated) Gaussian codes and (expurgated) spherical codes under average-radius multiple packing. To this end, we apply tools from high-dimensional geometry and large deviation theory. The bound for spherical codes matches the previous best known bound, which was obtained for the standard (weaker) notion of multiple packing through a curious connection with error exponents [Bli99], [ZV21]. The bound for Gaussian codes suggests that they are strictly inferior to spherical codes.},
  author       = {Zhang, Yihan and Vatedka, Shashank},
  booktitle    = {2022 IEEE International Symposium on Information Theory},
  isbn         = {9781665421591},
  issn         = {2157-8095},
  location     = {Espoo, Finland},
  pages        = {3085--3090},
  publisher    = {IEEE},
  title        = {{Lower bounds for multiple packing}},
  doi          = {10.1109/ISIT50566.2022.9834443},
  volume       = {2022},
  year         = {2022},
}

@inproceedings{12016,
  abstract     = {We consider the problem of coded distributed computing using polar codes. In recent work by Soleymani, Jamali and Mahdavifar, the average execution time of a coded computing system was related to the error probability for transmission over the binary erasure channel, and the performance of binary linear codes was investigated. In this paper, we focus on polar codes and unveil a connection between the average execution time and the scaling exponent $\mu$ of the family of codes. In the finite-length characterization of polar codes, the scaling exponent is a key object capturing the speed of convergence to capacity. In particular, we show that (i) the gap between the normalized average execution time of polar codes and that of optimal MDS codes is $O(n^{-1/\mu})$, and (ii) this upper bound can be improved to roughly $O(n^{-1/2})$ by considering polar codes with large kernels. We conjecture that these bounds could be improved to $O(n^{-2/\mu})$ and $O(n^{-1})$, respectively, and provide a heuristic argument as well as numerical evidence supporting this view.},
  author       = {Fathollahi, Dorsa and Mondelli, Marco},
  booktitle    = {2022 IEEE International Symposium on Information Theory},
  isbn         = {9781665421591},
  issn         = {2157-8095},
  location     = {Espoo, Finland},
  pages        = {2154--2159},
  publisher    = {IEEE},
  title        = {{Polar coded computing: The role of the scaling exponent}},
  doi          = {10.1109/ISIT50566.2022.9834712},
  volume       = {2022},
  year         = {2022},
}
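
% Illustration (ours, not from the paper): combining this abstract with the BEC value
% $\mu = 3.63$ reported in entry 9002 below, the proven gap $O(n^{-1/\mu})$ decays as
% $n^{-0.28}$, while the conjectured $O(n^{-2/\mu})$ would decay as $n^{-0.55}$,
% i.e., with twice the exponent.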

@inproceedings{12017,
  abstract     = {In the classic adversarial communication problem, two parties communicate over a noisy channel in the presence of a malicious jamming adversary. Arbitrarily varying channels (AVCs) offer an elegant framework to study a wide range of interesting adversary models. The optimal throughput or capacity over such AVCs is intimately tied to the underlying adversary model; in some cases, the capacity is unknown and the problem is known to be notoriously hard. The omniscient adversary, one which knows the sender's entire channel transmission a priori, is one such classic model of interest; the capacity under such an adversary remains an exciting open problem. The myopic adversary is a generalization of that model where the adversary's observation may be corrupted over a noisy discrete memoryless channel. Through the adversary's myopicity, one can unify a slew of different adversary models, ranging from the omniscient adversary to one that is completely blind to the transmission (the latter is the well-known oblivious model, where the capacity is fully characterized). In this work, we present new results on the capacity under both the omniscient and myopic adversary models. We completely characterize the positive capacity threshold over general AVCs with omniscient adversaries. The characterization is in terms of two key combinatorial objects: the set of completely positive distributions and the CP-confusability set. For omniscient AVCs with positive capacity, we present non-trivial lower and upper bounds on the capacity; unlike some of the previous bounds, our bounds hold under fairly general input and jamming constraints. Our lower bound improves upon the generalized Gilbert-Varshamov bound for general AVCs, while the upper bound generalizes the well-known Elias-Bassalygo bound (known for binary and q-ary alphabets). For myopic AVCs, we build on prior results known for the so-called sufficiently myopic model, and present new results on the positive rate communication threshold over the so-called insufficiently myopic regime (a completely insufficiently myopic adversary specializes to an omniscient adversary). We present interesting examples for the widely studied models of adversarial bit-flip and bit-erasure channels. In fact, for the bit-flip AVC with additive adversarial noise as well as random noise, we completely characterize the omniscient model capacity when the random noise is sufficiently large vis-à-vis the adversary's budget.},
  author       = {Yadav, Anuj Kumar and Alimohammadi, Mohammadreza and Zhang, Yihan and Budkuley, Amitalok J. and Jaggi, Sidharth},
  booktitle    = {2022 IEEE International Symposium on Information Theory},
  isbn         = {9781665421591},
  issn         = {2157-8095},
  location     = {Espoo, Finland},
  pages        = {2535--2540},
  publisher    = {IEEE},
  title        = {{New results on AVCs with omniscient and myopic adversaries}},
  doi          = {10.1109/ISIT50566.2022.9834632},
  volume       = {2022},
  year         = {2022},
}

@inproceedings{12018,
  abstract     = {We study the problem of characterizing the maximal rates of list decoding in Euclidean spaces for finite list sizes. For any positive integer $L \geq 2$ and real $N > 0$, we say that a subset $\mathcal{C} \subset \mathbb{R}^n$ is an $(N, L-1)$-multiple packing or an $(N, L-1)$-list decodable code if every Euclidean ball of radius $\sqrt{nN}$ in $\mathbb{R}^n$ contains no more than $L - 1$ points of $\mathcal{C}$. We study this problem with and without $\ell_2$ norm constraints on $\mathcal{C}$, and derive the best-known lower bounds on the maximal rate for $(N, L-1)$-multiple packing. Our bounds are obtained via error exponents for list decoding over Additive White Gaussian Noise (AWGN) channels. We establish a curious inequality which relates the error exponent, a quantity of average-case nature, to the list-decoding radius, a quantity of worst-case nature. We derive various bounds on the error exponent for list decoding in both bounded and unbounded settings which could be of independent interest beyond multiple packing.},
  author       = {Zhang, Yihan and Vatedka, Shashank},
  booktitle    = {2022 IEEE International Symposium on Information Theory},
  isbn         = {9781665421591},
  issn         = {2157-8095},
  location     = {Espoo, Finland},
  pages        = {1324--1329},
  publisher    = {IEEE},
  title        = {{Lower bounds on list decoding capacity using error exponents}},
  doi          = {10.1109/ISIT50566.2022.9834815},
  volume       = {2022},
  year         = {2022},
}

@inproceedings{12019,
  abstract     = {This paper studies combinatorial properties of codes for the Z-channel. A Z-channel with error fraction $\tau$ takes as input a length-$n$ binary codeword and injects in an adversarial manner up to $n\tau$ asymmetric errors, i.e., errors that only zero out bits but do not flip 0's to 1's. It is known that the largest $(L-1)$-list-decodable code for the Z-channel with error fraction $\tau$ has exponential (in $n$) size if $\tau$ is less than a critical value that we call the Plotkin point, and has constant size if $\tau$ is larger than the threshold. The $(L-1)$-list-decoding Plotkin point is known to be $L^{-\frac{1}{L-1}} - L^{-\frac{L}{L-1}}$. In this paper, we show that the largest $(L-1)$-list-decodable code $\varepsilon$-above the Plotkin point has size $\Theta_L(\varepsilon^{-3/2})$ for any $L - 1 \geq 1$.},
  author       = {Polyanskii, Nikita and Zhang, Yihan},
  booktitle    = {2022 IEEE International Symposium on Information Theory},
  isbn         = {9781665421591},
  issn         = {2157-8095},
  location     = {Espoo, Finland},
  pages        = {2553--2558},
  publisher    = {IEEE},
  title        = {{List-decodable zero-rate codes for the Z-channel}},
  doi          = {10.1109/ISIT50566.2022.9834829},
  volume       = {2022},
  year         = {2022},
}
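
% Worked check (our own arithmetic, assuming the Plotkin point formula as reconstructed
% above): for $L = 2$, i.e., unique decoding,
%   $\tau_2 = 2^{-1} - 2^{-2} = 1/2 - 1/4 = 1/4$,
% so codes of exponential size exist exactly when $\tau < 1/4$.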

@article{12233,
  abstract     = {A novel recursive list decoding (RLD) algorithm for Reed-Muller (RM) codes based on successive permutations (SP) of the codeword is presented. A low-complexity SP scheme applied to a subset of the symmetry group of RM codes is first proposed to carefully select a good codeword permutation on the fly. Then, the proposed SP technique is integrated into an improved RLD algorithm that initializes different decoding paths with random codeword permutations, which are sampled from the full symmetry group of RM codes. Finally, efficient latency and complexity reduction schemes are introduced that virtually preserve the error-correction performance of the proposed decoder. Simulation results demonstrate that, at the target frame error rate of $10^{-3}$ for the RM code of length 256 with 163 information bits, the proposed decoder reduces the computational complexity by 6% and the decoding latency by 22% with respect to the state-of-the-art semi-parallel simplified successive-cancellation decoder with fast Hadamard transform (SSC-FHT) that uses 96 permutations from the full symmetry group of RM codes, while essentially maintaining the error-correction performance and memory consumption of the semi-parallel permuted SSC-FHT decoder.},
  author       = {Doan, Nghia and Hashemi, Seyyed Ali and Mondelli, Marco and Gross, Warren J.},
  issn         = {1558-0857},
  journal      = {IEEE Transactions on Communications},
  number       = {11},
  pages        = {7134--7145},
  publisher    = {IEEE},
  title        = {{Decoding Reed-Muller codes with successive codeword permutations}},
  doi          = {10.1109/tcomm.2022.3211101},
  volume       = {70},
  year         = {2022},
}

@article{12273,
  abstract     = {We study communication in the presence of a jamming adversary where quadratic power constraints are imposed on the transmitter and the jammer. The jamming signal is allowed to be a function of the codebook, and a noncausal but noisy observation of the transmitted codeword. For a certain range of the noise-to-signal ratios (NSRs) of the transmitter and the jammer, we are able to characterize the capacity of this channel under deterministic encoding or stochastic encoding, i.e., with no common randomness between the encoder/decoder pair. For the remaining NSR regimes, we determine the capacity under the assumption of a small amount of common randomness (at most $2\log(n)$ bits in one sub-regime, and at most $\Omega(n)$ bits in the other sub-regime) available to the encoder-decoder pair. Our proof techniques involve a novel myopic list-decoding result for achievability, and a Plotkin-type push attack for the converse in a subregion of the NSRs, both of which may be of independent interest. We also give bounds on the strong secrecy capacity of this channel assuming that the jammer is simultaneously eavesdropping.},
  author       = {Zhang, Yihan and Vatedka, Shashank and Jaggi, Sidharth and Sarwate, Anand D.},
  issn         = {1557-9654},
  journal      = {IEEE Transactions on Information Theory},
  number       = {8},
  pages        = {4901--4948},
  publisher    = {IEEE},
  title        = {{Quadratically constrained myopic adversarial channels}},
  doi          = {10.1109/tit.2022.3167554},
  volume       = {68},
  year         = {2022},
}

@article{12480,
  abstract     = {We consider the problem of estimating a signal from measurements obtained via a generalized linear model. We focus on estimators based on approximate message passing (AMP), a family of iterative algorithms with many appealing features: the performance of AMP in the high-dimensional limit can be succinctly characterized under suitable model assumptions; AMP can also be tailored to the empirical distribution of the signal entries, and for a wide class of estimation problems, AMP is conjectured to be optimal among all polynomial-time algorithms. However, a major issue of AMP is that in many models (such as phase retrieval), it requires an initialization correlated with the ground-truth signal and independent from the measurement matrix. Assuming that such an initialization is available is typically not realistic. In this paper, we solve this problem by proposing an AMP algorithm initialized with a spectral estimator. With such an initialization, the standard AMP analysis fails since the spectral estimator depends in a complicated way on the design matrix. Our main contribution is a rigorous characterization of the performance of AMP with spectral initialization in the high-dimensional limit. The key technical idea is to define and analyze a two-phase artificial AMP algorithm that first produces the spectral estimator, and then closely approximates the iterates of the true AMP. We also provide numerical results that demonstrate the validity of the proposed approach.},
  author       = {Mondelli, Marco and Venkataramanan, Ramji},
  issn         = {1742-5468},
  journal      = {Journal of Statistical Mechanics: Theory and Experiment},
  keywords     = {Statistics, Probability and Uncertainty, Statistics and Probability, Statistical and Nonlinear Physics},
  number       = {11},
  publisher    = {IOP Publishing},
  title        = {{Approximate message passing with spectral initialization for generalized linear models}},
  doi          = {10.1088/1742-5468/ac9828},
  volume       = {2022},
  year         = {2022},
}

@unpublished{12536,
  abstract     = {We consider the problem of estimating a rank-1 signal corrupted by structured rotationally invariant noise, and address the following question: how well do inference algorithms perform when the noise statistics is unknown and hence Gaussian noise is assumed? While the matched Bayes-optimal setting with unstructured noise is well understood, the analysis of this mismatched problem is still in its infancy. In this paper, we take a step towards understanding the effect of a strong source of mismatch, namely the noise statistics. Our main technical contribution is the rigorous analysis of a Bayes estimator and of an approximate message passing (AMP) algorithm, both of which incorrectly assume a Gaussian setup. The first result exploits the theory of spherical integrals and of low-rank matrix perturbations; the idea behind the second one is to design and analyze an artificial AMP which, by taking advantage of the flexibility in the denoisers, is able to "correct" the mismatch. Armed with these sharp asymptotic characterizations, we unveil a rich and often unexpected phenomenology. For example, although AMP is in principle designed to efficiently compute the Bayes estimator, it is outperformed by the latter in terms of mean-square error. We show that this performance gap is due to an incorrect estimation of the signal norm. In fact, when the SNR is large enough, the overlaps of the AMP and the Bayes estimator coincide, and they even match those of optimal estimators taking into account the structure of the noise.},
  author       = {Barbier, Jean and Hou, TianQi and Mondelli, Marco and Saenz, Manuel},
  booktitle    = {arXiv},
  title        = {{The price of ignorance: How much does it cost to forget noise structure in low-rank matrix estimation?}},
  doi          = {10.48550/arXiv.2205.10009},
  year         = {2022},
}

@inproceedings{12537,
  abstract     = {The Neural Tangent Kernel (NTK) has emerged as a powerful tool to provide memorization, optimization and generalization guarantees in deep neural networks. A line of work has studied the NTK spectrum for two-layer and deep networks with at least a layer with $\Omega(N)$ neurons, $N$ being the number of training samples. Furthermore, there is increasing evidence suggesting that deep networks with sub-linear layer widths are powerful memorizers and optimizers, as long as the number of parameters exceeds the number of samples. Thus, a natural open question is whether the NTK is well conditioned in such a challenging sub-linear setup. In this paper, we answer this question in the affirmative. Our key technical contribution is a lower bound on the smallest NTK eigenvalue for deep networks with the minimum possible over-parameterization: the number of parameters is roughly $\Omega(N)$ and, hence, the number of neurons is as few as $\Omega(\sqrt{N})$. To showcase the applicability of our NTK bounds, we provide two results concerning memorization capacity and optimization guarantees for gradient descent training.},
  author       = {Bombari, Simone and Amani, Mohammad Hossein and Mondelli, Marco},
  booktitle    = {36th Conference on Neural Information Processing Systems},
  isbn         = {9781713871088},
  pages        = {7628--7640},
  publisher    = {Curran Associates},
  title        = {{Memorization and optimization in deep neural networks with minimum over-parameterization}},
  volume       = {35},
  year         = {2022},
}
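
% Back-of-the-envelope reading (ours, not from the paper): a hidden layer of width $d$
% fed by a layer of width $d$ carries $d^2$ weights, so a parameter budget of $N$ allows
%   $d^2 \approx N \iff d \approx \sqrt{N}$;
% this is why $\Omega(N)$ parameters in the abstract above corresponds to as few as
% $\Omega(\sqrt{N})$ neurons.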

@inproceedings{12538,
  abstract     = {In this paper, we study the compression of a target two-layer neural network with $N$ nodes into a compressed network with $M < N$ nodes. More precisely, we consider the setting in which the weights of the target network are i.i.d. sub-Gaussian, and we minimize the population $L_2$ loss between the outputs of the target and of the compressed network, under the assumption of Gaussian inputs. By using tools from high-dimensional probability, we show that this non-convex problem can be simplified when the target network is sufficiently over-parameterized, and provide the error rate of this approximation as a function of the input dimension and $N$. In this mean-field limit, the simplified objective, as well as the optimal weights of the compressed network, does not depend on the realization of the target network, but only on expected scaling factors. Furthermore, for networks with ReLU activation, we conjecture that the optimum of the simplified optimization problem is achieved by taking weights on the Equiangular Tight Frame (ETF), while the scaling of the weights and the orientation of the ETF depend on the parameters of the target network. Numerical evidence is provided to support this conjecture.},
  author       = {Amani, Mohammad Hossein and Bombari, Simone and Mondelli, Marco and Pukdee, Rattana and Rini, Stefano},
  isbn         = {9781665483414},
  booktitle    = {2022 IEEE Information Theory Workshop},
  location     = {Mumbai, India},
  pages        = {588--593},
  publisher    = {IEEE},
  title        = {{Sharp asymptotics on the compression of two-layer neural networks}},
  doi          = {10.1109/ITW54588.2022.9965870},
  year         = {2022},
}

@inproceedings{12540,
  abstract     = {We consider the problem of signal estimation in generalized linear models defined via rotationally invariant design matrices. Since these matrices can have an arbitrary spectral distribution, this model is well suited for capturing complex correlation structures which often arise in applications. We propose a novel family of approximate message passing (AMP) algorithms for signal estimation, and rigorously characterize their performance in the high-dimensional limit via a state evolution recursion. Our rotationally invariant AMP has complexity of the same order as the existing AMP derived under the restrictive assumption of a Gaussian design; our algorithm also recovers this existing AMP as a special case. Numerical results showcase a performance close to Vector AMP (which is conjectured to be Bayes-optimal in some settings), but obtained with a much lower complexity, as the proposed algorithm does not require a computationally expensive singular value decomposition.},
  author       = {Venkataramanan, Ramji and Kögler, Kevin and Mondelli, Marco},
  booktitle    = {Proceedings of the 39th International Conference on Machine Learning},
  location     = {Baltimore, MD, United States},
  publisher    = {ML Research Press},
  title        = {{Estimation in rotationally invariant generalized linear models via approximate message passing}},
  volume       = {162},
  year         = {2022},
}

@unpublished{12860,
  abstract     = {Memorization of the relation between entities in a dataset can lead to privacy issues when using a trained model for question answering. We introduce Relational Memorization (RM) to understand, quantify and control this phenomenon. While bounding general memorization can have detrimental effects on the performance of a trained model, bounding RM does not prevent effective learning. The difference is most pronounced when the data distribution is long-tailed, with many queries having only a few training examples: impeding general memorization prevents effective learning, while impeding only relational memorization still allows learning general properties of the underlying concepts. We formalize the notion of Relational Privacy (RP) and, inspired by Differential Privacy (DP), we provide a possible definition of Differential Relational Privacy (DrP). These notions can be used to describe and compute bounds on the amount of RM in a trained model. We illustrate Relational Privacy concepts in experiments with large-scale models for Question Answering.},
  author       = {Bombari, Simone and Achille, Alessandro and Wang, Zijian and Wang, Yu-Xiang and Xie, Yusheng and Singh, Kunwar Yashraj and Appalaraju, Srikar and Mahadevan, Vijay and Soatto, Stefano},
  booktitle    = {arXiv},
  title        = {{Towards differential relational privacy and its use in question answering}},
  doi          = {10.48550/arXiv.2203.16701},
  year         = {2022},
}

@inproceedings{13146,
  abstract     = {A recent line of work has analyzed the theoretical properties of deep neural networks via the Neural Tangent Kernel (NTK). In particular, the smallest eigenvalue of the NTK has been related to the memorization capacity, the global convergence of gradient descent algorithms and the generalization of deep nets. However, existing results either provide bounds in the two-layer setting or assume that the spectrum of the NTK matrices is bounded away from 0 for multi-layer networks. In this paper, we provide tight bounds on the smallest eigenvalue of NTK matrices for deep ReLU nets, both in the limiting case of infinite widths and for finite widths. In the finite-width setting, the network architectures we consider are fairly general: we require the existence of a wide layer with on the order of $N$ neurons, $N$ being the number of data samples, while the scaling of the remaining layer widths is arbitrary (up to logarithmic factors). To obtain our results, we analyze various quantities of independent interest: we give lower bounds on the smallest singular value of hidden feature matrices, and upper bounds on the Lipschitz constant of input-output feature maps.},
  author       = {Nguyen, Quynh and Mondelli, Marco and Montufar, Guido},
  booktitle    = {Proceedings of the 38th International Conference on Machine Learning},
  isbn         = {9781713845065},
  issn         = {2640-3498},
  location     = {Virtual},
  pages        = {8119--8129},
  publisher    = {ML Research Press},
  title        = {{Tight bounds on the smallest eigenvalue of the neural tangent kernel for deep ReLU networks}},
  volume       = {139},
  year         = {2021},
}

@article{9002,
  abstract     = {We prove that, for the binary erasure channel (BEC), the polar-coding paradigm gives rise to codes that not only approach the Shannon limit but do so under the best possible scaling of their block length as a function of the gap to capacity. This result exhibits the first known family of binary codes that attain both optimal scaling and quasi-linear complexity of encoding and decoding. Our proof is based on the construction and analysis of binary polar codes with large kernels. When communicating reliably at rates within $\varepsilon > 0$ of capacity, the code length $n$ often scales as $O(1/\varepsilon^{\mu})$, where the constant $\mu$ is called the scaling exponent. It is known that the optimal scaling exponent is $\mu = 2$, and it is achieved by random linear codes. The scaling exponent of conventional polar codes (based on the $2 \times 2$ kernel) on the BEC is $\mu = 3.63$. This falls far short of the optimal scaling guaranteed by random codes. Our main contribution is a rigorous proof of the following result: for the BEC, there exist $\ell \times \ell$ binary kernels such that polar codes constructed from these kernels achieve a scaling exponent $\mu(\ell)$ that tends to the optimal value of 2 as $\ell$ grows. We furthermore characterize precisely how large $\ell$ needs to be as a function of the gap between $\mu(\ell)$ and 2. The resulting binary codes maintain the recursive structure of conventional polar codes, and thereby achieve construction complexity $O(n)$ and encoding/decoding complexity $O(n \log n)$.},
  author       = {Fazeli, Arman and Hassani, Hamed and Mondelli, Marco and Vardy, Alexander},
  issn         = {1557-9654},
  journal      = {IEEE Transactions on Information Theory},
  number       = {9},
  pages        = {5693--5710},
  publisher    = {IEEE},
  title        = {{Binary linear codes with optimal scaling: Polar codes with large kernels}},
  doi          = {10.1109/TIT.2020.3038806},
  volume       = {67},
  year         = {2021},
}
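
% To make the scaling exponent concrete (our own illustration): since
% $n = O(1/\varepsilon^{\mu})$, halving the gap to capacity $\varepsilon$ multiplies the
% required block length by $2^{\mu}$: a factor $2^{3.63} \approx 12.4$ for conventional
% polar codes on the BEC, versus a factor $4$ at the optimal $\mu = 2$.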

@article{9047,
  abstract     = {This work analyzes the latency of the simplified successive cancellation (SSC) decoding scheme for polar codes proposed by Alamdar-Yazdi and Kschischang. It is shown that, unlike conventional successive cancellation decoding, where latency is linear in the block length, the latency of SSC decoding is sublinear. More specifically, the latency of SSC decoding is $O(N^{1-1/\mu})$, where $N$ is the block length and $\mu$ is the scaling exponent of the channel, which captures the speed of convergence of the rate to capacity. Numerical results demonstrate the tightness of the bound and show that most of the latency reduction arises from the parallel decoding of subcodes of rate 0 or 1.},
  author       = {Mondelli, Marco and Hashemi, Seyyed Ali and Cioffi, John M. and Goldsmith, Andrea},
  issn         = {1558-2248},
  journal      = {IEEE Transactions on Wireless Communications},
  number       = {1},
  pages        = {18--27},
  publisher    = {IEEE},
  title        = {{Sublinear latency for simplified successive cancellation decoding of polar codes}},
  doi          = {10.1109/TWC.2020.3022922},
  volume       = {20},
  year         = {2021},
}
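
% Rough numbers (ours, plugging the BEC value $\mu = 3.63$ into the bound above): the
% latency scales as $N^{1 - 1/3.63} = N^{0.72}$; e.g., for $N = 2^{20}$ this is roughly
% $2^{14.5}$ time steps, versus $\Theta(N)$ for conventional successive cancellation.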

@inproceedings{10053,
  abstract     = {This paper characterizes the latency of the simplified successive-cancellation (SSC) decoding scheme for polar codes under hardware resource constraints. In particular, when the number of processing elements $P$ that can perform SSC decoding operations in parallel is limited, as is the case in practice, the latency of SSC decoding is $O\left(N^{1-1/\mu} + \frac{N}{P}\log_2\log_2\frac{N}{P}\right)$, where $N$ is the block length of the code and $\mu$ is the scaling exponent of polar codes for the channel. Three direct consequences of this bound are presented. First, in a fully-parallel implementation where $P = N/2$, the latency of SSC decoding is $O(N^{1-1/\mu})$, which is sublinear in the block length. This recovers a result from an earlier work. Second, in a fully-serial implementation where $P = 1$, the latency of SSC decoding scales as $O(N\log_2\log_2 N)$. The multiplicative constant is also calculated: we show that the latency of SSC decoding when $P = 1$ is given by $(2+o(1))N\log_2\log_2 N$. Third, in a semi-parallel implementation, the smallest $P$ that gives the same latency as that of the fully-parallel implementation is $P = N^{1/\mu}$. The tightness of our bound on SSC decoding latency and the applicability of the foregoing results is validated through extensive simulations.},
  author       = {Hashemi, Seyyed Ali and Mondelli, Marco and Fazeli, Arman and Vardy, Alexander and Cioffi, John and Goldsmith, Andrea},
  booktitle    = {2021 IEEE International Symposium on Information Theory},
  isbn         = {978-1-5386-8210-4},
  issn         = {2157-8095},
  location     = {Melbourne, Australia},
  pages        = {2369--2374},
  publisher    = {IEEE},
  title        = {{Parallelism versus latency in simplified successive-cancellation decoding of polar codes}},
  doi          = {10.1109/ISIT45174.2021.9518153},
  year         = {2021},
}
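
% Worked instance of the semi-parallel threshold (our own arithmetic): with
% $\mu = 3.63$ for the BEC, $P = N^{1/\mu}$ evaluates to
%   $P \approx 1024^{0.275} \approx 7$
% processing elements at block length $N = 1024$; additional parallelism beyond this
% does not improve the asymptotic latency.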

@article{10211,
  abstract     = {We study the problem of recovering an unknown signal $x$ given measurements obtained from a generalized linear model with a Gaussian sensing matrix. Two popular solutions are based on a linear estimator $\hat{x}^{\rm L}$ and a spectral estimator $\hat{x}^{\rm s}$. The former is a data-dependent linear combination of the columns of the measurement matrix, and its analysis is quite simple. The latter is the principal eigenvector of a data-dependent matrix, and a recent line of work has studied its performance. In this paper, we show how to optimally combine $\hat{x}^{\rm L}$ and $\hat{x}^{\rm s}$. At the heart of our analysis is the exact characterization of the empirical joint distribution of $(x, \hat{x}^{\rm L}, \hat{x}^{\rm s})$ in the high-dimensional limit. This allows us to compute the Bayes-optimal combination of $\hat{x}^{\rm L}$ and $\hat{x}^{\rm s}$, given the limiting distribution of the signal $x$. When the distribution of the signal is Gaussian, then the Bayes-optimal combination has the form $\theta\hat{x}^{\rm L} + \hat{x}^{\rm s}$, and we derive the optimal combination coefficient. In order to establish the limiting distribution of $(x, \hat{x}^{\rm L}, \hat{x}^{\rm s})$, we design and analyze an approximate message passing algorithm whose iterates give $\hat{x}^{\rm L}$ and approach $\hat{x}^{\rm s}$. Numerical simulations demonstrate the improvement of the proposed combination with respect to the two methods considered separately.},
  author       = {Mondelli, Marco and Thrampoulidis, Christos and Venkataramanan, Ramji},
  issn         = {1615-3383},
  journal      = {Foundations of Computational Mathematics},
  keywords     = {Applied Mathematics, Computational Theory and Mathematics, Computational Mathematics, Analysis},
  publisher    = {Springer},
  title        = {{Optimal combination of linear and spectral estimators for generalized linear models}},
  doi          = {10.1007/s10208-021-09531-x},
  year         = {2021},
}

@inproceedings{10593,
  abstract     = {We study the problem of estimating a rank-1 signal in the presence of rotationally invariant noise, a class of perturbations more general than Gaussian noise. Principal Component Analysis (PCA) provides a natural estimator, and sharp results on its performance have been obtained in the high-dimensional regime. Recently, an Approximate Message Passing (AMP) algorithm has been proposed as an alternative estimator with the potential to improve the accuracy of PCA. However, the existing analysis of AMP requires an initialization that is both correlated with the signal and independent of the noise, which is often unrealistic in practice. In this work, we combine the two methods, and propose to initialize AMP with PCA. Our main result is a rigorous asymptotic characterization of the performance of this estimator. Both the AMP algorithm and its analysis differ from those previously derived in the Gaussian setting: at every iteration, our AMP algorithm requires a specific term to account for PCA initialization, while in the Gaussian case, PCA initialization affects only the first iteration of AMP. The proof is based on a two-phase artificial AMP that first approximates the PCA estimator and then mimics the true AMP. Our numerical simulations show an excellent agreement between AMP results and theoretical predictions, and suggest an interesting open direction on achieving Bayes-optimal performance.},
  author       = {Mondelli, Marco and Venkataramanan, Ramji},
  booktitle    = {35th Conference on Neural Information Processing Systems},
  isbn         = {9781713845393},
  issn         = {1049-5258},
  location     = {Virtual},
  pages        = {29616--29629},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{PCA initialization for approximate message passing in rotationally invariant models}},
  volume       = {35},
  year         = {2021},
}

@inproceedings{10594,
  abstract     = {The question of how and why the phenomenon of mode connectivity occurs in training deep neural networks has gained remarkable attention in the research community. From a theoretical perspective, two possible explanations have been proposed: (i) the loss function has connected sublevel sets, and (ii) the solutions found by stochastic gradient descent are dropout stable. While these explanations provide insights into the phenomenon, their assumptions are not always satisfied in practice. In particular, the first approach requires the network to have a layer with on the order of $N$ neurons ($N$ being the number of training samples), while the second one requires the loss to be almost invariant after removing half of the neurons at each layer (up to some rescaling of the remaining ones). In this work, we improve both conditions by exploiting the quality of the features at every intermediate layer together with a milder over-parameterization condition. More specifically, we show that: (i) under generic assumptions on the features of intermediate layers, it suffices that the last two hidden layers have on the order of $\sqrt{N}$ neurons, and (ii) if subsets of features at each layer are linearly separable, then no over-parameterization is needed to show the connectivity. Our experiments confirm that the proposed condition ensures the connectivity of solutions found by stochastic gradient descent, even in settings where the previous requirements do not hold.},
  author       = {Nguyen, Quynh and Bréchet, Pierre and Mondelli, Marco},
  booktitle    = {35th Conference on Neural Information Processing Systems},
  isbn         = {9781713845393},
  issn         = {1049-5258},
  location     = {Virtual},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{When are solutions connected in deep networks?}},
  volume       = {35},
  year         = {2021},
}

