@article{15013,
  abstract     = {We consider random n×n matrices X with independent and centered entries and a general variance profile. We show that the spectral radius of X converges with very high probability to the square root of the spectral radius of the variance matrix of X when n tends to infinity. We also establish the optimal rate of convergence, that is a new result even for general i.i.d. matrices beyond the explicitly solvable Gaussian cases. The main ingredient is the proof of the local inhomogeneous circular law [arXiv:1612.07776] at the spectral edge.},
  author       = {Alt, Johannes and Erdős, László and Krüger, Torben H},
  issn         = {2690-1005},
  journal      = {Probability and Mathematical Physics},
  number       = {2},
  pages        = {221--280},
  publisher    = {Mathematical Sciences Publishers},
  title        = {{Spectral radius of random matrices with independent entries}},
  doi          = {10.2140/pmp.2021.2.221},
  volume       = {2},
  year         = {2021},
}

@misc{13057,
  abstract     = {This dataset comprises all data shown in the figures of the submitted article "Geometric superinductance qubits: Controlling phase delocalization across a single Josephson junction". Additional raw data are available from the corresponding author on reasonable request.},
  author       = {Peruzzo, Matilda and Hassani, Farid and Szep, Grisha and Trioni, Andrea and Redchenko, Elena and Zemlicka, Martin and Fink, Johannes M},
  publisher    = {Zenodo},
  title        = {{Geometric superinductance qubits: Controlling phase delocalization across a single Josephson junction}},
  doi          = {10.5281/zenodo.5592103},
  year         = {2021},
}

@misc{13058,
  abstract     = {The zip file includes source data used in the main text of the manuscript "Theory of branching morphogenesis by local interactions and global guidance", as well as a representative Jupyter notebook to reproduce the main figures. A sample script for the simulations of branching and annihilating random walks is also included (Sample_script_for_simulations_of_BARWs.ipynb) to generate exemplary branched networks under external guidance. A detailed description of the simulation setup is provided in the supplementary information of the manuscript.},
  author       = {Ucar, Mehmet C},
  publisher    = {Zenodo},
  title        = {{Source data for the manuscript "Theory of branching morphogenesis by local interactions and global guidance"}},
  doi          = {10.5281/zenodo.5257160},
  year         = {2021},
}

@misc{13061,
  abstract     = {Infections early in life can have enduring effects on an organism’s development and immunity. In this study, we show that this equally applies to developing “superorganisms” – incipient social insect colonies. When we exposed newly mated Lasius niger ant queens to a low pathogen dose, their colonies grew more slowly than controls before winter, but reached similar sizes afterwards. Independent of exposure, queen hibernation survival improved when the ratio of pupae to workers was small. Queens that reared fewer pupae before worker emergence exhibited lower pathogen levels, indicating that high brood rearing efforts interfere with the ability of the queen’s immune system to suppress pathogen proliferation. Early-life queen pathogen-exposure also improved the immunocompetence of her worker offspring, as demonstrated by challenging the workers to the same pathogen a year later. Transgenerational transfer of the queen’s pathogen experience to her workforce can hence durably reduce the disease susceptibility of the whole superorganism.},
  author       = {Casillas Perez, Barbara E and Pull, Christopher and Naiser, Filip and Naderlinger, Elisabeth and Matas, Jiri and Cremer, Sylvia},
  publisher    = {Dryad},
  title        = {{Early queen infection shapes developmental dynamics and induces long-term disease protection in incipient ant colonies}},
  doi          = {10.5061/dryad.7pvmcvdtj},
  year         = {2021},
}

@misc{13062,
  abstract     = {This paper analyzes the conditions for local adaptation in a metapopulation with infinitely many islands under a model of hard selection, where population size depends on local fitness. Each island belongs to one of two distinct ecological niches or habitats. Fitness is influenced by an additive trait which is under habitat-dependent directional selection. Our analysis is based on the diffusion approximation and  accounts for both genetic drift and demographic stochasticity. By neglecting linkage disequilibria, it yields the joint distribution of allele frequencies and population size on each island. We find that under hard selection, the conditions for local adaptation in a rare habitat are more restrictive for more polygenic traits: even moderate migration load per locus at very many loci is sufficient for population sizes to decline. This further reduces the efficacy of selection at individual loci due to increased drift and because smaller populations are more prone to swamping due to migration, causing a positive feedback between increasing maladaptation and declining population sizes. Our analysis also highlights the importance of demographic stochasticity, which  exacerbates the decline in numbers of maladapted populations, leading to population collapse in the rare habitat at significantly lower migration than predicted by deterministic arguments.},
  author       = {Szep, Eniko and Sachdeva, Himani and Barton, Nicholas H},
  publisher    = {Dryad},
  title        = {{Supplementary code for: Polygenic local adaptation in metapopulations: A stochastic eco-evolutionary model}},
  doi          = {10.5061/dryad.8gtht76p1},
  year         = {2021},
}

@misc{13063,
  abstract     = {We develop a Bayesian model (BayesRR-RC) that provides robust SNP-heritability estimation, an alternative to marker discovery, and accurate genomic prediction, taking 22 seconds per iteration to estimate 8.4 million SNP-effects and 78 SNP-heritability parameters in the UK Biobank. We find that only $\leq$ 10\% of the genetic variation captured for height, body mass index, cardiovascular disease, and type 2 diabetes is attributable to proximal regulatory regions within 10kb upstream of genes, while 12-25\% is attributed to coding regions, 32-44\% to introns, and 22-28\% to distal 10-500kb upstream regions. Up to 24\% of all cis and coding regions of each chromosome are associated with each trait, with over 3,100 independent exonic and intronic regions and over 5,400 independent regulatory regions having $>$95\% probability of contributing $>$0.001\% to the genetic variance of these four traits. Our open-source software (GMRM) provides a scalable alternative to current approaches for biobank data.},
  author       = {Robinson, Matthew Richard},
  publisher    = {Dryad},
  title        = {{Probabilistic inference of the genetic architecture of functional enrichment of complex traits}},
  doi          = {10.5061/dryad.sqv9s4n51},
  year         = {2021},
}

@misc{13068,
  abstract     = {Source data and source code for the graphs in "Spatiotemporal dynamics of self-organized branching pancreatic cancer-derived organoids".},
  author       = {Randriamanantsoa, Samuel and Papargyriou, Aristeidis and Maurer, Carlo and Peschke, Katja and Schuster, Maximilian and Zecchin, Giulia and Steiger, Katja and Öllinger, Rupert and Saur, Dieter and Scheel, Christina and Rad, Roland and Hannezo, Edouard B and Reichert, Maximilian and Bausch, Andreas R.},
  publisher    = {Zenodo},
  title        = {{Spatiotemporal dynamics of self-organized branching in pancreas-derived organoids}},
  doi          = {10.5281/zenodo.5148117},
  year         = {2021},
}

@misc{13069,
  abstract     = {To survive elevated temperatures, ectotherms adjust the fluidity of membranes by fine-tuning lipid desaturation levels in a process previously described to be cell-autonomous. We have discovered that, in Caenorhabditis elegans, neuronal Heat shock Factor 1 (HSF-1), the conserved master regulator of the heat shock response (HSR)- causes extensive fat remodelling in peripheral tissues. These changes include a decrease in fat desaturase and acid lipase expression in the intestine, and a global shift in the saturation levels of plasma membrane’s phospholipids. The observed remodelling of plasma membrane is in line with ectothermic adaptive responses and gives worms a cumulative advantage to warm temperatures. We have determined that at least six TAX-2/TAX-4 cGMP gated channel expressing sensory neurons and TGF-β/BMP are required for signalling across tissues to modulate fat desaturation. We also find neuronal hsf-1  is not only sufficient but also partially necessary to control the fat remodelling response and for survival at warm temperatures. This is the first study to show that a thermostat-based mechanism can cell non-autonomously coordinate membrane saturation and composition across tissues in a multicellular animal.},
  author       = {Chauve, Laetitia and Hodge, Francesca and Murdoch, Sharlene and Masoudzadeh, Fatemah and Mann, Harry-Jack and Lopez-Clavijo, Andrea and Okkenhaug, Hanneke and West, Greg and Sousa, Bebiana C. and Segonds-Pichon, Anne and Li, Cheryl and Wingett, Steven and Kienberger, Hermine and Kleigrewe, Karin and de Bono, Mario and Wakelam, Michael and Casanueva, Olivia},
  publisher    = {Zenodo},
  title        = {{Neuronal HSF-1 coordinates the propagation of fat desaturation across tissues to enable adaptation to high temperatures in C. elegans}},
  doi          = {10.5281/zenodo.5519410},
  year         = {2021},
}

@misc{13072,
  abstract     = {CpGs and corresponding mean weights for DNAm-based prediction of cognitive abilities (6 traits)},
  author       = {McCartney, Daniel L and Hillary, Robert F and Conole, Eleanor LS and Trejo Banos, Daniel and Gadd, Danni A and Walker, Rosie M and Nangle, Cliff and Flaig, Robin and Campbell, Archie and Murray, Alison D and Munoz Maniega, Susana and del C Valdes-Hernandez, Maria and Harris, Mathew A and Bastin, Mark E and Wardlaw, Joanna M and Harris, Sarah E and Porteous, David J and Tucker-Drob, Elliot M and McIntosh, Andrew M and Evans, Kathryn L and Deary, Ian J and Cox, Simon R and Robinson, Matthew Richard and Marioni, Riccardo E},
  publisher    = {Zenodo},
  title        = {{Blood-based epigenome-wide analyses of cognitive abilities}},
  doi          = {10.5281/zenodo.5794028},
  year         = {2021},
}

@misc{13080,
  abstract     = {Data for the manuscript 'Closing of the Induced Gap in a Hybrid Superconductor-Semiconductor Nanowire' (arXiv:2006.01275).

We upload a pdf with extended data sets, and the raw data for these extended datasets as well.},
  author       = {Puglia, Denise and Martinez, Esteban and Menard, Gerbold and Pöschl, Andreas and Gronin, Sergei and Gardner, Geoffrey and Kallaher, Ray and Manfra, Michael and Marcus, Charles and Higginbotham, Andrew P and Casparis, Lucas},
  publisher    = {Zenodo},
  title        = {{Data for 'Closing of the Induced Gap in a Hybrid Superconductor-Semiconductor Nanowire'}},
  doi          = {10.5281/zenodo.4592435},
  year         = {2021},
}

@inproceedings{13146,
  abstract     = {A recent line of work has analyzed the theoretical properties of deep neural networks via the Neural Tangent Kernel (NTK). In particular, the smallest eigenvalue of the NTK has been related to the memorization capacity, the global convergence of gradient descent algorithms and the generalization of deep nets. However, existing results either provide bounds in the two-layer setting or assume that the spectrum of the NTK matrices is bounded away from 0 for multi-layer networks. In this paper, we provide tight bounds on the smallest eigenvalue of NTK matrices for deep ReLU nets, both in the limiting case of infinite widths and for finite widths. In the finite-width setting, the network architectures we consider are fairly general: we require the existence of a wide layer with roughly order of N neurons, N being the number of data samples; and the scaling of the remaining layer widths is arbitrary (up to logarithmic factors). To obtain our results, we analyze various quantities of independent interest: we give lower bounds on the smallest singular value of hidden feature matrices, and upper bounds on the Lipschitz constant of input-output feature maps.},
  author       = {Nguyen, Quynh and Mondelli, Marco and Montufar, Guido},
  booktitle    = {Proceedings of the 38th International Conference on Machine Learning},
  isbn         = {9781713845065},
  issn         = {2640-3498},
  location     = {Virtual},
  pages        = {8119--8129},
  publisher    = {ML Research Press},
  title        = {{Tight bounds on the smallest Eigenvalue of the neural tangent kernel for deep ReLU networks}},
  volume       = {139},
  year         = {2021},
}

@inproceedings{13147,
  abstract     = {We investigate fast and communication-efficient algorithms for the classic problem of minimizing a sum of strongly convex and smooth functions that are distributed among n
 different nodes, which can communicate using a limited number of bits. Most previous communication-efficient approaches for this problem are limited to first-order optimization, and therefore have \emph{linear} dependence on the condition number in their communication complexity. We show that this dependence is not inherent: communication-efficient methods can in fact have sublinear dependence on the condition number. For this, we design and analyze the first communication-efficient distributed variants of preconditioned gradient descent for Generalized Linear Models, and for Newton’s method. Our results rely on a new technique for quantizing both the preconditioner and the descent direction at each step of the algorithms, while controlling their convergence rate. We also validate our findings experimentally, showing faster convergence and reduced communication relative to previous methods.},
  author       = {Alimisis, Foivos and Davies, Peter and Alistarh, Dan-Adrian},
  booktitle    = {Proceedings of the 38th International Conference on Machine Learning},
  isbn         = {9781713845065},
  issn         = {2640-3498},
  location     = {Virtual},
  pages        = {196--206},
  publisher    = {ML Research Press},
  title        = {{Communication-efficient distributed optimization with quantized preconditioners}},
  volume       = {139},
  year         = {2021},
}

@article{14117,
  abstract     = {The two fields of machine learning and graphical causality arose and are developed separately. However, there is, now, cross-pollination and increasing interest in both fields to benefit from the advances of the other. In this article, we review fundamental concepts of causal inference and relate them to crucial open problems of machine learning, including transfer and generalization, thereby assaying how causality can contribute to modern machine learning research. This also applies in the opposite direction: we note that most work in causality starts from the premise that the causal variables are given. A central problem for AI and causality is, thus, causal representation learning, that is, the discovery of high-level causal variables from low-level observations. Finally, we delineate some implications of causality for machine learning and propose key research areas at the intersection of both communities.},
  author       = {Schölkopf, Bernhard and Locatello, Francesco and Bauer, Stefan and Ke, Nan Rosemary and Kalchbrenner, Nal and Goyal, Anirudh and Bengio, Yoshua},
  issn         = {1558-2256},
  journal      = {Proceedings of the IEEE},
  keywords     = {Electrical and Electronic Engineering},
  number       = {5},
  pages        = {612--634},
  publisher    = {Institute of Electrical and Electronics Engineers},
  title        = {{Toward causal representation learning}},
  doi          = {10.1109/jproc.2021.3058954},
  volume       = {109},
  year         = {2021},
}

@inproceedings{14176,
  abstract     = {Intensive care units (ICU) are increasingly looking towards machine learning for methods to provide online monitoring of critically ill patients. In machine learning, online monitoring is often formulated as a supervised learning problem. Recently, contrastive learning approaches have demonstrated promising improvements over competitive supervised benchmarks. These methods rely on well-understood data augmentation techniques developed for image data which do not apply to online monitoring. In this work, we overcome this limitation by
supplementing time-series data augmentation techniques with a novel contrastive
learning objective which we call neighborhood contrastive learning (NCL). Our objective explicitly groups together contiguous time segments from each patient while maintaining state-specific information. Our experiments demonstrate a marked improvement over existing work applying contrastive methods to medical time-series.},
  author       = {Yèche, Hugo and Dresdner, Gideon and Locatello, Francesco and Hüser, Matthias and Rätsch, Gunnar},
  booktitle    = {Proceedings of the 38th International Conference on Machine Learning},
  location     = {Virtual},
  pages        = {11964--11974},
  publisher    = {ML Research Press},
  title        = {{Neighborhood contrastive learning applied to online patient monitoring}},
  volume       = {139},
  year         = {2021},
}

@inproceedings{14177,
  abstract     = {The focus of disentanglement approaches has been on identifying independent factors of variation in data. However, the causal variables underlying real-world observations are often not statistically independent. In this work, we bridge the gap to real-world scenarios by analyzing the behavior of the most prominent disentanglement approaches on correlated data in a large-scale empirical study (including 4260 models). We show and quantify that systematically induced correlations in the dataset are being learned and reflected in the latent representations, which has implications for downstream applications of disentanglement such as fairness. We also demonstrate how to resolve these latent correlations, either using weak supervision during
training or by post-hoc correcting a pre-trained model with a small number of labels.},
  author       = {Träuble, Frederik and Creager, Elliot and Kilbertus, Niki and Locatello, Francesco and Dittadi, Andrea and Goyal, Anirudh and Schölkopf, Bernhard and Bauer, Stefan},
  booktitle    = {Proceedings of the 38th International Conference on Machine Learning},
  location     = {Virtual},
  pages        = {10401--10412},
  publisher    = {ML Research Press},
  title        = {{On disentangled representations learned from correlated data}},
  volume       = {139},
  year         = {2021},
}

@inproceedings{14178,
  abstract     = {Learning meaningful representations that disentangle the underlying structure of the data generating process is considered to be of key importance in machine learning. While disentangled representations were found to be useful for diverse tasks such as abstract reasoning and fair classification, their scalability and real-world impact remain questionable. We introduce a new high-resolution dataset with 1M simulated images and over 1,800 annotated real-world images of the same setup. In contrast to previous work, this new dataset exhibits correlations, a complex underlying structure, and allows to evaluate transfer to unseen simulated and real-world settings where the encoder i) remains in distribution or ii) is out of distribution. We propose new architectures in order to scale disentangled representation learning to realistic high-resolution settings and conduct a large-scale empirical study of disentangled representations on this dataset. We observe that disentanglement is a good predictor for out-of-distribution (OOD) task performance.},
  author       = {Dittadi, Andrea and Träuble, Frederik and Locatello, Francesco and Wüthrich, Manuel and Agrawal, Vaibhav and Winther, Ole and Bauer, Stefan and Schölkopf, Bernhard},
  booktitle    = {The Ninth International Conference on Learning Representations},
  location     = {Virtual},
  title        = {{On the transfer of disentangled representations in realistic settings}},
  year         = {2021},
}

@inproceedings{14179,
  abstract     = {Self-supervised representation learning has shown remarkable success in a number of domains. A common practice is to perform data augmentation via hand-crafted transformations intended to leave the semantics of the data invariant. We seek to understand the empirical success of this approach from a theoretical perspective. We formulate the augmentation process as a latent variable model by postulating a partition of the latent representation into a content component, which is assumed invariant to augmentation, and a style component, which is allowed to change. Unlike prior work on disentanglement and independent component analysis, we allow for both nontrivial statistical and causal dependencies in the latent space. We study the identifiability of the latent representation based on pairs of views of the observations and prove sufficient conditions that allow us to identify the invariant content partition up to an invertible mapping in both generative and discriminative settings. We find numerical simulations with dependent latent variables are consistent with our theory. Lastly, we introduce Causal3DIdent, a dataset of high-dimensional, visually complex images with rich causal dependencies, which we use to study the effect of data augmentations performed in practice.},
  author       = {von Kügelgen, Julius and Sharma, Yash and Gresele, Luigi and Brendel, Wieland and Schölkopf, Bernhard and Besserve, Michel and Locatello, Francesco},
  booktitle    = {Advances in Neural Information Processing Systems},
  isbn         = {9781713845393},
  location     = {Virtual},
  pages        = {16451--16467},
  title        = {{Self-supervised learning with data augmentations provably isolates content from style}},
  volume       = {34},
  year         = {2021},
}

@inproceedings{14180,
  abstract     = {Modern neural network architectures can leverage large amounts of data to generalize well within the training distribution. However, they are less capable of systematic generalization to data drawn from unseen but related distributions, a feat that is hypothesized to require compositional reasoning and reuse of knowledge. In this work, we present Neural Interpreters, an architecture that factorizes inference in a self-attention network as a system of modules, which we call \emph{functions}. Inputs to the model are routed through a sequence of functions in a way that is end-to-end learned. The proposed architecture can flexibly compose computation along width and depth, and lends itself well to capacity extension after training. To demonstrate the versatility of Neural Interpreters, we evaluate it in two distinct settings: image classification and visual abstract reasoning on Raven Progressive Matrices. In the former, we show that Neural Interpreters perform on par with the vision transformer using fewer parameters, while being transferrable to a new task in a sample efficient manner. In the latter, we find that Neural Interpreters are competitive with respect to the state-of-the-art in terms of systematic generalization. },
  author       = {Rahaman, Nasim and Gondal, Muhammad Waleed and Joshi, Shruti and Gehler, Peter and Bengio, Yoshua and Locatello, Francesco and Schölkopf, Bernhard},
  booktitle    = {Advances in Neural Information Processing Systems},
  isbn         = {9781713845393},
  location     = {Virtual},
  pages        = {10985--10998},
  title        = {{Dynamic inference with neural interpreters}},
  volume       = {34},
  year         = {2021},
}

@inproceedings{14181,
  abstract     = {Variational Inference makes a trade-off between the capacity of the variational family and the tractability of finding an approximate posterior distribution. Instead, Boosting Variational Inference allows practitioners to obtain increasingly good posterior approximations by spending more compute. The main obstacle to widespread adoption of Boosting Variational Inference is the amount of resources necessary to improve over a strong Variational Inference baseline. In our work, we trace this limitation back to the global curvature of the KL-divergence. We characterize how the global curvature impacts time and memory consumption, address the problem with the notion of local curvature, and provide a novel approximate backtracking algorithm for estimating local curvature. We give new theoretical convergence rates for our algorithms and provide experimental validation on synthetic and real-world datasets.},
  author       = {Dresdner, Gideon and Shekhar, Saurav and Pedregosa, Fabian and Locatello, Francesco and Rätsch, Gunnar},
  booktitle    = {Proceedings of the Thirtieth International Joint Conference on Artificial Intelligence},
  location     = {Montreal, Canada},
  pages        = {2337--2343},
  publisher    = {International Joint Conferences on Artificial Intelligence},
  title        = {{Boosting variational inference with locally adaptive step-sizes}},
  doi          = {10.24963/ijcai.2021/322},
  year         = {2021},
}

@inproceedings{14182,
  abstract     = {When machine learning systems meet real world applications, accuracy is only
one of several requirements. In this paper, we assay a complementary
perspective originating from the increasing availability of pre-trained and
regularly improving state-of-the-art models. While new improved models develop
at a fast pace, downstream tasks vary more slowly or stay constant. Assume that
we have a large unlabelled data set for which we want to maintain accurate
predictions. Whenever a new and presumably better ML models becomes available,
we encounter two problems: (i) given a limited budget, which data points should
be re-evaluated using the new model?; and (ii) if the new predictions differ
from the current ones, should we update? Problem (i) is about compute cost,
which matters for very large data sets and models. Problem (ii) is about
maintaining consistency of the predictions, which can be highly relevant for
downstream applications; our demand is to avoid negative flips, i.e., changing
correct to incorrect predictions. In this paper, we formalize the Prediction
Update Problem and present an efficient probabilistic approach as answer to the
above questions. In extensive experiments on standard classification benchmark
data sets, we show that our method outperforms alternative strategies along key
metrics for backward-compatible prediction updates.},
  author       = {Träuble, Frederik and von Kügelgen, Julius and Kleindessner, Matthäus and Locatello, Francesco and Schölkopf, Bernhard and Gehler, Peter},
  booktitle    = {Advances in Neural Information Processing Systems},
  isbn         = {9781713845393},
  location     = {Virtual},
  pages        = {116--128},
  title        = {{Backward-compatible prediction updates: A probabilistic approach}},
  volume       = {34},
  year         = {2021},
}

