@unpublished{12750,
  abstract     = {Quantum kinetically constrained models have recently attracted significant attention due to their anomalous dynamics and thermalization. In this work, we introduce a hitherto unexplored family of kinetically constrained models featuring a conserved particle number and strong inversion-symmetry breaking due to facilitated hopping. We demonstrate that these models provide a generic example of so-called quantum Hilbert space fragmentation, that is manifested in disconnected sectors in the Hilbert space that are not apparent in the computational basis. Quantum Hilbert space fragmentation leads to an exponential in system size number of eigenstates with exactly zero entanglement entropy across several bipartite cuts. These eigenstates can be probed dynamically using quenches from simple initial product states. In addition, we study the particle spreading under unitary dynamics launched from the domain wall state, and find faster than diffusive dynamics at high particle densities, that crosses over into logarithmically slow relaxation at smaller densities. Using a classically simulable cellular automaton, we reproduce the logarithmic dynamics observed in the quantum case. Our work suggests that particle conserving constrained models with inversion symmetry breaking realize so far unexplored universality classes of dynamics and invite their further theoretical and experimental studies.},
  author       = {Brighi, Pietro and Ljubotina, Marko and Serbyn, Maksym},
  booktitle    = {arXiv},
  title        = {{Hilbert space fragmentation and slow dynamics in particle-conserving quantum East models}},
  doi          = {10.48550/arXiv.2210.15607},
  eprint       = {2210.15607},
  eprinttype   = {arXiv},
  note         = {Preprint},
  year         = {2022},
}

@inproceedings{12775,
  abstract     = {We consider the problem of approximating the reachability probabilities in Markov decision processes (MDP) with uncountable (continuous) state and action spaces. While there are algorithms that, for special classes of such MDP, provide a sequence of approximations converging to the true value in the limit, our aim is to obtain an algorithm with guarantees on the precision of the approximation.
As this problem is undecidable in general, assumptions on the MDP are necessary. Our main contribution is to identify sufficient assumptions that are as weak as possible, thus approaching the "boundary" of which systems can be correctly and reliably analyzed. To this end, we also argue why each of our assumptions is necessary for algorithms based on processing finitely many observations.
We present two solution variants. The first one provides converging lower bounds under weaker assumptions than typical ones from previous works concerned with guarantees. The second one then utilizes stronger assumptions to additionally provide converging upper bounds. Altogether, we obtain an anytime algorithm, i.e. yielding a sequence of approximants with known and iteratively improving precision, converging to the true value in the limit. Besides, due to the generality of our assumptions, our algorithms are very general templates, readily allowing for various heuristics from literature in contrast to, e.g., a specific discretization algorithm. Our theoretical contribution thus paves the way for future practical improvements without sacrificing correctness guarantees.},
  author       = {Grover, Kush and Kretinsky, Jan and Meggendorfer, Tobias and Weininger, Maximilian},
  booktitle    = {33rd International Conference on Concurrency Theory},
  issn         = {1868-8969},
  location     = {Warsaw, Poland},
  publisher    = {Schloss Dagstuhl - Leibniz-Zentrum für Informatik},
  title        = {{Anytime guarantees for reachability in uncountable Markov decision processes}},
  doi          = {10.4230/LIPIcs.CONCUR.2022.11},
  volume       = {243},
  year         = {2022},
}

@article{12776,
  abstract     = {An improved asymptotic formula is established for the number of rational points of bounded height on the split smooth del Pezzo surface of degree 5. The proof uses the five conic bundle structures on the surface.},
  author       = {Browning, Timothy D},
  issn         = {1076-9803},
  journal      = {New York Journal of Mathematics},
  pages        = {1193--1229},
  publisher    = {State University of New York},
  title        = {{Revisiting the Manin–Peyre conjecture for the split del Pezzo surface of degree 5}},
  volume       = {28},
  year         = {2022},
}

@inproceedings{12780,
  author       = {Markov, Ilia and Ramezanikebrya, Hamidreza and Alistarh, Dan-Adrian},
  title        = {{CGX: Adaptive system support for communication-efficient deep learning}},
  booktitle    = {Proceedings of the 23rd ACM/IFIP International Middleware Conference},
  pages        = {241--254},
  publisher    = {Association for Computing Machinery},
  location     = {Quebec, QC, Canada},
  year         = {2022},
  isbn         = {9781450393409},
  doi          = {10.1145/3528535.3565248},
  abstract     = {The ability to scale out training workloads has been one of the key performance enablers of deep learning. The main scaling approach is data-parallel GPU-based training, which has been boosted by hardware and software support for highly efficient point-to-point communication, and in particular via hardware bandwidth over-provisioning. Overprovisioning comes at a cost: there is an order of magnitude price difference between "cloud-grade" servers with such support, relative to their popular "consumer-grade" counterparts, although single server-grade and consumer-grade GPUs can have similar computational envelopes.

In this paper, we show that the costly hardware overprovisioning approach can be supplanted via algorithmic and system design, and propose a framework called CGX, which provides efficient software support for compressed communication in ML applications, for both multi-GPU single-node training, as well as larger-scale multi-node training. CGX is based on two technical advances: At the system level, it relies on a re-developed communication stack for ML frameworks, which provides flexible, highly-efficient support for compressed communication. At the application level, it provides seamless, parameter-free integration with popular frameworks, so that end-users do not have to modify training recipes, nor significant training code. This is complemented by a layer-wise adaptive compression technique which dynamically balances compression gains with accuracy preservation. CGX integrates with popular ML frameworks, providing up to 3X speedups for multi-GPU nodes based on commodity hardware, and order-of-magnitude improvements in the multi-node setting, with negligible impact on accuracy.},
}

@article{12793,
  abstract     = {Let F be a global function field with constant field $\mathbb{F}_q$. Let G be a reductive group over $\mathbb{F}_q$. We establish a variant of Arthur's truncated kernel for G and for its Lie algebra which generalizes Arthur's original construction. We establish a coarse geometric expansion for our variant truncation.
As applications, we consider some existence and uniqueness problems of some cuspidal automorphic representations for the function field of the projective line $\mathbb{P}^1_{\mathbb{F}_q}$ with two points of ramifications.},
  author       = {Yu, Hongjie},
  issn         = {1945-5844},
  journal      = {Pacific Journal of Mathematics},
  keywords     = {Arthur–Selberg trace formula, cuspidal automorphic representations, global function fields},
  number       = {1},
  pages        = {193--237},
  publisher    = {Mathematical Sciences Publishers},
  title        = {{A coarse geometric expansion of a variant of Arthur's truncated traces and some applications}},
  doi          = {10.2140/pjm.2022.321.193},
  volume       = {321},
  year         = {2022},
}

@unpublished{12860,
  abstract     = {Memorization of the relation between entities in a dataset can lead to privacy issues when using a trained model for question answering. We introduce Relational Memorization (RM) to understand, quantify and control this phenomenon. While bounding general memorization can have detrimental effects on the performance of a trained model, bounding RM does not prevent effective learning. The difference is most pronounced when the data distribution is long-tailed, with many queries having only few training examples: Impeding general memorization prevents effective learning, while impeding only relational memorization still allows learning general properties of the underlying concepts. We formalize the notion of Relational Privacy (RP) and, inspired by Differential Privacy (DP), we provide a possible definition of Differential Relational Privacy (DrP). These notions can be used to describe and compute bounds on the amount of RM in a trained model. We illustrate Relational Privacy concepts in experiments with large-scale models for Question Answering.},
  author       = {Bombari, Simone and Achille, Alessandro and Wang, Zijian and Wang, Yu-Xiang and Xie, Yusheng and Singh, Kunwar Yashraj and Appalaraju, Srikar and Mahadevan, Vijay and Soatto, Stefano},
  booktitle    = {arXiv},
  title        = {{Towards differential relational privacy and its use in question answering}},
  doi          = {10.48550/arXiv.2203.16701},
  eprint       = {2203.16701},
  eprinttype   = {arXiv},
  note         = {Preprint},
  year         = {2022},
}

@misc{13064,
  abstract     = {Genetically informed, deep-phenotyped biobanks are an important research resource and it is imperative that the most powerful, versatile, and efficient analysis approaches are used. Here, we apply our recently developed Bayesian grouped mixture of regressions model (GMRM) in the UK and Estonian Biobanks and obtain the highest genomic prediction accuracy reported to date across 21 heritable traits. When compared to other approaches, GMRM accuracy was greater than annotation prediction models run in the LDAK or LDPred-funct software by 15% (SE 7%) and 14% (SE 2%), respectively, and was 18% (SE 3%) greater than a baseline BayesR model without single-nucleotide polymorphism (SNP) markers grouped into minor allele frequency–linkage disequilibrium (MAF-LD) annotation categories. For height, the prediction accuracy $R^2$ was 47% in a UK Biobank holdout sample, which was 76% of the estimated $h^2_{\mathrm{SNP}}$. We then extend our GMRM prediction model to provide mixed-linear model association (MLMA) SNP marker estimates for genome-wide association (GWAS) discovery, which increased the independent loci detected to 16,162 in unrelated UK Biobank individuals, compared to 10,550 from BoltLMM and 10,095 from Regenie, a 62 and 65% increase, respectively. The average $\chi^2$ value of the leading markers increased by 15.24 (SE 0.41) for every 1% increase in prediction accuracy gained over a baseline BayesR model across the traits. Thus, we show that modeling genetic associations accounting for MAF and LD differences among SNP markers, and incorporating prior knowledge of genomic function, is important for both genomic prediction and discovery in large-scale individual-level studies.},
  author       = {Orliac, Etienne and Trejo Banos, Daniel and Ojavee, Sven and Läll, Kristi and Mägi, Reedik and Visscher, Peter and Robinson, Matthew Richard},
  publisher    = {Dryad},
  title        = {{Improving genome-wide association discovery and genomic prediction accuracy in biobank data}},
  doi          = {10.5061/DRYAD.GTHT76HMZ},
  year         = {2022},
}

@misc{13066,
  abstract     = {Chromosomal inversions have been shown to play a major role in local adaptation by suppressing recombination between alternative arrangements and maintaining beneficial allele combinations. However, so far, their importance relative to the remaining genome remains largely unknown. Understanding the genetic architecture of adaptation requires better estimates of how loci of different effect sizes contribute to phenotypic variation. Here, we used three Swedish islands where the marine snail Littorina saxatilis has repeatedly evolved into two distinct ecotypes along a habitat transition. We estimated the contribution of inversion polymorphisms to phenotypic divergence while controlling for polygenic effects in the remaining genome using a quantitative genetics framework. We confirmed the importance of inversions but showed that contributions of loci outside inversions are of similar magnitude, with variable proportions dependent on the trait and the population. Some inversions showed consistent effects across all sites, whereas others exhibited site-specific effects, indicating that the genomic basis for replicated phenotypic divergence is only partly shared. The contributions of sexual dimorphism as well as environmental factors to phenotypic variation were significant but minor compared to inversions and polygenic background. Overall, this integrated approach provides insight into the multiple mechanisms contributing to parallel phenotypic divergence.},
  author       = {Koch, Eva and Ravinet, Mark and Westram, Anja M and Johannesson, Kerstin and Butlin, Roger},
  publisher    = {Dryad},
  title        = {{Data from: Genetic architecture of repeated phenotypic divergence in Littorina saxatilis ecotype evolution}},
  doi          = {10.5061/DRYAD.M905QFV4B},
  year         = {2022},
}

@misc{13076,
  author       = {Postnikova, Anastasiia and Koval, Nikita and Nadiradze, Giorgi and Alistarh, Dan-Adrian},
  title        = {{Multi-queues can be state-of-the-art priority schedulers}},
  publisher    = {Zenodo},
  year         = {2022},
  doi          = {10.5281/ZENODO.5733408},
  abstract     = {The source code for replicating experiments presented in the paper.

The implementation of the designed priority schedulers can be found in Galois-2.2.1/include/Galois/WorkList/:
StealingMultiQueue.h is the StealingMultiQueue.
MQOptimized/ contains MQ Optimized variants.

We provide images that contain all the dependencies and datasets. Images can be pulled from npostnikova/mq-based-schedulers repository, or downloaded from Zenodo. See readme for more detail.},
}

@inproceedings{13239,
  author       = {Van Der Plas, Thijs L. and Vogels, Tim P and Manohar, Sanjay G.},
  title        = {{Predictive learning enables neural networks to learn complex working memory tasks}},
  booktitle    = {Proceedings of Machine Learning Research},
  volume       = {199},
  pages        = {518--531},
  publisher    = {ML Research Press},
  year         = {2022},
  issn         = {2640-3498},
  abstract     = {Brains are thought to engage in predictive learning - learning to predict upcoming stimuli - to construct an internal model of their environment. This is especially notable for spatial navigation, as first described by Tolman’s latent learning tasks. However, predictive learning has also been observed in sensory cortex, in settings unrelated to spatial navigation. Apart from normative frameworks such as active inference or efficient coding, what could be the utility of learning to predict the patterns of occurrence of correlated stimuli? Here we show that prediction, and thereby the construction of an internal model of sequential stimuli, can bootstrap the learning process of a working memory task in a recurrent neural network. We implemented predictive learning alongside working memory match-tasks, and networks emerged to solve the prediction task first by encoding information across time to predict upcoming stimuli, and then eavesdropped on this solution to solve the matching task. Eavesdropping was most beneficial when neural resources were limited. Hence, predictive learning acts as a general neural mechanism to learn to store sensory information that can later be essential for working memory tasks.},
}

@article{13240,
  author       = {Ingole, Kishor D. and Nagarajan, Nithya and Uhse, Simon and Giannini, Caterina and Djamei, Armin},
  title        = {{Tetracycline-controlled (TetON) gene expression system for the smut fungus Ustilago maydis}},
  journal      = {Frontiers in Fungal Biology},
  volume       = {3},
  publisher    = {Frontiers Media},
  year         = {2022},
  issn         = {2673-6128},
  doi          = {10.3389/ffunb.2022.1029114},
  abstract     = {Ustilago maydis is a biotrophic phytopathogenic fungus that causes corn smut disease. As a well-established model system, U. maydis is genetically fully accessible with large omics datasets available and subject to various biological questions ranging from DNA-repair, RNA-transport, and protein secretion to disease biology. For many genetic approaches, tight control of transgene regulation is important. Here we established an optimised version of the Tetracycline-ON (TetON) system for U. maydis. We demonstrate the Tetracycline concentration-dependent expression of fluorescent protein transgenes and the system’s suitability for the induced expression of the toxic protein BCL2 Associated X-1 (Bax1). The Golden Gate compatible vector system contains a native minimal promoter from the mating factor a-1 encoding gene, mfa with ten copies of the tet-regulated operator (tetO) and a codon optimised Tet-repressor (tetR*) which is translationally fused to the native transcriptional corepressor Mql1 (UMAG_05501). The metabolism-independent transcriptional regulator system is functional both, in liquid culture as well as on solid media in the presence of the inducer and can become a useful tool for toxin-antitoxin studies, identification of antifungal proteins, and to study functions of toxic gene products in Ustilago maydis.},
}

@inproceedings{13241,
  author       = {Konstantinov, Nikola H and Lampert, Christoph},
  title        = {{On the impossibility of fairness-aware learning from corrupted data}},
  booktitle    = {Proceedings of Machine Learning Research},
  volume       = {171},
  pages        = {59--83},
  publisher    = {ML Research Press},
  year         = {2022},
  issn         = {2640-3498},
  abstract     = {Addressing fairness concerns about machine learning models is a crucial step towards their long-term adoption in real-world automated systems. Many approaches for training fair models from data have been developed and an implicit assumption about such algorithms is that they are able to recover a fair model, despite potential historical biases in the data. In this work we show a number of impossibility results that indicate that there is no learning algorithm that can recover a fair model when a proportion of the dataset is subject to arbitrary manipulations. Specifically, we prove that there are situations in which an adversary can force any learner to return a biased classifier, with or without degrading accuracy, and that the strength of this bias increases for learning problems with underrepresented protected groups in the data. Our results emphasize on the importance of studying further data corruption models of various strength and of establishing stricter data collection practices for fairness-aware learning.},
}

@inproceedings{14093,
  abstract     = {We propose a stochastic conditional gradient method (CGM) for minimizing convex finite-sum objectives formed as a sum of smooth and non-smooth terms. Existing CGM variants for this template either suffer from slow convergence rates, or require carefully increasing the batch size over the course of the algorithm’s execution, which leads to computing full gradients. In contrast, the proposed method, equipped with a stochastic average gradient (SAG) estimator, requires only one sample per iteration. Nevertheless, it guarantees fast convergence rates on par with more sophisticated variance reduction techniques. In applications we put special emphasis on problems with a large number of separable constraints. Such problems are prevalent among semidefinite programming (SDP) formulations arising in machine learning and theoretical computer science. We provide numerical experiments on matrix completion, unsupervised clustering, and sparsest-cut SDPs.},
  author       = {Dresdner, Gideon and Vladarean, Maria-Luiza and Rätsch, Gunnar and Locatello, Francesco and Cevher, Volkan and Yurtsever, Alp},
  booktitle    = {Proceedings of the 25th International Conference on Artificial Intelligence and Statistics},
  issn         = {2640-3498},
  location     = {Virtual},
  pages        = {8439--8457},
  publisher    = {ML Research Press},
  title        = {{Faster one-sample stochastic conditional gradient method for composite convex minimization}},
  volume       = {151},
  year         = {2022},
}

@inproceedings{14106,
  author       = {Lohaus, Michael and Kleindessner, Matthäus and Kenthapadi, Krishnaram and Locatello, Francesco and Russell, Chris},
  title        = {{Are two heads the same as one? Identifying disparate treatment in fair neural networks}},
  booktitle    = {36th Conference on Neural Information Processing Systems},
  volume       = {35},
  pages        = {16548--16562},
  publisher    = {Neural Information Processing Systems Foundation},
  location     = {New Orleans, LA, United States},
  year         = {2022},
  isbn         = {9781713871088},
  abstract     = {We show that deep networks trained to satisfy demographic parity often do so
through a form of race or gender awareness, and that the more we force a network
to be fair, the more accurately we can recover race or gender from the internal state
of the network. Based on this observation, we investigate an alternative fairness
approach: we add a second classification head to the network to explicitly predict
the protected attribute (such as race or gender) alongside the original task. After
training the two-headed network, we enforce demographic parity by merging the
two heads, creating a network with the same architecture as the original network.
We establish a close relationship between existing approaches and our approach
by showing (1) that the decisions of a fair classifier are well-approximated by our
approach, and (2) that an unfair and optimally accurate classifier can be recovered
from a fair classifier and our second head predicting the protected attribute. We use
our explicit formulation to argue that the existing fairness approaches, just as ours,
demonstrate disparate treatment and that they are likely to be unlawful in a wide
range of scenarios under US law.},
}

@inproceedings{14107,
  author       = {Yao, Jian and Hong, Yuxin and Wang, Chiyu and Xiao, Tianjun and He, Tong and Locatello, Francesco and Wipf, David and Fu, Yanwei and Zhang, Zheng},
  title        = {{Self-supervised amodal video object segmentation}},
  booktitle    = {36th Conference on Neural Information Processing Systems},
  location     = {New Orleans, LA, United States},
  year         = {2022},
  doi          = {10.48550/arXiv.2210.12733},
  abstract     = {Amodal perception requires inferring the full shape of an object that is partially occluded. This task is particularly challenging on two levels: (1) it requires more information than what is contained in the instant retina or imaging sensor, (2) it is difficult to obtain enough well-annotated amodal labels for supervision. To this end, this paper develops a new framework of
Self-supervised amodal Video object segmentation (SaVos). Our method efficiently leverages the visual information of video temporal sequences to infer the amodal mask of objects. The key intuition is that the occluded part of an object can be explained away if that part is visible in other frames, possibly deformed as long as the deformation can be reasonably learned.
Accordingly, we derive a novel self-supervised learning paradigm that efficiently utilizes the visible object parts as the supervision to guide the training on videos. In addition to learning type prior to complete masks for known types, SaVos also learns the spatiotemporal prior, which is also useful for the amodal task and could generalize to unseen types. The proposed
framework achieves the state-of-the-art performance on the synthetic amodal segmentation benchmark FISHBOWL and the real world benchmark KINS-Video-Car. Further, it lends itself well to being transferred to novel distributions using test-time adaptation, outperforming existing models even after the transfer to a new distribution.},
}

@inproceedings{14114,
  abstract     = {Algorithmic fairness is frequently motivated in terms of a trade-off in which overall performance is decreased so as to improve performance on disadvantaged groups where the algorithm would otherwise be less accurate. Contrary to this, we find that applying existing fairness approaches to computer vision improve fairness by degrading the performance of classifiers across all groups (with increased degradation on the best performing groups). Extending the bias-variance decomposition for classification to fairness, we theoretically explain why the majority of fairness methods designed for low capacity models should not be used in settings involving high-capacity models, a scenario common to computer vision. We corroborate this analysis with extensive experimental support that shows that many of the fairness heuristics used in computer vision also degrade performance on the most disadvantaged groups. Building on these insights, we propose an adaptive augmentation strategy that, uniquely, of all methods tested, improves performance for the disadvantaged groups.},
  author       = {Zietlow, Dominik and Lohaus, Michael and Balakrishnan, Guha and Kleindessner, Matthäus and Locatello, Francesco and Schölkopf, Bernhard and Russell, Chris},
  booktitle    = {2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  isbn         = {9781665469470},
  issn         = {2575-7075},
  location     = {New Orleans, LA, United States},
  pages        = {10400--10411},
  publisher    = {Institute of Electrical and Electronics Engineers},
  title        = {{Leveling down in computer vision: Pareto inefficiencies in fair deep classifiers}},
  doi          = {10.1109/cvpr52688.2022.01016},
  year         = {2022},
}

@inproceedings{14168,
  abstract     = {Recent work has seen the development of general purpose neural architectures
that can be trained to perform tasks across diverse data modalities. General
purpose models typically make few assumptions about the underlying
data-structure and are known to perform well in the large-data regime. At the
same time, there has been growing interest in modular neural architectures that
represent the data using sparsely interacting modules. These models can be more
robust out-of-distribution, computationally efficient, and capable of
sample-efficient adaptation to new data. However, they tend to make
domain-specific assumptions about the data, and present challenges in how
module behavior (i.e., parameterization) and connectivity (i.e., their layout)
can be jointly learned. In this work, we introduce a general purpose, yet
modular neural architecture called Neural Attentive Circuits (NACs) that
jointly learns the parameterization and a sparse connectivity of neural modules
without using domain knowledge. NACs are best understood as the combination of
two systems that are jointly trained end-to-end: one that determines the module
configuration and the other that executes it on an input. We demonstrate
qualitatively that NACs learn diverse and meaningful module configurations on
the NLVR2 dataset without additional supervision. Quantitatively, we show that
by incorporating modularity in this way, NACs improve upon a strong non-modular
baseline in terms of low-shot adaptation on CIFAR and CUBs dataset by about
10%, and OOD robustness on Tiny ImageNet-R by about 2.5%. Further, we find that
NACs can achieve an 8x speedup at inference time while losing less than 3%
performance. Finally, we find NACs to yield competitive results on diverse data
modalities spanning point-cloud classification, symbolic processing and
text-classification from ASCII bytes, thereby confirming its general purpose
nature.},
  author       = {Rahaman, Nasim and Weiss, Martin and Locatello, Francesco and Pal, Chris and Bengio, Yoshua and Schölkopf, Bernhard and Li, Li Erran and Ballas, Nicolas},
  booktitle    = {36th Conference on Neural Information Processing Systems},
  location     = {New Orleans, LA, United States},
  title        = {{Neural attentive circuits}},
  volume       = {35},
  year         = {2022},
}

@inproceedings{14170,
  abstract     = {The idea behind object-centric representation learning is that natural scenes can better be modeled as compositions of objects and their relations as opposed to distributed representations. This inductive bias can be injected into neural networks to potentially improve systematic generalization and performance of downstream tasks in scenes with multiple objects. In this paper, we train state-of-the-art unsupervised models on five common multi-object datasets and evaluate segmentation metrics and downstream object property prediction. In addition, we study generalization and robustness by investigating the settings where either a single object is out of distribution -- e.g., having an unseen color, texture, or shape -- or global properties of the scene are altered -- e.g., by occlusions, cropping, or increasing the number of objects. From our experimental study, we find object-centric representations to be useful for
downstream tasks and generally robust to most distribution shifts affecting objects. However, when the distribution shift affects the input in a less structured manner, robustness in terms of segmentation and downstream task performance may vary significantly across models and distribution shifts.},
  author       = {Dittadi, Andrea and Papa, Samuele and De Vita, Michele and Schölkopf, Bernhard and Winther, Ole and Locatello, Francesco},
  booktitle    = {Proceedings of the 39th International Conference on Machine Learning},
  location     = {Baltimore, MD, United States},
  pages        = {5221--5285},
  publisher    = {ML Research Press},
  title        = {{Generalization and robustness implications in object-centric learning}},
  volume       = {162},
  year         = {2022},
}

@inproceedings{14171,
  abstract     = {This paper demonstrates how to recover causal graphs from the score of the
data distribution in non-linear additive (Gaussian) noise models. Using score
matching algorithms as a building block, we show how to design a new generation
of scalable causal discovery methods. To showcase our approach, we also propose
a new efficient method for approximating the score's Jacobian, enabling to
recover the causal graph. Empirically, we find that the new algorithm, called
SCORE, is competitive with state-of-the-art causal discovery methods while
being significantly faster.},
  author       = {Rolland, Paul and Cevher, Volkan and Kleindessner, Matthäus and Russell, Chris and Schölkopf, Bernhard and Janzing, Dominik and Locatello, Francesco},
  booktitle    = {Proceedings of the 39th International Conference on Machine Learning},
  location     = {Baltimore, MD, United States},
  pages        = {18741--18753},
  publisher    = {ML Research Press},
  title        = {{Score matching enables causal discovery of nonlinear additive noise models}},
  volume       = {162},
  year         = {2022},
}

@inproceedings{14172,
  abstract     = {An important component for generalization in machine learning is to uncover underlying latent factors of variation as well as the mechanism through which each factor acts in the world. In this paper, we test whether 17 unsupervised, weakly supervised, and fully supervised representation learning approaches correctly infer the generative factors of variation in simple datasets (dSprites, Shapes3D, MPI3D) from controlled environments, and on our contributed CelebGlow dataset. In contrast to prior robustness work that introduces novel factors of variation during test time, such as blur or other (un)structured noise, we here recompose, interpolate, or extrapolate only existing factors of variation from the training data set (e.g., small and medium-sized objects during training and large objects during testing). Models
that learn the correct mechanism should be able to generalize to this benchmark. In total, we train and test 2000+ models and observe that all of them struggle to learn the underlying mechanism regardless of supervision signal and architectural bias. Moreover, the generalization capabilities of all tested models drop significantly as we move from artificial datasets towards
more realistic real-world datasets. Despite their inability to identify the correct mechanism, the models are quite modular as their ability to infer other in-distribution factors remains fairly stable, providing only a single factor is out-of-distribution. These results point to an important yet understudied problem of learning mechanistic models of observations that can facilitate
generalization.},
  author       = {Schott, Lukas and von Kügelgen, Julius and Träuble, Frederik and Gehler, Peter and Russell, Chris and Bethge, Matthias and Schölkopf, Bernhard and Locatello, Francesco and Brendel, Wieland},
  booktitle    = {10th International Conference on Learning Representations},
  location     = {Virtual},
  title        = {{Visual representation learning does not generalize strongly within the same domain}},
  year         = {2022},
}