@inproceedings{14213,
  abstract     = {We introduce a method to segment the visual field into independently moving regions, trained with no ground truth or supervision. It consists of an adversarial conditional encoder-decoder architecture based on Slot Attention, modified to use the image as context to decode optical flow without attempting to reconstruct the image itself. In the resulting multi-modal representation, one modality (flow) feeds the encoder to produce separate latent codes (slots), whereas the other modality (image) conditions the decoder to generate the first (flow) from the slots. This design frees the representation from having to encode complex nuisance variability in the image due to, for instance, illumination and reflectance properties of the scene. Since customary autoencoding based on minimizing the reconstruction error does not preclude the entire flow from being encoded into a single slot, we modify the loss to an adversarial criterion based on Contextual Information Separation. The resulting min-max optimization fosters the separation of objects and their assignment to different attention slots, leading to Divided Attention, or DivA. DivA outperforms recent unsupervised multi-object motion segmentation methods while tripling run-time speed up to 104FPS and reducing the performance gap from supervised methods to 12\% or less. DivA can handle different numbers of objects and different image sizes at training and test time, is invariant to permutation of object labels, and does not require explicit regularization.},
  author       = {Lao, Dong and Hu, Zhengyang and Locatello, Francesco and Yang, Yanchao and Soatto, Stefano},
  booktitle    = {1st Conference on Parsimony and Learning},
  location     = {Hong Kong, China},
  title        = {{Divided attention: Unsupervised multi-object discovery with contextually separated slots}},
  year         = {2024},
}

@unpublished{14333,
  abstract     = {As causal ground truth is incredibly rare, causal discovery algorithms are
commonly only evaluated on simulated data. This is concerning, given that
simulations reflect common preconceptions about generating processes regarding
noise distributions, model classes, and more. In this work, we propose a novel
method for falsifying the output of a causal discovery algorithm in the absence
of ground truth. Our key insight is that while statistical learning seeks
stability across subsets of data points, causal learning should seek stability
across subsets of variables. Motivated by this insight, our method relies on a
notion of compatibility between causal graphs learned on different subsets of
variables. We prove that detecting incompatibilities can falsify wrongly
inferred causal relations due to violation of assumptions or errors from finite
sample effects. Although passing such compatibility tests is only a necessary
criterion for good performance, we argue that it provides strong evidence for
the causal models whenever compatibility entails strong implications for the
joint distribution. We also demonstrate experimentally that detection of
incompatibilities can aid in causal model selection.},
  author       = {Faller, Philipp M. and Vankadara, Leena Chennuru and Mastakouri, Atalanti A. and Locatello, Francesco and Janzing, Dominik},
  booktitle    = {arXiv},
  title        = {{Self-compatibility: Evaluating causal discovery without ground truth}},
  doi          = {10.48550/arXiv.2307.09552},
  year         = {2023},
}

@unpublished{14946,
  abstract     = {We present a unified framework for studying the identifiability of
representations learned from simultaneously observed views, such as different
data modalities. We allow a partially observed setting in which each view
constitutes a nonlinear mixture of a subset of underlying latent variables,
which can be causally related. We prove that the information shared across all
subsets of any number of views can be learned up to a smooth bijection using
contrastive learning and a single encoder per view. We also provide graphical
criteria indicating which latent variables can be identified through a simple
set of rules, which we refer to as identifiability algebra. Our general
framework and theoretical results unify and extend several previous works on
multi-view nonlinear ICA, disentanglement, and causal representation learning.
We experimentally validate our claims on numerical, image, and multi-modal data
sets. Further, we demonstrate that the performance of prior methods is
recovered in different special cases of our setup. Overall, we find that access
to multiple partial views enables us to identify a more fine-grained
representation, under the generally milder assumption of partial observability.},
  author       = {Yao, Dingling and Xu, Danru and Lachapelle, Sébastien and Magliacane, Sara and Taslakian, Perouz and Martius, Georg and von Kügelgen, Julius and Locatello, Francesco},
  booktitle    = {arXiv},
  title        = {{Multi-view causal representation learning with partial observability}},
  doi          = {10.48550/arXiv.2311.04056},
  year         = {2023},
}

@unpublished{14948,
  abstract     = {The extraction of modular object-centric representations for downstream tasks
is an emerging area of research. Learning grounded representations of objects
that are guaranteed to be stable and invariant promises robust performance
across different tasks and environments. Slot Attention (SA) learns
object-centric representations by assigning objects to \textit{slots}, but
presupposes a \textit{single} distribution from which all slots are randomly
initialised. This results in an inability to learn \textit{specialized} slots
which bind to specific object types and remain invariant to identity-preserving
changes in object appearance. To address this, we present
\emph{\textsc{Co}nditional \textsc{S}lot \textsc{A}ttention} (\textsc{CoSA})
using a novel concept of \emph{Grounded Slot Dictionary} (GSD) inspired by
vector quantization. Our proposed GSD comprises (i) canonical object-level
property vectors and (ii) parametric Gaussian distributions, which define a
prior over the slots. We demonstrate the benefits of our method in multiple
downstream tasks such as scene generation, composition, and task adaptation,
whilst remaining competitive with SA in popular object discovery benchmarks.},
  author       = {Kori, Avinash and Locatello, Francesco and Ribeiro, Fabio De Sousa and Toni, Francesca and Glocker, Ben},
  booktitle    = {arXiv},
  title        = {{Grounded object centric learning}},
  doi          = {10.48550/arXiv.2307.09437},
  year         = {2023},
}

@article{14949,
  abstract     = {Many approaches have been proposed to use diffusion models to augment training datasets for downstream tasks, such as classification. However, diffusion models are themselves trained on large datasets, often with noisy annotations, and it remains an open question to which extent these models contribute to downstream classification performance. In particular, it remains unclear if they generalize enough to improve over directly using the additional data of their pre-training process for augmentation. We systematically evaluate a range of existing methods to generate images from diffusion models and study new extensions to assess their benefit for data augmentation. Personalizing diffusion models towards the target data outperforms simpler prompting strategies. However, using the pre-training data of the diffusion model alone, via a simple nearest-neighbor retrieval procedure, leads to even stronger downstream performance. Our study explores the potential of diffusion models in generating new training data, and surprisingly finds that these sophisticated models are not yet able to beat a simple and strong image retrieval baseline on simple downstream vision tasks.},
  author       = {Burg, Max and Wenzel, Florian and Zietlow, Dominik and Horn, Max and Makansi, Osama and Locatello, Francesco and Russell, Chris},
  issn         = {2835-8856},
  journal      = {Journal of Machine Learning Research},
  publisher    = {ML Research Press},
  title        = {{Image retrieval outperforms diffusion models on data augmentation}},
  year         = {2023},
}

@unpublished{14952,
  abstract     = {While different neural models often exhibit latent spaces that are alike when exposed to semantically related data, this intrinsic similarity is not always immediately discernible. Towards a better understanding of this phenomenon, our work shows how representations learned from these neural modules can be translated between different pre-trained networks via simpler transformations than previously thought. An advantage of this approach is the ability to
estimate these transformations using standard, well-understood algebraic procedures that have closed-form solutions. Our method directly estimates a transformation between two given latent spaces, thereby enabling effective stitching of encoders and decoders without additional training. We extensively validate the adaptability of this translation procedure in different
experimental settings: across various trainings, domains, architectures (e.g., ResNet, CNN, ViT), and in multiple downstream tasks (classification, reconstruction). Notably, we show how it is possible to zero-shot stitch text encoders and vision decoders, or vice-versa, yielding surprisingly good classification performance in this multimodal setting.},
  author       = {Maiorca, Valentino and Moschella, Luca and Norelli, Antonio and Fumero, Marco and Locatello, Francesco and Rodolà, Emanuele},
  booktitle    = {arXiv},
  title        = {{Latent space translation via semantic alignment}},
  doi          = {10.48550/arXiv.2311.00664},
  year         = {2023},
}

@unpublished{14953,
  abstract     = {This paper provides statistical sample complexity bounds for score-matching and its applications in causal discovery. We demonstrate that accurate estimation of the score function is achievable by training a standard deep ReLU neural network using stochastic gradient descent. We establish bounds on the error rate of recovering causal relationships using the score-matching-based causal discovery method of Rolland et al. [2022], assuming a sufficiently good estimation of the score function. Finally, we analyze the upper bound of score-matching estimation within the score-based generative modeling, which has been applied for causal discovery but is also of independent interest within the domain of generative models.},
  author       = {Zhu, Zhenyu and Locatello, Francesco and Cevher, Volkan},
  booktitle    = {arXiv},
  title        = {{Sample complexity bounds for score-matching: Causal discovery and generative modeling}},
  doi          = {10.48550/arXiv.2310.18123},
  year         = {2023},
}

@unpublished{14954,
  abstract     = {When domain knowledge is limited and experimentation is restricted by ethical, financial, or time constraints, practitioners turn to observational causal discovery methods to recover the causal structure, exploiting the statistical properties of their data. Because causal discovery without further assumptions is an ill-posed problem, each algorithm comes with its own set of
usually untestable assumptions, some of which are hard to meet in real datasets. Motivated by these considerations, this paper extensively benchmarks the empirical performance of recent causal discovery methods on observational i.i.d. data generated under different background conditions, allowing for violations of the critical assumptions required by each selected approach. Our experimental findings show that score matching-based methods demonstrate
surprising performance in the false positive and false negative rate of the inferred graph in these challenging scenarios, and we provide theoretical insights into their performance. This work is also the first effort to benchmark the stability of causal discovery algorithms with respect to the values of their hyperparameters. Finally, we hope this paper will set a new standard for the evaluation of causal discovery methods and can serve as an accessible entry point for practitioners interested in the field, highlighting the empirical implications of different algorithm choices.},
  author       = {Montagna, Francesco and Mastakouri, Atalanti A. and Eulig, Elias and Noceti, Nicoletta and Rosasco, Lorenzo and Janzing, Dominik and Aragam, Bryon and Locatello, Francesco},
  booktitle    = {arXiv},
  title        = {{Assumption violations in causal discovery and the robustness of score matching}},
  doi          = {10.48550/arXiv.2310.13387},
  year         = {2023},
}

@inproceedings{14958,
  abstract     = {Causal representation learning (CRL) aims at identifying high-level causal variables from low-level data, e.g. images. Current methods usually assume that all causal variables are captured in the high-dimensional observations. In this work, we focus on learning causal representations from data under partial observability, i.e., when some of the causal variables are not observed in the measurements, and the set of masked variables changes across the different samples. We introduce some initial theoretical results for identifying causal variables under partial observability by exploiting a sparsity regularizer, focusing in particular on the linear and piecewise linear mixing function case. We provide a theorem that allows us to identify the causal variables up to permutation and element-wise linear transformations in the linear case and a lemma that allows us to identify causal variables up to linear transformation in the piecewise case. Finally, we provide a conjecture that would allow us to identify the causal variables up to permutation and element-wise linear transformations also in the piecewise linear case. We test the theorem and conjecture on simulated data, showing the effectiveness of our method.},
  author       = {Xu, Danru and Yao, Dingling and Lachapelle, Sébastien and Taslakian, Perouz and von Kügelgen, Julius and Locatello, Francesco and Magliacane, Sara},
  booktitle    = {Causal Representation Learning Workshop at NeurIPS 2023},
  location     = {New Orleans, LA, United States},
  publisher    = {OpenReview},
  title        = {{A sparsity principle for partially observable causal representation learning}},
  year         = {2023},
}

@unpublished{14961,
  abstract     = {The use of simulated data in the field of causal discovery is ubiquitous due to the scarcity of annotated real data. Recently, Reisach et al., 2021 highlighted the emergence of patterns in simulated linear data, which displays increasing marginal variance in the causal direction. As an ablation in their experiments, Montagna et al., 2023 found that similar patterns may emerge in
nonlinear models for the variance of the score vector $\nabla \log p_{\mathbf{X}}$, and introduced the ScoreSort algorithm. In this work, we formally define and characterize this score-sortability pattern of nonlinear additive noise models. We find that it defines a class of identifiable (bivariate) causal models overlapping with nonlinear additive noise models. We
theoretically demonstrate the advantages of ScoreSort in terms of statistical efficiency compared to prior state-of-the-art score matching-based methods and empirically show the score-sortability of the most common synthetic benchmarks in the literature. Our findings remark (1) the lack of diversity in the data as an important limitation in the evaluation of nonlinear causal discovery approaches, (2) the importance of thoroughly testing different settings within a problem class, and (3) the importance of analyzing statistical properties in
causal discovery, where research is often limited to defining identifiability conditions of the model. },
  author       = {Montagna, Francesco and Noceti, Nicoletta and Rosasco, Lorenzo and Locatello, Francesco},
  booktitle    = {arXiv},
  title        = {{Shortcuts for causal discovery of nonlinear models by score matching}},
  doi          = {10.48550/arXiv.2310.14246},
  year         = {2023},
}

@unpublished{14962,
  abstract     = {In this paper, we show that recent advances in video representation learning
and pre-trained vision-language models allow for substantial improvements in
self-supervised video object localization. We propose a method that first
localizes objects in videos via a slot attention approach and then assigns text
to the obtained slots. The latter is achieved by an unsupervised way to read
localized semantic information from the pre-trained CLIP model. The resulting
video object localization is entirely unsupervised apart from the implicit
annotation contained in CLIP, and it is effectively the first unsupervised
approach that yields good results on regular video benchmarks.},
  author       = {Fan, Ke and Bai, Zechen and Xiao, Tianjun and Zietlow, Dominik and Horn, Max and Zhao, Zixu and Simon-Gabriel, Carl-Johann and Shou, Mike Zheng and Locatello, Francesco and Schiele, Bernt and Brox, Thomas and Zhang, Zheng and Fu, Yanwei and He, Tong},
  booktitle    = {arXiv},
  title        = {{Unsupervised open-vocabulary object localization in videos}},
  doi          = {10.48550/arXiv.2309.09858},
  year         = {2023},
}

@unpublished{14963,
  abstract     = {Unsupervised object-centric learning methods allow the partitioning of scenes
into entities without additional localization information and are excellent
candidates for reducing the annotation burden of multiple-object tracking (MOT)
pipelines. Unfortunately, they lack two key properties: objects are often split
into parts and are not consistently tracked over time. In fact,
state-of-the-art models achieve pixel-level accuracy and temporal consistency
by relying on supervised object detection with additional ID labels for the
association through time. This paper proposes a video object-centric model for
MOT. It consists of an index-merge module that adapts the object-centric slots
into detection outputs and an object memory module that builds complete object
prototypes to handle occlusions. Benefited from object-centric learning, we
only require sparse detection labels (0\%-6.25\%) for object localization and
feature binding. Relying on our self-supervised
Expectation-Maximization-inspired loss for object association, our approach
requires no ID labels. Our experiments significantly narrow the gap between the
existing object-centric model and the fully supervised state-of-the-art and
outperform several unsupervised trackers.},
  author       = {Zhao, Zixu and Wang, Jiaze and Horn, Max and Ding, Yizhuo and He, Tong and Bai, Zechen and Zietlow, Dominik and Simon-Gabriel, Carl-Johann and Shuai, Bing and Tu, Zhuowen and Brox, Thomas and Schiele, Bernt and Fu, Yanwei and Locatello, Francesco and Zhang, Zheng and Xiao, Tianjun},
  booktitle    = {arXiv},
  title        = {{Object-centric multiple object tracking}},
  doi          = {10.48550/arXiv.2309.00233},
  year         = {2023},
}

@inproceedings{14105,
  abstract     = {Despite their recent success, deep neural networks continue to perform poorly when they encounter distribution shifts at test time. Many recently proposed approaches try to counter this by aligning the model to the new distribution prior to inference. With no labels available this requires unsupervised objectives to adapt the model on the observed test data. In this paper, we propose Test-Time Self-Training (TeST): a technique that takes as input a model trained on some source data and a novel data distribution at test time, and learns invariant and robust representations using a student-teacher framework. We find that models adapted using TeST significantly improve over baseline test-time adaptation algorithms. TeST achieves competitive performance to modern domain adaptation algorithms [4, 43], while having access to 5-10x less data at time of adaption. We thoroughly evaluate a variety of baselines on two tasks:
object detection and image segmentation and find that models adapted with TeST. We find that TeST sets the new state-of-the-art for test-time domain adaptation algorithms. },
  author       = {Sinha, Samarth and Gehler, Peter and Locatello, Francesco and Schiele, Bernt},
  booktitle    = {2023 IEEE/CVF Winter Conference on Applications of Computer Vision},
  isbn         = {9781665493475},
  issn         = {2642-9381},
  location     = {Waikoloa, HI, United States},
  publisher    = {Institute of Electrical and Electronics Engineers},
  title        = {{TeST: Test-time Self-Training under distribution shift}},
  doi          = {10.1109/wacv56688.2023.00278},
  year         = {2023},
}

@unpublished{14207,
  abstract     = {The binding problem in human cognition, concerning how the brain represents and connects objects within a fixed network of neural connections, remains a subject of intense debate. Most machine learning efforts addressing this issue in an unsupervised setting have focused on slot-based methods, which may be limiting due to their discrete nature and difficulty to express uncertainty. Recently, the Complex AutoEncoder was proposed as an alternative that learns continuous and distributed object-centric representations. However, it is only applicable to simple toy data. In this paper, we present Rotating Features, a generalization of complex-valued features to higher dimensions, and a new evaluation procedure for extracting objects from distributed representations. Additionally, we show the applicability of our approach to pre-trained features. Together, these advancements enable us to scale distributed object-centric representations from simple toy to real-world data. We believe this work advances a new paradigm for addressing the binding problem in machine learning and has the potential to inspire further innovation in the field.},
  author       = {Löwe, Sindy and Lippe, Phillip and Locatello, Francesco and Welling, Max},
  booktitle    = {arXiv},
  title        = {{Rotating features for object discovery}},
  doi          = {10.48550/arXiv.2306.00600},
  year         = {2023},
}

@inproceedings{14208,
  abstract     = {This paper focuses on over-parameterized deep neural networks (DNNs) with ReLU activation functions and proves that when the data distribution is well-separated, DNNs can achieve Bayes-optimal test error for classification while obtaining (nearly) zero-training error under the lazy training regime. For this purpose, we unify three interrelated concepts of overparameterization, benign overfitting, and the Lipschitz constant of DNNs. Our results indicate that interpolating with smoother functions leads to better generalization. Furthermore, we investigate the special case where interpolating smooth ground-truth functions is performed by DNNs under the Neural Tangent Kernel (NTK) regime for generalization. Our result demonstrates that the generalization error converges to a constant order that only depends on label noise and initialization noise, which theoretically verifies benign overfitting. Our analysis provides a tight lower bound on the normalized margin under non-smooth activation functions, as well as the minimum eigenvalue of NTK under high-dimensional settings, which has its own interest in learning theory.},
  author       = {Zhu, Zhenyu and Liu, Fanghui and Chrysos, Grigorios G. and Locatello, Francesco and Cevher, Volkan},
  booktitle    = {Proceedings of the 40th International Conference on Machine Learning},
  location     = {Honolulu, Hawaii, United States},
  pages        = {43105--43128},
  publisher    = {ML Research Press},
  title        = {{Benign overfitting in deep neural networks under lazy training}},
  volume       = {202},
  year         = {2023},
}

@unpublished{14209,
  abstract     = {Diffusion models excel at generating photorealistic images from text-queries. Naturally, many approaches have been proposed to use these generative abilities to augment training datasets for downstream tasks, such as classification. However, diffusion models are themselves trained on large noisily supervised, but nonetheless, annotated datasets. It is an open question whether the generalization capabilities of diffusion models beyond using the additional data of the pre-training process for augmentation lead to improved downstream performance. We perform a systematic evaluation of existing methods to generate images from diffusion models and study new extensions to assess their benefit for data augmentation. While we find that personalizing diffusion models towards the target data outperforms simpler prompting strategies, we also show that using the training data of the diffusion model alone, via a simple nearest neighbor retrieval procedure, leads to even stronger downstream performance. Overall, our study probes the limitations of diffusion models for data augmentation but also highlights its potential in generating new training data to improve performance on simple downstream vision tasks.},
  author       = {Burg, Max F. and Wenzel, Florian and Zietlow, Dominik and Horn, Max and Makansi, Osama and Locatello, Francesco and Russell, Chris},
  booktitle    = {arXiv},
  title        = {{A data augmentation perspective on diffusion models and retrieval}},
  doi          = {10.48550/arXiv.2304.10253},
  year         = {2023},
}

@unpublished{14210,
  abstract     = {Recovering the latent factors of variation of high dimensional data has so far focused on simple synthetic settings. Mostly building on unsupervised and weakly-supervised objectives, prior work missed out on the positive implications for representation learning on real world data. In this work, we propose to leverage knowledge extracted from a diversified set of supervised tasks to learn a common disentangled representation. Assuming each supervised task only depends on an unknown subset of the factors of variation, we disentangle the feature space of a supervised multi-task model, with features activating sparsely across different tasks and information being shared as appropriate. Importantly, we never directly observe the factors of variations but establish that access to multiple tasks is sufficient for identifiability under sufficiency and minimality assumptions. We validate our approach on six real world distribution shift benchmarks, and different data modalities (images, text), demonstrating how disentangled representations can be transferred to real settings.},
  author       = {Fumero, Marco and Wenzel, Florian and Zancato, Luca and Achille, Alessandro and Rodolà, Emanuele and Soatto, Stefano and Schölkopf, Bernhard and Locatello, Francesco},
  booktitle    = {arXiv},
  title        = {{Leveraging sparse and shared feature activations for disentangled representation learning}},
  doi          = {10.48550/arXiv.2304.07939},
  year         = {2023},
}

@inproceedings{14211,
  abstract     = {Causal discovery methods are intrinsically constrained by the set of assumptions needed to ensure structure identifiability. Moreover additional restrictions are often imposed in order to simplify the inference task: this is the case for the Gaussian noise assumption on additive non-linear models, which is common to many causal discovery approaches. In this paper we show the shortcomings of inference under this hypothesis, analyzing the risk of edge inversion under violation of Gaussianity of the noise terms. Then, we propose a novel method for inferring the topological ordering of the variables in the causal graph, from data generated according to an additive non-linear model with a generic noise distribution. This leads to NoGAM (Not only Gaussian Additive noise Models), a causal discovery algorithm with a minimal set of assumptions and state of the art performance, experimentally benchmarked on synthetic data.},
  author       = {Montagna, Francesco and Noceti, Nicoletta and Rosasco, Lorenzo and Zhang, Kun and Locatello, Francesco},
  booktitle    = {2nd Conference on Causal Learning and Reasoning},
  location     = {Tübingen, Germany},
  title        = {{Causal discovery with score matching on additive models with arbitrary noise}},
  year         = {2023},
}

@inproceedings{14212,
  abstract     = {This paper demonstrates how to discover the whole causal graph from the second derivative of the log-likelihood in observational non-linear additive Gaussian noise models. Leveraging scalable machine learning approaches to approximate the score function $\nabla \log p(X)$, we extend the work of Rolland et al. (2022) that only recovers the topological order from the score and requires an expensive pruning step removing spurious edges among those admitted by the ordering. Our analysis leads to DAS (acronym for Discovery At Scale), a practical algorithm that reduces the complexity of the pruning by a factor proportional to the graph size. In practice, DAS achieves competitive accuracy with current state-of-the-art while being over an order of magnitude faster. Overall, our approach enables principled and scalable causal discovery, significantly lowering the compute bar.},
  author       = {Montagna, Francesco and Noceti, Nicoletta and Rosasco, Lorenzo and Zhang, Kun and Locatello, Francesco},
  booktitle    = {2nd Conference on Causal Learning and Reasoning},
  location     = {Tübingen, Germany},
  title        = {{Scalable causal discovery with score matching}},
  year         = {2023},
}

@inproceedings{14214,
  abstract     = {Recent years have seen a surge of interest in learning high-level causal representations from low-level image pairs under interventions. Yet, existing efforts are largely limited to simple synthetic settings that are far away from real-world problems. In this paper, we present Causal Triplet, a causal representation learning benchmark featuring not only visually more complex scenes, but also two crucial desiderata commonly overlooked in previous works: (i) an actionable counterfactual setting, where only certain object-level variables allow for counterfactual observations whereas others do not; (ii) an interventional downstream task with an emphasis on out-of-distribution robustness from the independent causal mechanisms principle. Through extensive experiments, we find that models built with the knowledge of disentangled or object-centric representations significantly outperform their distributed counterparts. However, recent causal representation learning methods still struggle to identify such latent structures, indicating substantial challenges and opportunities for future work.},
  author       = {Liu, Yuejiang and Alahi, Alexandre and Russell, Chris and Horn, Max and Zietlow, Dominik and Schölkopf, Bernhard and Locatello, Francesco},
  booktitle    = {2nd Conference on Causal Learning and Reasoning},
  location     = {Tübingen, Germany},
  title        = {{Causal triplet: An open challenge for intervention-centric causal representation learning}},
  year         = {2023},
}

