@inproceedings{14176,
  abstract     = {Intensive care units (ICU) are increasingly looking towards machine learning for methods to provide online monitoring of critically ill patients. In machine learning, online monitoring is often formulated as a supervised learning problem. Recently, contrastive learning approaches have demonstrated promising improvements over competitive supervised benchmarks. These methods rely on well-understood data augmentation techniques developed for image data which do not apply to online monitoring. In this work, we overcome this limitation by
supplementing time-series data augmentation techniques with a novel contrastive
learning objective which we call neighborhood contrastive learning (NCL). Our objective explicitly groups together contiguous time segments from each patient while maintaining state-specific information. Our experiments demonstrate a marked improvement over existing work applying contrastive methods to medical time-series.},
  author       = {Yèche, Hugo and Dresdner, Gideon and Locatello, Francesco and Hüser, Matthias and Rätsch, Gunnar},
  booktitle    = {Proceedings of the 38th International Conference on Machine Learning},
  location     = {Virtual},
  pages        = {11964--11974},
  publisher    = {ML Research Press},
  title        = {{Neighborhood contrastive learning applied to online patient monitoring}},
  volume       = {139},
  year         = {2021},
}

@inproceedings{14177,
  abstract     = {The focus of disentanglement approaches has been on identifying independent factors of variation in data. However, the causal variables underlying real-world observations are often not statistically independent. In this work, we bridge the gap to real-world scenarios by analyzing the behavior of the most prominent disentanglement approaches on correlated data in a large-scale empirical study (including 4260 models). We show and quantify that systematically induced correlations in the dataset are being learned and reflected in the latent representations, which has implications for downstream applications of disentanglement such as fairness. We also demonstrate how to resolve these latent correlations, either using weak supervision during
training or by post-hoc correcting a pre-trained model with a small number of labels.},
  author       = {Träuble, Frederik and Creager, Elliot and Kilbertus, Niki and Locatello, Francesco and Dittadi, Andrea and Goyal, Anirudh and Schölkopf, Bernhard and Bauer, Stefan},
  booktitle    = {Proceedings of the 38th International Conference on Machine Learning},
  location     = {Virtual},
  pages        = {10401--10412},
  publisher    = {ML Research Press},
  title        = {{On disentangled representations learned from correlated data}},
  volume       = {139},
  year         = {2021},
}

@inproceedings{14178,
  abstract     = {Learning meaningful representations that disentangle the underlying structure of the data generating process is considered to be of key importance in machine learning. While disentangled representations were found to be useful for diverse tasks such as abstract reasoning and fair classification, their scalability and real-world impact remain questionable. We introduce a new high-resolution dataset with 1M simulated images and over 1,800 annotated real-world images of the same setup. In contrast to previous work, this new dataset exhibits correlations, a complex underlying structure, and allows to evaluate transfer to unseen simulated and real-world settings where the encoder i) remains in distribution or ii) is out of distribution. We propose new architectures in order to scale disentangled representation learning to realistic high-resolution settings and conduct a large-scale empirical study of disentangled representations on this dataset. We observe that disentanglement is a good predictor for out-of-distribution (OOD) task performance.},
  author       = {Dittadi, Andrea and Träuble, Frederik and Locatello, Francesco and Wüthrich, Manuel and Agrawal, Vaibhav and Winther, Ole and Bauer, Stefan and Schölkopf, Bernhard},
  booktitle    = {9th International Conference on Learning Representations},
  location     = {Virtual},
  title        = {{On the transfer of disentangled representations in realistic settings}},
  year         = {2021},
}

@inproceedings{14179,
  abstract     = {Self-supervised representation learning has shown remarkable success in a number of domains. A common practice is to perform data augmentation via hand-crafted transformations intended to leave the semantics of the data invariant. We seek to understand the empirical success of this approach from a theoretical perspective. We formulate the augmentation process as a latent variable model by postulating a partition of the latent representation into a content component, which is assumed invariant to augmentation, and a style component, which is allowed to change. Unlike prior work on disentanglement and independent component analysis, we allow for both nontrivial statistical and causal dependencies in the latent space. We study the identifiability of the latent representation based on pairs of views of the observations and prove sufficient conditions that allow us to identify the invariant content partition up to an invertible mapping in both generative and discriminative settings. We find numerical simulations with dependent latent variables are consistent with our theory. Lastly, we introduce Causal3DIdent, a dataset of high-dimensional, visually complex images with rich causal dependencies, which we use to study the effect of data augmentations performed in practice.},
  author       = {Kügelgen, Julius von and Sharma, Yash and Gresele, Luigi and Brendel, Wieland and Schölkopf, Bernhard and Besserve, Michel and Locatello, Francesco},
  booktitle    = {Advances in Neural Information Processing Systems},
  isbn         = {9781713845393},
  location     = {Virtual},
  pages        = {16451--16467},
  title        = {{Self-supervised learning with data augmentations provably isolates content from style}},
  volume       = {34},
  year         = {2021},
}

@inproceedings{14180,
  abstract     = {Modern neural network architectures can leverage large amounts of data to generalize well within the training distribution. However, they are less capable of systematic generalization to data drawn from unseen but related distributions, a feat that is hypothesized to require compositional reasoning and reuse of knowledge. In this work, we present Neural Interpreters, an architecture that factorizes inference in a self-attention network as a system of modules, which we call \emph{functions}. Inputs to the model are routed through a sequence of functions in a way that is end-to-end learned. The proposed architecture can flexibly compose computation along width and depth, and lends itself well to capacity extension after training. To demonstrate the versatility of Neural Interpreters, we evaluate it in two distinct settings: image classification and visual abstract reasoning on Raven Progressive Matrices. In the former, we show that Neural Interpreters perform on par with the vision transformer using fewer parameters, while being transferrable to a new task in a sample efficient manner. In the latter, we find that Neural Interpreters are competitive with respect to the state-of-the-art in terms of systematic generalization. },
  author       = {Rahaman, Nasim and Gondal, Muhammad Waleed and Joshi, Shruti and Gehler, Peter and Bengio, Yoshua and Locatello, Francesco and Schölkopf, Bernhard},
  booktitle    = {Advances in Neural Information Processing Systems},
  isbn         = {9781713845393},
  location     = {Virtual},
  pages        = {10985--10998},
  title        = {{Dynamic inference with neural interpreters}},
  volume       = {34},
  year         = {2021},
}

@inproceedings{14181,
  abstract     = {Variational Inference makes a trade-off between the capacity of the variational family and the tractability of finding an approximate posterior distribution. Instead, Boosting Variational Inference allows practitioners to obtain increasingly good posterior approximations by spending more compute. The main obstacle to widespread adoption of Boosting Variational Inference is the amount of resources necessary to improve over a strong Variational Inference baseline. In our work, we trace this limitation back to the global curvature of the KL-divergence. We characterize how the global curvature impacts time and memory consumption, address the problem with the notion of local curvature, and provide a novel approximate backtracking algorithm for estimating local curvature. We give new theoretical convergence rates for our algorithms and provide experimental validation on synthetic and real-world datasets.},
  author       = {Dresdner, Gideon and Shekhar, Saurav and Pedregosa, Fabian and Locatello, Francesco and Rätsch, Gunnar},
  booktitle    = {Proceedings of the 30th International Joint Conference on Artificial Intelligence},
  location     = {Montreal, Canada},
  pages        = {2337--2343},
  publisher    = {International Joint Conferences on Artificial Intelligence},
  title        = {{Boosting variational inference with locally adaptive step-sizes}},
  doi          = {10.24963/ijcai.2021/322},
  year         = {2021},
}

@inproceedings{14182,
  abstract     = {When machine learning systems meet real world applications, accuracy is only
one of several requirements. In this paper, we assay a complementary
perspective originating from the increasing availability of pre-trained and
regularly improving state-of-the-art models. While new improved models develop
at a fast pace, downstream tasks vary more slowly or stay constant. Assume that
we have a large unlabelled data set for which we want to maintain accurate
predictions. Whenever a new and presumably better ML model becomes available,
we encounter two problems: (i) given a limited budget, which data points should
be re-evaluated using the new model?; and (ii) if the new predictions differ
from the current ones, should we update? Problem (i) is about compute cost,
which matters for very large data sets and models. Problem (ii) is about
maintaining consistency of the predictions, which can be highly relevant for
downstream applications; our demand is to avoid negative flips, i.e., changing
correct to incorrect predictions. In this paper, we formalize the Prediction
Update Problem and present an efficient probabilistic approach as an answer to the
above questions. In extensive experiments on standard classification benchmark
data sets, we show that our method outperforms alternative strategies along key
metrics for backward-compatible prediction updates.},
  author       = {Träuble, Frederik and Kügelgen, Julius von and Kleindessner, Matthäus and Locatello, Francesco and Schölkopf, Bernhard and Gehler, Peter},
  booktitle    = {Advances in Neural Information Processing Systems},
  isbn         = {9781713845393},
  location     = {Virtual},
  pages        = {116--128},
  title        = {{Backward-compatible prediction updates: A probabilistic approach}},
  volume       = {34},
  year         = {2021},
}

@unpublished{14221,
  abstract     = {The world is structured in countless ways. It may be prudent to enforce corresponding structural properties to a learning algorithm's solution, such as incorporating prior beliefs, natural constraints, or causal structures. Doing so may translate to faster, more accurate, and more flexible models, which may directly relate to real-world impact. In this dissertation, we consider two different research areas that concern structuring a learning algorithm's solution: when the structure is known and when it has to be discovered.},
  author       = {Locatello, Francesco},
  note         = {arXiv:2111.13693},
  title        = {{Enforcing and discovering structure in machine learning}},
  doi          = {10.48550/arXiv.2111.13693},
  year         = {2021},
}

@inproceedings{14326,
  abstract     = {Learning object-centric representations of complex scenes is a promising step towards enabling efficient abstract reasoning from low-level perceptual features. Yet, most deep learning approaches learn distributed representations that do not capture the compositional properties of natural scenes. In this paper, we present the Slot Attention module, an architectural component that interfaces with perceptual representations such as the output of a convolutional neural network and produces a set of task-dependent abstract representations which we call slots. These slots are exchangeable and can bind to any object in the input by specializing through a competitive procedure over multiple rounds of attention. We empirically demonstrate that Slot Attention can extract object-centric representations that enable generalization to unseen compositions when trained on unsupervised object discovery and supervised property prediction tasks.},
  author       = {Locatello, Francesco and Weissenborn, Dirk and Unterthiner, Thomas and Mahendran, Aravindh and Heigold, Georg and Uszkoreit, Jakob and Dosovitskiy, Alexey and Kipf, Thomas},
  booktitle    = {Advances in Neural Information Processing Systems},
  isbn         = {9781713829546},
  location     = {Virtual},
  pages        = {11525--11538},
  publisher    = {Curran Associates},
  title        = {{Object-centric learning with slot attention}},
  volume       = {33},
  year         = {2020},
}

@article{14125,
  abstract     = {Motivation: Recent technological advances have led to an increase in the production and availability of single-cell data. The ability to integrate a set of multi-technology measurements would allow the identification of biologically or clinically meaningful observations through the unification of the perspectives afforded by each technology. In most cases, however, profiling technologies consume the used cells and thus pairwise correspondences between datasets are lost. Due to the sheer size single-cell datasets can acquire, scalable algorithms that are able to universally match single-cell measurements carried out in one cell to its corresponding sibling in another technology are needed.
Results: We propose Single-Cell data Integration via Matching (SCIM), a scalable approach to recover such correspondences in two or more technologies. SCIM assumes that cells share a common (low-dimensional) underlying structure and that the underlying cell distribution is approximately constant across technologies. It constructs a technology-invariant latent space using an autoencoder framework with an adversarial objective. Multi-modal datasets are integrated by pairing cells across technologies using a bipartite matching scheme that operates on the low-dimensional latent representations. We evaluate SCIM on a simulated cellular branching process and show that the cell-to-cell matches derived by SCIM reflect the same pseudotime on the simulated dataset. Moreover, we apply our method to two real-world scenarios, a melanoma tumor sample and a human bone marrow sample, where we pair cells from a scRNA dataset to their sibling cells in a CyTOF dataset achieving 90% and 78% cell-matching accuracy for each one of the samples, respectively.},
  author       = {Stark, Stefan G and Ficek, Joanna and Locatello, Francesco and Bonilla, Ximena and Chevrier, Stéphane and Singer, Franziska and Aebersold, Rudolf and Al-Quaddoomi, Faisal S and Albinus, Jonas and Alborelli, Ilaria and Andani, Sonali and Attinger, Per-Olof and Bacac, Marina and Baumhoer, Daniel and Beck-Schimmer, Beatrice and Beerenwinkel, Niko and Beisel, Christian and Bernasconi, Lara and Bertolini, Anne and Bodenmiller, Bernd and Bonilla, Ximena and Casanova, Ruben and Chevrier, Stéphane and Chicherova, Natalia and D'Costa, Maya and Danenberg, Esther and Davidson, Natalie and Drăgan, Monica-Andreea and Dummer, Reinhard and Engler, Stefanie and Erkens, Martin and Eschbach, Katja and Esposito, Cinzia and Fedier, André and Ferreira, Pedro and Ficek, Joanna and Frei, Anja L and Frey, Bruno and Goetze, Sandra and Grob, Linda and Gut, Gabriele and Günther, Detlef and Haberecker, Martina and Haeuptle, Pirmin and Heinzelmann-Schwarz, Viola and Herter, Sylvia and Holtackers, Rene and Huesser, Tamara and Irmisch, Anja and Jacob, Francis and Jacobs, Andrea and Jaeger, Tim M and Jahn, Katharina and James, Alva R and Jermann, Philip M and Kahles, André and Kahraman, Abdullah and Koelzer, Viktor H and Kuebler, Werner and Kuipers, Jack and Kunze, Christian P and Kurzeder, Christian and Lehmann, Kjong-Van and Levesque, Mitchell and Lugert, Sebastian and Maass, Gerd and Manz, Markus and Markolin, Philipp and Mena, Julien and Menzel, Ulrike and Metzler, Julian M and Miglino, Nicola and Milani, Emanuela S and Moch, Holger and Muenst, Simone and Murri, Riccardo and Ng, Charlotte KY and Nicolet, Stefan and Nowak, Marta and Pedrioli, Patrick GA and Pelkmans, Lucas and Piscuoglio, Salvatore and Prummer, Michael and Ritter, Mathilde and Rommel, Christian and Rosano-González, María L and Rätsch, Gunnar and Santacroce, Natascha and Castillo, Jacobo Sarabia del and Schlenker, Ramona and Schwalie, Petra C and Schwan, Severin and Schär, Tobias and Senti, Gabriela and Singer, Franziska and Sivapatham, Sujana and Snijder, Berend and Sobottka, Bettina and Sreedharan, Vipin T and Stark, Stefan and Stekhoven, Daniel J and Theocharides, Alexandre PA and Thomas, Tinu M and Tolnay, Markus and Tosevski, Vinko and Toussaint, Nora C and Tuncel, Mustafa A and Tusup, Marina and Drogen, Audrey Van and Vetter, Marcus and Vlajnic, Tatjana and Weber, Sandra and Weber, Walter P and Wegmann, Rebekka and Weller, Michael and Wendt, Fabian and Wey, Norbert and Wicki, Andreas and Wollscheid, Bernd and Yu, Shuqing and Ziegler, Johanna and Zimmermann, Marc and Zoche, Martin and Zuend, Gregor and Rätsch, Gunnar and Lehmann, Kjong-Van},
  issn         = {1367-4811},
  journal      = {Bioinformatics},
  keywords     = {Computational Mathematics, Computational Theory and Mathematics, Computer Science Applications, Molecular Biology, Biochemistry, Statistics and Probability},
  number       = {Supplement 2},
  pages        = {i919--i927},
  publisher    = {Oxford University Press},
  title        = {{SCIM: Universal single-cell matching with unpaired feature sets}},
  doi          = {10.1093/bioinformatics/btaa843},
  volume       = {36},
  year         = {2020},
}

@inproceedings{14186,
  abstract     = {The goal of the unsupervised learning of disentangled representations is to
separate the independent explanatory factors of variation in the data without
access to supervision. In this paper, we summarize the results of Locatello et
al., 2019, and focus on their implications for practitioners. We discuss the
theoretical result showing that the unsupervised learning of disentangled
representations is fundamentally impossible without inductive biases and the
practical challenges it entails. Finally, we comment on our experimental
findings, highlighting the limitations of state-of-the-art approaches and
directions for future research.},
  author       = {Locatello, Francesco and Bauer, Stefan and Lucic, Mario and Rätsch, Gunnar and Gelly, Sylvain and Schölkopf, Bernhard and Bachem, Olivier},
  booktitle    = {Proceedings of the 34th AAAI Conference on Artificial Intelligence},
  isbn         = {9781577358350},
  issn         = {2374-3468},
  location     = {New York, NY, United States},
  number       = {9},
  pages        = {13681--13684},
  publisher    = {Association for the Advancement of Artificial Intelligence},
  title        = {{A commentary on the unsupervised learning of disentangled representations}},
  doi          = {10.1609/aaai.v34i09.7120},
  volume       = {34},
  year         = {2020},
}

@inproceedings{14187,
  abstract     = {We propose a novel Stochastic Frank-Wolfe (a.k.a. conditional gradient)
algorithm for constrained smooth finite-sum minimization with a generalized
linear prediction/structure. This class of problems includes empirical risk
minimization with sparse, low-rank, or other structured constraints. The
proposed method is simple to implement, does not require step-size tuning, and
has a constant per-iteration cost that is independent of the dataset size.
Furthermore, as a byproduct of the method we obtain a stochastic estimator of
the Frank-Wolfe gap that can be used as a stopping criterion. Depending on the
setting, the proposed method matches or improves on the best computational
guarantees for Stochastic Frank-Wolfe algorithms. Benchmarks on several
datasets highlight different regimes in which the proposed method exhibits a
faster empirical convergence than related methods. Finally, we provide an
implementation of all considered methods in an open-source package.},
  author       = {Négiar, Geoffrey and Dresdner, Gideon and Tsai, Alicia and Ghaoui, Laurent El and Locatello, Francesco and Freund, Robert M. and Pedregosa, Fabian},
  booktitle    = {Proceedings of the 37th International Conference on Machine Learning},
  location     = {Virtual},
  pages        = {7253--7262},
  title        = {{Stochastic Frank-Wolfe for constrained finite-sum minimization}},
  volume       = {119},
  year         = {2020},
}

@inproceedings{14188,
  abstract     = {Intelligent agents should be able to learn useful representations by
observing changes in their environment. We model such observations as pairs of
non-i.i.d. images sharing at least one of the underlying factors of variation.
First, we theoretically show that only knowing how many factors have changed,
but not which ones, is sufficient to learn disentangled representations.
Second, we provide practical algorithms that learn disentangled representations
from pairs of images without requiring annotation of groups, individual
factors, or the number of factors that have changed. Third, we perform a
large-scale empirical study and show that such pairs of observations are
sufficient to reliably learn disentangled representations on several benchmark
data sets. Finally, we evaluate our learned representations and find that they
are simultaneously useful on a diverse suite of tasks, including generalization
under covariate shifts, fairness, and abstract reasoning. Overall, our results
demonstrate that weak supervision enables learning of useful disentangled
representations in realistic scenarios.},
  author       = {Locatello, Francesco and Poole, Ben and Rätsch, Gunnar and Schölkopf, Bernhard and Bachem, Olivier and Tschannen, Michael},
  booktitle    = {Proceedings of the 37th International Conference on Machine Learning},
  location     = {Virtual},
  pages        = {6348--6359},
  title        = {{Weakly-supervised disentanglement without compromises}},
  volume       = {119},
  year         = {2020},
}

@article{14195,
  abstract     = {The idea behind the unsupervised learning of disentangled representations is that real-world data is generated by a few explanatory factors of variation which can be recovered by unsupervised learning algorithms. In this paper, we provide a sober look at recent progress in the field and challenge some common assumptions. We first theoretically show that the unsupervised learning of disentangled representations is fundamentally impossible without inductive biases on both the models and the data. Then, we train over 14000
 models covering most prominent methods and evaluation metrics in a reproducible large-scale experimental study on eight data sets. We observe that while the different methods successfully enforce properties “encouraged” by the corresponding losses, well-disentangled models seemingly cannot be identified without supervision. Furthermore, different evaluation metrics do not always agree on what should be considered “disentangled” and exhibit systematic differences in the estimation. Finally, increased disentanglement does not seem to necessarily lead to a decreased sample complexity of learning for downstream tasks. Our results suggest that future work on disentanglement learning should be explicit about the role of inductive biases and (implicit) supervision, investigate concrete benefits of enforcing disentanglement of the learned representations, and consider a reproducible experimental setup covering several data sets.},
  author       = {Locatello, Francesco and Bauer, Stefan and Lucic, Mario and Rätsch, Gunnar and Gelly, Sylvain and Schölkopf, Bernhard and Bachem, Olivier},
  journal      = {Journal of Machine Learning Research},
  publisher    = {Microtome Publishing},
  title        = {{A sober look at the unsupervised learning of disentangled representations and their evaluation}},
  volume       = {21},
  year         = {2020},
}

@inproceedings{14184,
  abstract     = {Learning disentangled representations is considered a cornerstone problem in
representation learning. Recently, Locatello et al. (2019) demonstrated that
unsupervised disentanglement learning without inductive biases is theoretically
impossible and that existing inductive biases and unsupervised methods do not
allow to consistently learn disentangled representations. However, in many
practical settings, one might have access to a limited amount of supervision,
for example through manual labeling of (some) factors of variation in a few
training examples. In this paper, we investigate the impact of such supervision
on state-of-the-art disentanglement methods and perform a large scale study,
training over 52000 models under well-defined and reproducible experimental
conditions. We observe that a small number of labeled examples (0.01--0.5\% of
the data set), with potentially imprecise and incomplete labels, is sufficient
to perform model selection on state-of-the-art unsupervised models. Further, we
investigate the benefit of incorporating supervision into the training process.
Overall, we empirically validate that with little and imprecise supervision it
is possible to reliably learn disentangled representations.},
  author       = {Locatello, Francesco and Tschannen, Michael and Bauer, Stefan and Rätsch, Gunnar and Schölkopf, Bernhard and Bachem, Olivier},
  booktitle    = {8th International Conference on Learning Representations},
  location     = {Virtual},
  title        = {{Disentangling factors of variation using few labels}},
  year         = {2020},
}

@inproceedings{14189,
  abstract     = {We consider the problem of recovering a common latent source with independent
components from multiple views. This applies to settings in which a variable is
measured with multiple experimental modalities, and where the goal is to
synthesize the disparate measurements into a single unified representation. We
consider the case that the observed views are a nonlinear mixing of
component-wise corruptions of the sources. When the views are considered
separately, this reduces to nonlinear Independent Component Analysis (ICA) for
which it is provably impossible to undo the mixing. We present novel
identifiability proofs that this is possible when the multiple views are
considered jointly, showing that the mixing can theoretically be undone using
function approximators such as deep neural networks. In contrast to known
identifiability results for nonlinear ICA, we prove that independent latent
sources with arbitrary mixing can be recovered as long as multiple,
sufficiently different noisy views are available.},
  author       = {Gresele, Luigi and Rubenstein, Paul K. and Mehrjou, Arash and Locatello, Francesco and Schölkopf, Bernhard},
  booktitle    = {Proceedings of the 35th Conference on Uncertainty in Artificial Intelligence},
  location     = {Tel Aviv, Israel},
  pages        = {217--227},
  publisher    = {ML Research Press},
  title        = {{The incomplete Rosetta Stone problem: Identifiability results for multi-view nonlinear ICA}},
  volume       = {115},
  year         = {2019},
}

@inproceedings{14190,
  abstract     = {Learning meaningful and compact representations with disentangled semantic
aspects is considered to be of key importance in representation learning. Since
real-world data is notoriously costly to collect, many recent state-of-the-art
disentanglement models have heavily relied on synthetic toy data-sets. In this
paper, we propose a novel data-set which consists of over one million images of
physical 3D objects with seven factors of variation, such as object color,
shape, size and position. In order to be able to control all the factors of
variation precisely, we built an experimental platform where the objects are
being moved by a robotic arm. In addition, we provide two more datasets which
consist of simulations of the experimental setup. These datasets provide for
the first time the possibility to systematically investigate how well different
disentanglement methods perform on real data in comparison to simulation, and
how simulated data can be leveraged to build better representations of the real
world. We provide a first experimental study of these questions and our results
indicate that learned models transfer poorly, but that model and hyperparameter
selection is an effective means of transferring information to the real world.},
  author       = {Gondal, Muhammad Waleed and Wüthrich, Manuel and Miladinović, Đorđe and Locatello, Francesco and Breidt, Martin and Volchkov, Valentin and Akpo, Joel and Bachem, Olivier and Schölkopf, Bernhard and Bauer, Stefan},
  booktitle    = {Advances in Neural Information Processing Systems},
  isbn         = {9781713807933},
  location     = {Vancouver, Canada},
  title        = {{On the transfer of inductive bias from simulation to the real world: a new disentanglement dataset}},
  volume       = {32},
  year         = {2019},
}

@inproceedings{14191,
  abstract     = {A broad class of convex optimization problems can be formulated as a semidefinite program (SDP), minimization of a convex function over the positive-semidefinite cone subject to some affine constraints. The majority of classical SDP solvers are designed for the deterministic setting where problem data is readily available. In this setting, generalized conditional gradient methods (aka Frank-Wolfe-type methods) provide scalable solutions by leveraging the so-called linear minimization oracle instead of the projection onto the semidefinite cone. Most problems in machine learning and modern engineering applications, however, contain some degree of stochasticity. In this work, we propose the first conditional-gradient-type method for solving stochastic optimization problems under affine constraints. Our method guarantees $O(k^{-1/3})$ convergence rate in expectation on the objective residual and $O(k^{-5/12})$ on the feasibility gap.},
  author       = {Locatello, Francesco and Yurtsever, Alp and Fercoq, Olivier and Cevher, Volkan},
  booktitle    = {Advances in Neural Information Processing Systems},
  isbn         = {9781713807933},
  location     = {Vancouver, Canada},
  pages        = {14291--14301},
  title        = {{Stochastic Frank-Wolfe for composite convex minimization}},
  volume       = {32},
  year         = {2019},
}

@inproceedings{14193,
  abstract     = {A disentangled representation encodes information about the salient factors
of variation in the data independently. Although it is often argued that this
representational format is useful in learning to solve many real-world
down-stream tasks, there is little empirical evidence that supports this claim.
In this paper, we conduct a large-scale study that investigates whether
disentangled representations are more suitable for abstract reasoning tasks.
Using two new tasks similar to Raven's Progressive Matrices, we evaluate the
usefulness of the representations learned by 360 state-of-the-art unsupervised
disentanglement models. Based on these representations, we train 3600 abstract
reasoning models and observe that disentangled representations do in fact lead
to better down-stream performance. In particular, they enable quicker learning
using fewer samples.},
  author       = {Steenkiste, Sjoerd van and Locatello, Francesco and Schmidhuber, Jürgen and Bachem, Olivier},
  booktitle    = {Advances in Neural Information Processing Systems},
  isbn         = {9781713807933},
  location     = {Vancouver, Canada},
  title        = {{Are disentangled representations helpful for abstract visual reasoning?}},
  volume       = {32},
  year         = {2019},
}

@inproceedings{14197,
  abstract     = {Recently there has been a significant interest in learning disentangled
representations, as they promise increased interpretability, generalization to
unseen scenarios and faster learning on downstream tasks. In this paper, we
investigate the usefulness of different notions of disentanglement for
improving the fairness of downstream prediction tasks based on representations.
We consider the setting where the goal is to predict a target variable based on
the learned representation of high-dimensional observations (such as images)
that depend on both the target variable and an \emph{unobserved} sensitive
variable. We show that in this setting both the optimal and empirical
predictions can be unfair, even if the target variable and the sensitive
variable are independent. Analyzing the representations of more than
12600 trained state-of-the-art disentangled models, we observe that
several disentanglement scores are consistently correlated with increased
fairness, suggesting that disentanglement may be a useful property to encourage
fairness when sensitive variables are not observed.},
  author       = {Locatello, Francesco and Abbati, Gabriele and Rainforth, Tom and Bauer, Stefan and Schölkopf, Bernhard and Bachem, Olivier},
  booktitle    = {Advances in Neural Information Processing Systems},
  isbn         = {9781713807933},
  location     = {Vancouver, Canada},
  pages        = {14611--14624},
  title        = {{On the fairness of disentangled representations}},
  volume       = {32},
  year         = {2019},
}

