@inproceedings{14106,
  abstract     = {We show that deep networks trained to satisfy demographic parity often do so
through a form of race or gender awareness, and that the more we force a network
to be fair, the more accurately we can recover race or gender from the internal state
of the network. Based on this observation, we investigate an alternative fairness
approach: we add a second classification head to the network to explicitly predict
the protected attribute (such as race or gender) alongside the original task. After
training the two-headed network, we enforce demographic parity by merging the
two heads, creating a network with the same architecture as the original network.
We establish a close relationship between existing approaches and our approach
by showing (1) that the decisions of a fair classifier are well-approximated by our
approach, and (2) that an unfair and optimally accurate classifier can be recovered
from a fair classifier and our second head predicting the protected attribute. We use
our explicit formulation to argue that the existing fairness approaches, just as ours,
demonstrate disparate treatment and that they are likely to be unlawful in a wide
range of scenarios under US law.},
  author       = {Lohaus, Michael and Kleindessner, Matthäus and Kenthapadi, Krishnaram and Locatello, Francesco and Russell, Chris},
  booktitle    = {36th Conference on Neural Information Processing Systems},
  isbn         = {9781713871088},
  location     = {New Orleans, LA, United States},
  pages        = {16548--16562},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {Are two heads the same as one? {Identifying} disparate treatment in fair neural networks},
  volume       = {35},
  year         = {2022},
}

@inproceedings{14173,
  abstract     = {Since out-of-distribution generalization is a generally ill-posed problem, various proxy targets (e.g., calibration, adversarial robustness, algorithmic corruptions, invariance across shifts) were studied across different research programs resulting in different recommendations. While sharing the same aspirational goal, these approaches have never been tested under the same
experimental conditions on real data. In this paper, we take a unified view of previous work, highlighting message discrepancies that we address empirically, and providing recommendations on how to measure the robustness of a model and how to improve it. To this end, we collect 172 publicly available dataset pairs for training and out-of-distribution evaluation of accuracy, calibration error, adversarial attacks, environment invariance, and synthetic corruptions. We fine-tune over 31k networks, from nine different architectures in the many- and
few-shot setting. Our findings confirm that in- and out-of-distribution accuracies tend to increase jointly, but show that their relation is largely dataset-dependent, and in general more nuanced and more complex than posited by previous, smaller scale studies.},
  author       = {Wenzel, Florian and Dittadi, Andrea and Gehler, Peter Vincent and Simon-Gabriel, Carl-Johann and Horn, Max and Zietlow, Dominik and Kernert, David and Russell, Chris and Brox, Thomas and Schiele, Bernt and Schölkopf, Bernhard and Locatello, Francesco},
  booktitle    = {36th Conference on Neural Information Processing Systems},
  isbn         = {9781713871088},
  location     = {New Orleans, LA, United States},
  pages        = {7181--7198},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {Assaying out-of-distribution generalization in transfer learning},
  volume       = {35},
  year         = {2022},
}

@inproceedings{12537,
  abstract     = {The Neural Tangent Kernel (NTK) has emerged as a powerful tool to provide memorization, optimization and generalization guarantees in deep neural networks. A line of work has studied the NTK spectrum for two-layer and deep networks with at least a layer with $\Omega(N)$ neurons, $N$ being the number of training samples. Furthermore, there is increasing evidence suggesting that deep networks with sub-linear layer widths are powerful memorizers and optimizers, as long as the number of parameters exceeds the number of samples. Thus, a natural open question is whether the NTK is well conditioned in such a challenging sub-linear setup. In this paper, we answer this question in the affirmative. Our key technical contribution is a lower bound on the smallest NTK eigenvalue for deep networks with the minimum possible over-parameterization: the number of parameters is roughly $\Omega(N)$ and, hence, the number of neurons is as little as $\Omega(\sqrt{N})$. To showcase the applicability of our NTK bounds, we provide two results concerning memorization capacity and optimization guarantees for gradient descent training.},
  author       = {Bombari, Simone and Amani, Mohammad Hossein and Mondelli, Marco},
  booktitle    = {36th Conference on Neural Information Processing Systems},
  isbn         = {9781713871088},
  location     = {New Orleans, LA, United States},
  pages        = {7628--7640},
  publisher    = {Curran Associates},
  title        = {Memorization and optimization in deep neural networks with minimum over-parameterization},
  volume       = {35},
  year         = {2022},
}

