@article{6944,
  abstract     = {We study the problem of automatically detecting if a given multi-class classifier operates outside of its specifications (out-of-specs), i.e. on input data from a different distribution than what it was trained for. This is an important problem to solve on the road towards creating reliable computer vision systems for real-world applications, because the quality of a classifier’s predictions cannot be guaranteed if it operates out-of-specs. Previously proposed methods for out-of-specs detection make decisions on the level of single inputs. This, however, is insufficient to achieve low false positive rate and high true positive rate at the same time. In this work, we describe a new procedure named KS(conf), based on statistical reasoning. Its main component is a classical Kolmogorov–Smirnov test that is applied to the set of predicted confidence values for batches of samples. Working with batches instead of single samples allows increasing the true positive rate without negatively affecting the false positive rate, thereby overcoming a crucial limitation of single sample tests. We show by extensive experiments using a variety of convolutional network architectures and datasets that KS(conf) reliably detects out-of-specs situations even under conditions where other tests fail. It furthermore has a number of properties that make it an excellent candidate for practical deployment: it is easy to implement, adds almost no overhead to the system, works with any classifier that outputs confidence scores, and requires no a priori knowledge about how the data distribution could change.},
  author       = {Sun, Rémy and Lampert, Christoph},
  issn         = {1573-1405},
  journal      = {International Journal of Computer Vision},
  number       = {4},
  pages        = {970--995},
  publisher    = {Springer Nature},
  title        = {{KS(conf): A light-weight test if a multiclass classifier operates outside of its specifications}},
  doi          = {10.1007/s11263-019-01232-x},
  volume       = {128},
  year         = {2020},
}

@inproceedings{7479,
  abstract     = {Multi-exit architectures, in which a stack of processing layers is interleaved with early output layers, allow the processing of a test example to stop early and thus save computation time and/or energy. In this work, we propose a new training procedure for multi-exit architectures based on the principle of knowledge distillation. The method encourages early exits to mimic later, more accurate exits, by matching their output probabilities.
Experiments on CIFAR100 and ImageNet show that distillation-based training significantly improves the accuracy of early exits while maintaining state-of-the-art accuracy for late ones. The method is particularly beneficial when training data is limited and it allows a straightforward extension to semi-supervised learning, i.e. making use of unlabeled data at training time. Moreover, it takes only a few lines to implement and incurs almost no computational overhead at training time, and none at all at test time.},
  author       = {Bui Thi Mai, Phuong and Lampert, Christoph},
  booktitle    = {IEEE International Conference on Computer Vision},
  isbn         = {9781728148038},
  issn         = {1550-5499},
  location     = {Seoul, South Korea},
  pages        = {1355--1364},
  publisher    = {IEEE},
  title        = {{Distillation-based training for multi-exit architectures}},
  doi          = {10.1109/ICCV.2019.00144},
  volume       = {2019-October},
  year         = {2019},
}

@inproceedings{7640,
  abstract     = {We propose a new model for detecting visual relationships, such as ``person riding motorcycle'' or ``bottle on table''. This task is an important step towards comprehensive structured image understanding, going beyond detecting individual objects. Our main novelty is a Box Attention mechanism that allows to model pairwise interactions between objects using standard object detection pipelines. The resulting model is conceptually clean, expressive and relies on well-justified training and prediction procedures. Moreover, unlike previously proposed approaches, our model does not introduce any additional complex components or hyperparameters on top of those already required by the underlying detection model. We conduct an experimental evaluation on two datasets, V-COCO and Open Images, demonstrating strong quantitative and qualitative results.},
  author       = {Kolesnikov, Alexander and Kuznetsova, Alina and Lampert, Christoph and Ferrari, Vittorio},
  booktitle    = {Proceedings of the 2019 International Conference on Computer Vision Workshop},
  isbn         = {9781728150239},
  location     = {Seoul, South Korea},
  publisher    = {IEEE},
  title        = {{Detecting visual relationships using box attention}},
  doi          = {10.1109/ICCVW.2019.00217},
  year         = {2019},
}

@inproceedings{6482,
  abstract     = {Computer vision systems for automatic image categorization have become accurate and reliable enough that they can run continuously for days or even years as components of real-world commercial applications. A major open problem in this context, however, is quality control. Good classification performance can only be expected if systems run under the specific conditions, in particular data distributions, that they were trained for. Surprisingly, none of the currently used deep network architectures have a built-in functionality that could detect if a network operates on data from a distribution it was not trained for, such that potentially a warning to the human users could be triggered. In this work, we describe KS(conf), a procedure for detecting such outside of specifications (out-of-specs) operation, based on statistical testing of the network outputs. We show by extensive experiments using the ImageNet, AwA2 and DAVIS datasets on a variety of ConvNets architectures that KS(conf) reliably detects out-of-specs situations. It furthermore has a number of properties that make it a promising candidate for practical deployment: it is easy to implement, adds almost no overhead to the system, works with all networks, including pretrained ones, and requires no a priori knowledge of how the data distribution could change.},
  author       = {Sun, Rémy and Lampert, Christoph},
  booktitle    = {Pattern Recognition - 40th German Conference, GCPR 2018},
  isbn         = {9783030129385},
  issn         = {1611-3349},
  location     = {Stuttgart, Germany},
  pages        = {244--259},
  publisher    = {Springer Nature},
  title        = {{KS(conf): A light-weight test if a ConvNet operates outside of its specifications}},
  doi          = {10.1007/978-3-030-12939-2_18},
  volume       = {11269},
  year         = {2019},
}

@inproceedings{6590,
  abstract     = {Modern machine learning methods often require more data for training than a single expert can provide. Therefore, it has become a standard procedure to collect data from external sources, e.g. via crowdsourcing. Unfortunately, the quality of these sources is not always guaranteed. As additional complications, the data might be stored in a distributed way, or might even have to remain private. In this work, we address the question of how to learn robustly in such scenarios. Studying the problem through the lens of statistical learning theory, we derive a procedure that allows for learning from all available sources, yet automatically suppresses irrelevant or corrupted data. We show by extensive experiments that our method provides significant improvements over alternative approaches from robust statistics and distributed optimization.},
  author       = {Konstantinov, Nikola H. and Lampert, Christoph},
  booktitle    = {Proceedings of the 36th International Conference on Machine Learning},
  location     = {Long Beach, CA, USA},
  pages        = {3488--3498},
  publisher    = {ML Research Press},
  title        = {{Robust learning from untrusted sources}},
  volume       = {97},
  year         = {2019},
}

@phdthesis{197,
  abstract     = {Modern computer vision systems heavily rely on statistical machine learning models, which typically require large amounts of labeled data to be learned reliably. Moreover, very recently computer vision research widely adopted techniques for representation learning, which further increase the demand for labeled data. However, for many important practical problems there is relatively small amount of labeled data available, so it is problematic to leverage full potential of the representation learning methods. One way to overcome this obstacle is to invest substantial resources into producing large labelled datasets. Unfortunately, this can be prohibitively expensive in practice. In this thesis we focus on the alternative way of tackling the aforementioned issue. We concentrate on methods, which make use of weakly-labeled or even unlabeled data. Specifically, the first half of the thesis is dedicated to the semantic image segmentation task. We develop a technique, which achieves competitive segmentation performance and only requires annotations in a form of global image-level labels instead of dense segmentation masks. Subsequently, we present a new methodology, which further improves segmentation performance by leveraging tiny additional feedback from a human annotator. By using our methods practitioners can greatly reduce the amount of data annotation effort, which is required to learn modern image segmentation models. In the second half of the thesis we focus on methods for learning from unlabeled visual data. We study a family of autoregressive models for modeling structure of natural images and discuss potential applications of these models. Moreover, we conduct in-depth study of one of these applications, where we develop the state-of-the-art model for the probabilistic image colorization task.},
  author       = {Kolesnikov, Alexander},
  issn         = {2663-337X},
  pages        = {113},
  school       = {Institute of Science and Technology Austria},
  title        = {{Weakly-Supervised Segmentation and Unsupervised Modeling of Natural Images}},
  doi          = {10.15479/AT:ISTA:th_1021},
  year         = {2018},
}

@phdthesis{68,
  abstract     = {The most common assumption made in statistical learning theory is the assumption of the independent and identically distributed (i.i.d.) data. While being very convenient mathematically, it is often very clearly violated in practice. This disparity between the machine learning theory and applications underlies a growing demand in the development of algorithms that learn from dependent data and theory that can provide generalization guarantees similar to the independent situations. This thesis is dedicated to two variants of dependencies that can arise in practice. One is a dependence on the level of samples in a single learning task. Another dependency type arises in the multi-task setting when the tasks are dependent on each other even though the data for them can be i.i.d. In both cases we model the data (samples or tasks) as stochastic processes and introduce new algorithms for both settings that take into account and exploit the resulting dependencies. We prove the theoretical guarantees on the performance of the introduced algorithms under different evaluation criteria and, in addition, we complement the theoretical study by the empirical one, where we evaluate some of the algorithms on two real world datasets to highlight their practical applicability.},
  author       = {Zimin, Alexander},
  issn         = {2663-337X},
  pages        = {92},
  school       = {Institute of Science and Technology Austria},
  title        = {{Learning from dependent data}},
  doi          = {10.15479/AT:ISTA:TH1048},
  year         = {2018},
}

@inproceedings{6011,
  abstract     = {We establish a data-dependent notion of algorithmic stability for Stochastic Gradient Descent (SGD), and employ it to develop novel generalization bounds. This is in contrast to previous distribution-free algorithmic stability results for SGD which depend on the worst-case constants. By virtue of the data-dependent argument, our bounds provide new insights into learning with SGD on convex and non-convex problems. In the convex case, we show that the bound on the generalization error depends on the risk at the initialization point. In the non-convex case, we prove that the expected curvature of the objective function around the initialization point has crucial influence on the generalization error. In both cases, our results suggest a simple data-driven strategy to stabilize SGD by pre-screening its initialization. As a corollary, our results allow us to show optimistic generalization bounds that exhibit fast convergence rates for SGD subject to a vanishing empirical risk and low noise of stochastic gradient.},
  author       = {Kuzborskij, Ilja and Lampert, Christoph},
  booktitle    = {Proceedings of the 35th International Conference on Machine Learning},
  location     = {Stockholm, Sweden},
  pages        = {2815--2824},
  publisher    = {ML Research Press},
  title        = {{Data-dependent stability of stochastic gradient descent}},
  volume       = {80},
  year         = {2018},
}

@inproceedings{1108,
  abstract     = {In this work we study the learnability of stochastic processes with respect to the conditional risk, i.e. the existence of a learning algorithm that improves its next-step performance with the amount of observed data. We introduce a notion of pairwise discrepancy between conditional distributions at different times steps and show how certain properties of these discrepancies can be used to construct a successful learning algorithm. Our main results are two theorems that establish criteria for learnability for many classes of stochastic processes, including all special cases studied previously in the literature.},
  author       = {Zimin, Alexander and Lampert, Christoph},
  booktitle    = {Proceedings of the 20th International Conference on Artificial Intelligence and Statistics},
  location     = {Fort Lauderdale, FL, United States},
  pages        = {213--222},
  publisher    = {ML Research Press},
  title        = {{Learning theory for conditional risk minimization}},
  volume       = {54},
  year         = {2017},
}

@inproceedings{6841,
  abstract     = {In classical machine learning, regression is treated as a black box process of identifying a suitable function from a hypothesis set without attempting to gain insight into the mechanism connecting inputs and outputs. In the natural sciences, however, finding an interpretable function for a phenomenon is the prime goal as it allows to understand and generalize results. This paper proposes a novel type of function learning network, called equation learner (EQL), that can learn analytical expressions and is able to extrapolate to unseen domains. It is implemented as an end-to-end differentiable feed-forward network and allows for efficient gradient based training. Due to sparsity regularization concise interpretable expressions can be obtained. Often the true underlying source expression is identified.},
  author       = {Martius, Georg S. and Lampert, Christoph},
  booktitle    = {5th International Conference on Learning Representations, ICLR 2017 - Workshop Track Proceedings},
  location     = {Toulon, France},
  publisher    = {International Conference on Learning Representations},
  title        = {{Extrapolation and learning equations}},
  year         = {2017},
}

@inproceedings{911,
  abstract     = {We develop a probabilistic technique for colorizing grayscale natural images. In light of the intrinsic uncertainty of this task, the proposed probabilistic framework has numerous desirable properties. In particular, our model is able to produce multiple plausible and vivid colorizations for a given grayscale image and is one of the first colorization models to provide a proper stochastic sampling scheme. Moreover, our training procedure is supported by a rigorous theoretical framework that does not require any ad hoc heuristics and allows for efficient modeling and learning of the joint pixel color distribution. We demonstrate strong quantitative and qualitative experimental results on the CIFAR-10 dataset and the challenging ILSVRC 2012 dataset.},
  author       = {Royer, Amélie and Kolesnikov, Alexander and Lampert, Christoph},
  booktitle    = {Proceedings of the British Machine Vision Conference 2017},
  location     = {London, United Kingdom},
  pages        = {85.1--85.12},
  publisher    = {BMVA Press},
  title        = {{Probabilistic image colorization}},
  doi          = {10.5244/c.31.85},
  year         = {2017},
}

@inproceedings{1000,
  abstract     = {We study probabilistic models of natural images and extend the autoregressive family of PixelCNN models by incorporating latent variables. Subsequently, we describe two new generative image models that exploit different image transformations as latent variables: a quantized grayscale view of the image or a multi-resolution image pyramid. The proposed models tackle two known shortcomings of existing PixelCNN models: 1) their tendency to focus on low-level image details, while largely ignoring high-level image information, such as object shapes, and 2) their computationally costly procedure for image sampling. We experimentally demonstrate benefits of our LatentPixelCNN models, in particular showing that they produce much more realistically looking image samples than previous state-of-the-art probabilistic models.},
  author       = {Kolesnikov, Alexander and Lampert, Christoph},
  booktitle    = {34th International Conference on Machine Learning},
  isbn         = {978-151085514-4},
  location     = {Sydney, Australia},
  pages        = {1905--1914},
  publisher    = {ML Research Press},
  title        = {{PixelCNN models with auxiliary variables for natural image modeling}},
  volume       = {70},
  year         = {2017},
}

@inproceedings{998,
  abstract     = {A major open problem on the road to artificial intelligence is the development of incrementally learning systems that learn about more and more concepts over time from a stream of data. In this work, we introduce a new training strategy, iCaRL, that allows learning in such a class-incremental way: only the training data for a small number of classes has to be present at the same time and new classes can be added progressively. iCaRL learns strong classifiers and a data representation simultaneously. This distinguishes it from earlier works that were fundamentally limited to fixed data representations and therefore incompatible with deep learning architectures. We show by experiments on CIFAR-100 and ImageNet ILSVRC 2012 data that iCaRL can learn many classes incrementally over a long period of time where other strategies quickly fail.},
  author       = {Rebuffi, Sylvestre-Alvise and Kolesnikov, Alexander and Sperl, Georg and Lampert, Christoph},
  booktitle    = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  isbn         = {978-153860457-1},
  location     = {Honolulu, HI, United States},
  pages        = {5533--5542},
  publisher    = {IEEE},
  title        = {{iCaRL: Incremental classifier and representation learning}},
  doi          = {10.1109/CVPR.2017.587},
  volume       = {2017},
  year         = {2017},
}

@inproceedings{999,
  abstract     = {In multi-task learning, a learner is given a collection of prediction tasks and needs to solve all of them. In contrast to previous work, which required that annotated training data must be available for all tasks, we consider a new setting, in which for some tasks, potentially most of them, only unlabeled training data is provided. Consequently, to solve all tasks, information must be transferred between tasks with labels and tasks without labels. Focusing on an instance-based transfer method we analyze two variants of this setting: when the set of labeled tasks is fixed, and when it can be actively selected by the learner. We state and prove a generalization bound that covers both scenarios and derive from it an algorithm for making the choice of labeled tasks (in the active case) and for transferring information between the tasks in a principled way. We also illustrate the effectiveness of the algorithm on synthetic and real data.},
  author       = {Pentina, Anastasia and Lampert, Christoph},
  booktitle    = {34th International Conference on Machine Learning},
  isbn         = {9781510855144},
  location     = {Sydney, Australia},
  pages        = {2807--2816},
  publisher    = {ML Research Press},
  title        = {{Multi-task learning with labeled and unlabeled tasks}},
  volume       = {70},
  year         = {2017},
}

@inproceedings{1098,
  abstract     = {Better understanding of the potential benefits of information transfer and representation learning is an important step towards the goal of building intelligent systems that are able to persist in the world and learn over time. In this work, we consider a setting where the learner encounters a stream of tasks but is able to retain only limited information from each encountered task, such as a learned predictor. In contrast to most previous works analyzing this scenario, we do not make any distributional assumptions on the task generating process. Instead, we formulate a complexity measure that captures the diversity of the observed tasks. We provide a lifelong learning algorithm with error guarantees for every observed task (rather than on average). We show sample complexity reductions in comparison to solving every task in isolation in terms of our task complexity measure. Further, our algorithmic framework can naturally be viewed as learning a representation from encountered tasks with a neural network.},
  author       = {Pentina, Anastasia and Urner, Ruth},
  booktitle    = {Advances in Neural Information Processing Systems},
  location     = {Barcelona, Spain},
  pages        = {3619--3627},
  publisher    = {Neural Information Processing Systems},
  title        = {{Lifelong learning with weighted majority votes}},
  volume       = {29},
  year         = {2016},
}

@inproceedings{1102,
  abstract     = {Weakly-supervised object localization methods tend to fail for object classes that consistently co-occur with the same background elements, e.g. trains on tracks. We propose a method to overcome these failures by adding a very small amount of model-specific additional annotation. The main idea is to cluster a deep network's mid-level representations and assign object or distractor labels to each cluster. Experiments show substantially improved localization results on the challenging ILSVRC 2014 dataset for bounding box detection and the PASCAL VOC 2012 dataset for semantic segmentation.},
  author       = {Kolesnikov, Alexander and Lampert, Christoph},
  booktitle    = {Proceedings of the British Machine Vision Conference 2016},
  location     = {York, United Kingdom},
  pages        = {92.1--92.12},
  publisher    = {BMVA Press},
  title        = {{Improving weakly-supervised object localization by micro-annotation}},
  doi          = {10.5244/C.30.92},
  volume       = {2016-September},
  year         = {2016},
}

@phdthesis{1126,
  abstract     = {Traditionally machine learning has been focusing on the problem of solving a single
task in isolation. While being quite well understood, this approach disregards an
important aspect of human learning: when facing a new problem, humans are able to
exploit knowledge acquired from previously learned tasks. Intuitively, access to several
problems simultaneously or sequentially could also be advantageous for a machine
learning system, especially if these tasks are closely related. Indeed, results of many
empirical studies have provided justification for this intuition. However, theoretical
justifications of this idea are rather limited.
The focus of this thesis is to expand the understanding of potential benefits of information
transfer between several related learning problems. We provide theoretical
analysis for three scenarios of multi-task learning - multiple kernel learning, sequential
learning and active task selection. We also provide a PAC-Bayesian perspective on
lifelong learning and investigate how the task generation process influences the generalization
guarantees in this scenario. In addition, we show how some of the obtained
theoretical results can be used to derive principled multi-task and lifelong learning
algorithms and illustrate their performance on various synthetic and real-world datasets.},
  author       = {Pentina, Anastasia},
  issn         = {2663-337X},
  pages        = {127},
  school       = {Institute of Science and Technology Austria},
  title        = {{Theoretical foundations of multi-task lifelong learning}},
  doi          = {10.15479/AT:ISTA:TH_776},
  year         = {2016},
}

@inproceedings{1369,
  abstract     = {We introduce a new loss function for the weakly-supervised training of semantic image segmentation models based on three guiding principles: to seed with weak localization cues, to expand objects based on the information about which classes can occur in an image, and to constrain the segmentations to coincide with object boundaries. We show experimentally that training a deep convolutional neural network using the proposed loss function leads to substantially better segmentations than previous state-of-the-art methods on the challenging PASCAL VOC 2012 dataset. We furthermore give insight into the working mechanism of our method by a detailed experimental study that illustrates how the segmentation quality is affected by each term of the proposed loss function as well as their combinations.},
  author       = {Kolesnikov, Alexander and Lampert, Christoph},
  booktitle    = {Computer Vision -- ECCV 2016},
  location     = {Amsterdam, The Netherlands},
  pages        = {695--711},
  publisher    = {Springer},
  title        = {{Seed, expand and constrain: Three principles for weakly-supervised image segmentation}},
  doi          = {10.1007/978-3-319-46493-0_42},
  volume       = {9908},
  year         = {2016},
}

@inproceedings{1706,
  abstract     = {We consider a problem of learning kernels for use in SVM classification in the multi-task and lifelong scenarios and provide generalization bounds on the error of a large margin classifier. Our results show that, under mild conditions on the family of kernels used for learning, solving several related tasks simultaneously is beneficial over single task learning. In particular, as the number of observed tasks grows, assuming that in the considered family of kernels there exists one that yields low approximation error on all tasks, the overhead associated with learning such a kernel vanishes and the complexity converges to that of learning when this good kernel is given to the learner.},
  author       = {Pentina, Anastasia and Ben-David, Shai},
  booktitle    = {Algorithmic Learning Theory},
  location     = {Banff, AB, Canada},
  pages        = {194--208},
  publisher    = {Springer},
  title        = {{Multi-task and lifelong learning of kernels}},
  doi          = {10.1007/978-3-319-24486-0_13},
  volume       = {9355},
  year         = {2015},
}

@inproceedings{1859,
  abstract     = {Structural support vector machines (SSVMs) are amongst the best performing models for structured computer vision tasks, such as semantic image segmentation or human pose estimation. Training SSVMs, however, is computationally costly, because it requires repeated calls to a structured prediction subroutine (called max-oracle), which has to solve an optimization problem itself, e.g. a graph cut.
In this work, we introduce a new algorithm for SSVM training that is more efficient than earlier techniques when the max-oracle is computationally expensive, as it is frequently the case in computer vision tasks. The main idea is to (i) combine the recent stochastic Block-Coordinate Frank-Wolfe algorithm with efficient hyperplane caching, and (ii) use an automatic selection rule for deciding whether to call the exact max-oracle or to rely on an approximate one based on the cached hyperplanes.
We show experimentally that this strategy leads to faster convergence to the optimum with respect to the number of required oracle calls, and that this translates into faster convergence with respect to the total runtime when the max-oracle is slow compared to the other steps of the algorithm.},
  author       = {Shah, Neel and Kolmogorov, Vladimir and Lampert, Christoph},
  booktitle    = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  location     = {Boston, MA, USA},
  pages        = {2737--2745},
  publisher    = {IEEE},
  title        = {{A multi-plane block-coordinate Frank-Wolfe algorithm for training structural SVMs with a costly max-oracle}},
  doi          = {10.1109/CVPR.2015.7298890},
  year         = {2015},
}

