@inproceedings{2057,
  abstract     = {In the past few years, a lot of attention has been devoted to multimedia indexing by fusing multimodal informations. Two kinds of fusion schemes are generally considered: The early fusion and the late fusion. We focus on late classifier fusion, where one combines the scores of each modality at the decision level. To tackle this problem, we investigate a recent and elegant well-founded quadratic program named MinCq coming from the machine learning PAC-Bayesian theory. MinCq looks for the weighted combination, over a set of real-valued functions seen as voters, leading to the lowest misclassification rate, while maximizing the voters’ diversity. We propose an extension of MinCq tailored to multimedia indexing. Our method is based on an order-preserving pairwise loss adapted to ranking that allows us to improve Mean Averaged Precision measure while taking into account the diversity of the voters that we want to fuse. We provide evidence that this method is naturally adapted to late fusion procedures and confirm the good behavior of our approach on the challenging PASCAL VOC’07 benchmark.},
  author       = {Morvant, Emilie and Habrard, Amaury and Ayache, Stéphane},
  booktitle    = {Structural, Syntactic, and Statistical Pattern Recognition},
  venue        = {Joensuu, Finland},
  pages        = {153--162},
  publisher    = {Springer},
  series       = {Lecture Notes in Computer Science},
  title        = {Majority vote of diverse classifiers for late fusion},
  doi          = {10.1007/978-3-662-44415-3_16},
  volume       = {8621},
  year         = {2014},
}

@inproceedings{2160,
  abstract     = {Transfer learning has received a lot of attention in the machine learning community over the last years, and several effective algorithms have been developed. However, relatively little is known about their theoretical properties, especially in the setting of lifelong learning, where the goal is to transfer information to tasks for which no data have been observed so far. In this work we study lifelong learning from a theoretical perspective. Our main result is a PAC-Bayesian generalization bound that offers a unified view on existing paradigms for transfer learning, such as the transfer of parameters or the transfer of low-dimensional representations. We also use the bound to derive two principled lifelong learning algorithms, and we show that these yield results comparable with existing methods.},
  author       = {Pentina, Anastasia and Lampert, Christoph},
  booktitle    = {Proceedings of the 31st International Conference on Machine Learning},
  venue        = {Beijing, China},
  pages        = {991--999},
  publisher    = {ML Research Press},
  title        = {A {PAC-Bayesian} bound for Lifelong Learning},
  volume       = {32},
  year         = {2014},
}

@inproceedings{2171,
  abstract     = {We present LS-CRF, a new method for training cyclic Conditional Random Fields (CRFs) from large datasets that is inspired by classical closed-form expressions for the maximum likelihood parameters of a generative graphical model with tree topology. Training a CRF with LS-CRF requires only solving a set of independent regression problems, each of which can be solved efficiently in closed form or by an iterative solver. This makes LS-CRF orders of magnitude faster than classical CRF training based on probabilistic inference, and at the same time more flexible and easier to implement than other approximate techniques, such as pseudolikelihood or piecewise training. We apply LS-CRF to the task of semantic image segmentation, showing that it achieves on par accuracy to other training techniques at higher speed, thereby allowing efficient CRF training from very large training sets. For example, training a linearly parameterized pairwise CRF on 150,000 images requires less than one hour on a modern workstation.},
  author       = {Kolesnikov, Alexander and Guillaumin, Matthieu and Ferrari, Vittorio and Lampert, Christoph},
  booktitle    = {Computer Vision -- {ECCV} 2014},
  editor       = {Fleet, David and Pajdla, Tomas and Schiele, Bernt and Tuytelaars, Tinne},
  venue        = {Zurich, Switzerland},
  number       = {PART 3},
  pages        = {550--565},
  publisher    = {Springer},
  series       = {Lecture Notes in Computer Science},
  title        = {Closed-form approximate {CRF} training for scalable image segmentation},
  doi          = {10.1007/978-3-319-10578-9_36},
  volume       = {8691},
  year         = {2014},
}

@inproceedings{2172,
  abstract     = {Fisher Kernels and Deep Learning were two developments with significant impact on large-scale object categorization in the last years. Both approaches were shown to achieve state-of-the-art results on large-scale object categorization datasets, such as ImageNet. Conceptually, however, they are perceived as very different and it is not uncommon for heated debates to spring up when advocates of both paradigms meet at conferences or workshops. In this work, we emphasize the similarities between both architectures rather than their differences and we argue that such a unified view allows us to transfer ideas from one domain to the other. As a concrete example we introduce a method for learning a support vector machine classifier with Fisher kernel at the same time as a task-specific data representation. We reinterpret the setting as a multi-layer feed forward network. Its final layer is the classifier, parameterized by a weight vector, and the two previous layers compute Fisher vectors, parameterized by the coefficients of a Gaussian mixture model. We introduce a gradient descent based learning algorithm that, in contrast to other feature learning techniques, is not just derived from intuition or biological analogy, but has a theoretical justification in the framework of statistical learning theory. Our experiments show that the new training procedure leads to significant improvements in classification accuracy while preserving the modularity and geometric interpretability of a support vector machine setup.},
  author       = {Sydorov, Vladyslav and Sakurada, Mayu and Lampert, Christoph},
  booktitle    = {Proceedings of the {IEEE} Computer Society Conference on Computer Vision and Pattern Recognition},
  venue        = {Columbus, USA},
  pages        = {1402--1409},
  publisher    = {IEEE},
  title        = {Deep {Fisher} Kernels -- End to end learning of the {Fisher} Kernel {GMM} parameters},
  doi          = {10.1109/CVPR.2014.182},
  year         = {2014},
}

@inproceedings{2173,
  abstract     = {In this work we introduce a new approach to co-classification, i.e. the task of jointly classifying multiple, otherwise independent, data samples. The method we present, named CoConut, is based on the idea of adding a regularizer in the label space to encode certain priors on the resulting labelings. A regularizer that encourages labelings that are smooth across the test set, for instance, can be seen as a test-time variant of the cluster assumption, which has been proven useful at training time in semi-supervised learning. A regularizer that introduces a preference for certain class proportions can be regarded as a prior distribution on the class labels. CoConut can build on existing classifiers without making any assumptions on how they were obtained and without the need to re-train them. The use of a regularizer adds a new level of flexibility. It allows the integration of potentially new information at test time, even in other modalities than what the classifiers were trained on. We evaluate our framework on six datasets, reporting a clear performance gain in classification accuracy compared to the standard classification setup that predicts labels for each test sample separately.},
  author       = {Khamis, Sameh and Lampert, Christoph},
  booktitle    = {Proceedings of the {British} Machine Vision Conference 2014},
  venue        = {Nottingham, UK},
  publisher    = {BMVA Press},
  title        = {{CoConut}: Co-classification with output space regularization},
  year         = {2014},
}

@article{2180,
  abstract     = {Weighted majority votes allow one to combine the output of several classifiers or voters. MinCq is a recent algorithm for optimizing the weight of each voter based on the minimization of a theoretical bound over the risk of the vote with elegant PAC-Bayesian generalization guarantees. However, while it has demonstrated good performance when combining weak classifiers, MinCq cannot make use of the useful a priori knowledge that one may have when using a mixture of weak and strong voters. In this paper, we propose P-MinCq, an extension of MinCq that can incorporate such knowledge in the form of a constraint over the distribution of the weights, along with general proofs of convergence that stand in the sample compression setting for data-dependent voters. The approach is applied to a vote of k-NN classifiers with a specific modeling of the voters' performance. P-MinCq significantly outperforms the classic k-NN classifier, a symmetric NN and MinCq using the same voters. We show that it is also competitive with LMNN, a popular metric learning algorithm, and that combining both approaches further reduces the error.},
  author       = {Bellet, Aurélien and Habrard, Amaury and Morvant, Emilie and Sebban, Marc},
  journal      = {Machine Learning},
  number       = {1--2},
  pages        = {129--154},
  publisher    = {Springer},
  title        = {Learning a priori constrained weighted majority votes},
  doi          = {10.1007/s10994-014-5462-z},
  volume       = {97},
  year         = {2014},
}

@inproceedings{2189,
  abstract     = {En apprentissage automatique, nous parlons d'adaptation de domaine lorsque les données de test (cibles) et d'apprentissage (sources) sont générées selon différentes distributions. Nous devons donc développer des algorithmes de classification capables de s'adapter à une nouvelle distribution, pour laquelle aucune information sur les étiquettes n'est disponible. Nous attaquons cette problématique sous l'angle de l'approche PAC-Bayésienne qui se focalise sur l'apprentissage de modèles définis comme des votes de majorité sur un ensemble de fonctions. Dans ce contexte, nous introduisons PV-MinCq une version adaptative de l'algorithme (non adaptatif) MinCq. PV-MinCq suit le principe suivant. Nous transférons les étiquettes sources aux points cibles proches pour ensuite appliquer MinCq sur l'échantillon cible ``auto-étiqueté'' (justifié par une borne théorique). Plus précisément, nous définissons un auto-étiquetage non itératif qui se focalise dans les régions où les distributions marginales source et cible sont les plus similaires. Dans un second temps, nous étudions l'influence de notre auto-étiquetage pour en déduire une procédure de validation des hyperparamètres. Finalement, notre approche montre des résultats empiriques prometteurs.},
  author       = {Morvant, Emilie},
  booktitle    = {Conférence Francophone sur l'Apprentissage Automatique ({CAp} 2014)},
  venue        = {Saint-Etienne, France},
  pages        = {49--58},
  publisher    = {Elsevier},
  title        = {Adaptation de domaine de vote de majorité par auto-étiquetage non itératif},
  volume       = {1},
  year         = {2014},
}

@article{2516,
  abstract     = {We study the problem of object recognition for categories for which we have no training examples, a task also called zero-data or zero-shot learning. This situation has hardly been studied in computer vision research, even though it occurs frequently: the world contains tens of thousands of different object classes and for only few of them image collections have been formed and suitably annotated. To tackle the problem we introduce attribute-based classification: objects are identified based on a high-level description that is phrased in terms of semantic attributes, such as the object's color or shape. Because the identification of each such property transcends the specific learning task at hand, the attribute classifiers can be pre-learned independently, e.g. from existing image datasets unrelated to the current task. Afterwards, new classes can be detected based on their attribute representation, without the need for a new training phase. In this paper we also introduce a new dataset, Animals with Attributes, of over 30,000 images of 50 animal classes, annotated with 85 semantic attributes. Extensive experiments on this and two more datasets show that attribute-based classification indeed is able to categorize images without access to any training images of the target classes.},
  author       = {Lampert, Christoph and Nickisch, Hannes and Harmeling, Stefan},
  journal      = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  number       = {3},
  pages        = {453--465},
  publisher    = {IEEE},
  title        = {Attribute-based classification for zero-shot learning of object categories},
  doi          = {10.1109/TPAMI.2013.140},
  volume       = {36},
  year         = {2013},
}

@inproceedings{2520,
  abstract     = {We propose a probabilistic model to infer supervised latent variables in the Hamming space from observed data. Our model allows simultaneous inference of the number of binary latent variables, and their values. The latent variables preserve neighbourhood structure of the data in a sense that objects in the same semantic concept have similar latent values, and objects in different concepts have dissimilar latent values. We formulate the supervised infinite latent variable problem based on an intuitive principle of pulling objects together if they are of the same type, and pushing them apart if they are not. We then combine this principle with a flexible Indian Buffet Process prior on the latent variables. We show that the inferred supervised latent variables can be directly used to perform a nearest neighbour search for the purpose of retrieval. We introduce a new application of dynamically extending hash codes, and show how to effectively couple the structure of the hash codes with continuously growing structure of the neighbourhood preserving infinite latent feature space.},
  author       = {Quadrianto, Novi and Sharmanska, Viktoriia and Knowles, David and Ghahramani, Zoubin},
  booktitle    = {Proceedings of the 29th Conference on Uncertainty in Artificial Intelligence},
  isbn         = {9780974903996},
  venue        = {Bellevue, WA, United States},
  pages        = {527--536},
  publisher    = {AUAI Press},
  title        = {The supervised {IBP}: Neighbourhood preserving infinite latent feature models},
  year         = {2013},
}

@inproceedings{2901,
  abstract     = {We introduce the M-modes problem for graphical models: predicting the M label configurations of highest probability that are at the same time local maxima of the probability landscape. M-modes have multiple possible applications: because they are intrinsically diverse, they provide a principled alternative to non-maximum suppression techniques for structured prediction, they can act as codebook vectors for quantizing the configuration space, or they can form component centers for mixture model approximation. We present two algorithms for solving the M-modes problem. The first algorithm solves the problem in polynomial time when the underlying graphical model is a simple chain. The second algorithm solves the problem for junction chains. In synthetic and real dataset, we demonstrate how M-modes can improve the performance of prediction. We also use the generated modes as a tool to understand the topography of the probability distribution of configurations, for example with relation to the training set size and amount of noise in the data.},
  author       = {Chen, Chao and Kolmogorov, Vladimir and Zhu, Yan and Metaxas, Dimitris and Lampert, Christoph},
  booktitle    = {Proceedings of the 16th International Conference on Artificial Intelligence and Statistics},
  venue        = {Scottsdale, AZ, United States},
  pages        = {161--169},
  publisher    = {JMLR},
  title        = {Computing the {M} most probable modes of a graphical model},
  volume       = {31},
  year         = {2013},
}

@inproceedings{2948,
  abstract     = {Many visual datasets are traditionally used to analyze the performance of different learning techniques. The evaluation is usually done within each dataset, therefore it is questionable if such results are a reliable indicator of true generalization ability. We propose here an algorithm to exploit the existing data resources when learning on a new multiclass problem. Our main idea is to identify an image representation that decomposes orthogonally into two subspaces: a part specific to each dataset, and a part generic to, and therefore shared between, all the considered source sets. This allows us to use the generic representation as un-biased reference knowledge for a novel classification task. By casting the method in the multi-view setting, we also make it possible to use different features for different databases. We call the algorithm MUST, Multitask Unaligned Shared knowledge Transfer. Through extensive experiments on five public datasets, we show that MUST consistently improves the cross-datasets generalization performance.},
  author       = {Tommasi, Tatiana and Quadrianto, Novi and Caputo, Barbara and Lampert, Christoph},
  booktitle    = {Computer Vision -- {ACCV} 2012},
  venue        = {Daejeon, Korea},
  pages        = {1--15},
  publisher    = {Springer},
  series       = {Lecture Notes in Computer Science},
  title        = {Beyond dataset bias: Multi-task unaligned shared knowledge transfer},
  doi          = {10.1007/978-3-642-37331-2_1},
  volume       = {7724},
  year         = {2013},
}

@incollection{3321,
  author       = {Quadrianto, Novi and Lampert, Christoph},
  booktitle    = {Encyclopedia of Systems Biology},
  editor       = {Dubitzky, Werner and Wolkenhauer, Olaf and Cho, Kwang and Yokota, Hiroki},
  pages        = {1069},
  publisher    = {Springer},
  title        = {Kernel based learning},
  doi          = {10.1007/978-1-4419-9863-7_604},
  volume       = {3},
  year         = {2013},
}

@inproceedings{2293,
  abstract     = {Many computer vision problems have an asymmetric distribution of information between training and test time. In this work, we study the case where we are given additional information about the training data, which however will not be available at test time. This situation is called learning using privileged information (LUPI). We introduce two maximum-margin techniques that are able to make use of this additional source of information, and we show that the framework is applicable to several scenarios that have been studied in computer vision before. Experiments with attributes, bounding boxes, image tags and rationales as additional information in object classification show promising results.},
  author       = {Sharmanska, Viktoriia and Quadrianto, Novi and Lampert, Christoph},
  booktitle    = {Proceedings of the {IEEE} International Conference on Computer Vision},
  venue        = {Sydney, Australia},
  pages        = {825--832},
  publisher    = {IEEE},
  title        = {Learning to rank using privileged information},
  doi          = {10.1109/ICCV.2013.107},
  year         = {2013},
}

@inproceedings{2294,
  abstract     = {In this work we propose a system for automatic classification of Drosophila embryos into developmental stages. While the system is designed to solve an actual problem in biological research, we believe that the principle underlying it is interesting not only for biologists, but also for researchers in computer vision. The main idea is to combine two orthogonal sources of information: one is a classifier trained on strongly invariant features, which makes it applicable to images of very different conditions, but also leads to rather noisy predictions. The other is a label propagation step based on a more powerful similarity measure that however is only consistent within specific subsets of the data at a time. In our biological setup, the information sources are the shape and the staining patterns of embryo images. We show experimentally that while neither of the methods can be used by itself to achieve satisfactory results, their combination achieves prediction quality comparable to human performance.},
  author       = {Kazmar, Tomas and Kvon, Evgeny and Stark, Alexander and Lampert, Christoph},
  booktitle    = {Proceedings of the {IEEE} International Conference on Computer Vision},
  venue        = {Sydney, Australia},
  publisher    = {IEEE},
  title        = {{Drosophila} Embryo Stage Annotation using Label Propagation},
  doi          = {10.1109/ICCV.2013.139},
  year         = {2013},
}

@inproceedings{2825,
  abstract     = {We study the problem of maximum marginal prediction (MMP) in probabilistic graphical models, a task that occurs, for example, as the Bayes optimal decision rule under a Hamming loss. MMP is typically performed as a two-stage procedure: one estimates each variable's marginal probability and then forms a prediction from the states of maximal probability. In this work we propose a simple yet effective technique for accelerating MMP when inference is sampling-based: instead of the above two-stage procedure we directly estimate the posterior probability of each decision variable. This allows us to identify the point of time when we are sufficiently certain about any individual decision. Whenever this is the case, we dynamically prune the variables we are confident about from the underlying factor graph. Consequently, at any time only samples of variables whose decision is still uncertain need to be created. Experiments in two prototypical scenarios, multi-label classification and image inpainting, show that adaptive sampling can drastically accelerate MMP without sacrificing prediction accuracy.},
  author       = {Lampert, Christoph},
  booktitle    = {Advances in Neural Information Processing Systems 25},
  venue        = {Lake Tahoe, NV, United States},
  pages        = {82--90},
  publisher    = {Neural Information Processing Systems},
  title        = {Dynamic pruning of factor graphs for maximum marginal prediction},
  volume       = {1},
  year         = {2012},
}

@inproceedings{2915,
  author       = {Kroemer, Oliver and Lampert, Christoph and Peters, Jan},
  publisher    = {Deutsches Zentrum für Luft und Raumfahrt},
  title        = {Multi-modal learning for dynamic tactile sensing},
  year         = {2012},
}

@inproceedings{3124,
  abstract     = {We consider the problem of inference in a graphical model with binary variables. While in theory it is arguably preferable to compute marginal probabilities, in practice researchers often use MAP inference due to the availability of efficient discrete optimization algorithms. We bridge the gap between the two approaches by introducing the Discrete Marginals technique in which approximate marginals are obtained by minimizing an objective function with unary and pairwise terms over a discretized domain. This allows the use of techniques originally developed for MAP-MRF inference and learning. We explore two ways to set up the objective function - by discretizing the Bethe free energy and by learning it from training data. Experimental results show that for certain types of graphs a learned function can outperform the Bethe approximation. We also establish a link between the Bethe free energy and submodular functions.},
  author       = {Korc, Filip and Kolmogorov, Vladimir and Lampert, Christoph},
  venue        = {Edinburgh, Scotland},
  publisher    = {ICML},
  title        = {Approximating marginals using discrete energy minimization},
  year         = {2012},
}

@inproceedings{3125,
  abstract     = {We propose a new learning method to infer a mid-level feature representation that combines the advantage of semantic attribute representations with the higher expressive power of non-semantic features. The idea lies in augmenting an existing attribute-based representation with additional dimensions for which an autoencoder model is coupled with a large-margin principle. This construction allows a smooth transition between the zero-shot regime with no training example, the unsupervised regime with training examples but without class labels, and the supervised regime with training examples and with class labels. The resulting optimization problem can be solved efficiently, because several of the necessity steps have closed-form solutions. Through extensive experiments we show that the augmented representation achieves better results in terms of object categorization accuracy than the semantic representation alone.},
  author       = {Sharmanska, Viktoriia and Quadrianto, Novi and Lampert, Christoph},
  booktitle    = {Computer Vision -- {ECCV} 2012},
  venue        = {Florence, Italy},
  number       = {PART 5},
  pages        = {242--255},
  publisher    = {Springer},
  series       = {Lecture Notes in Computer Science},
  title        = {Augmented attribute representations},
  doi          = {10.1007/978-3-642-33715-4_18},
  volume       = {7576},
  year         = {2012},
}

@inproceedings{3126,
  abstract     = {In this work we propose a new information-theoretic clustering algorithm that infers cluster memberships by direct optimization of a non-parametric mutual information estimate between data distribution and cluster assignment. Although the optimization objective has a solid theoretical foundation it is hard to optimize. We propose an approximate optimization formulation that leads to an efficient algorithm with low runtime complexity. The algorithm has a single free parameter, the number of clusters to find. We demonstrate superior performance on several synthetic and real datasets.},
  author       = {Müller, Andreas and Nowozin, Sebastian and Lampert, Christoph},
  booktitle    = {Pattern Recognition: Joint 34th {DAGM} and 36th {OAGM} Symposium},
  venue        = {Graz, Austria},
  pages        = {205--215},
  publisher    = {Springer},
  series       = {Lecture Notes in Computer Science},
  title        = {Information theoretic clustering using minimal spanning trees},
  doi          = {10.1007/978-3-642-32717-9_21},
  volume       = {7476},
  year         = {2012},
}

@inproceedings{3127,
  abstract     = {When searching for characteristic subpatterns in potentially noisy graph data, it appears self-evident that having multiple observations would be better than having just one. However, it turns out that the inconsistencies introduced when different graph instances have different edge sets pose a serious challenge. In this work we address this challenge for the problem of finding maximum weighted cliques. We introduce the concept of most persistent soft-clique. This is subset of vertices, that 1) is almost fully or at least densely connected, 2) occurs in all or almost all graph instances, and 3) has the maximum weight. We present a measure of clique-ness, that essentially counts the number of edge missing to make a subset of vertices into a clique. With this measure, we show that the problem of finding the most persistent soft-clique problem can be cast either as: a) a max-min two person game optimization problem, or b) a min-min soft margin optimization problem. Both formulations lead to the same solution when using a partial Lagrangian method to solve the optimization problems. By experiments on synthetic data and on real social network data, we show that the proposed method is able to reliably find soft cliques in graph data, even if that is distorted by random noise or unreliable observations.},
  author       = {Quadrianto, Novi and Lampert, Christoph and Chen, Chao},
  booktitle    = {Proceedings of the 29th International Conference on Machine Learning},
  venue        = {Edinburgh, United Kingdom},
  pages        = {211--218},
  publisher    = {ML Research Press},
  title        = {The most persistent soft-clique in a set of sampled graphs},
  year         = {2012},
}

