@article{11671,
  abstract     = {Given only the URL of a Web page, can we identify its language? In this article we examine this question. URL-based language classification is useful when the content of the Web page is not available or downloading the content is a waste of bandwidth and time.
We built URL-based language classifiers for English, German, French, Spanish, and Italian by applying a variety of algorithms and features. As algorithms we used machine learning algorithms which are widely applied for text classification and state-of-art algorithms for language identification of text. As features we used words, various sized n-grams, and custom-made features (our novel feature set). We compared our approaches with two baseline methods, namely classification by country code top-level domains and classification by IP addresses of the hosting Web servers.

We trained and tested our classifiers in a 10-fold cross-validation setup on a dataset obtained from the Open Directory Project and from querying a commercial search engine. We obtained the lowest F1-measure for English (94) and the highest F1-measure for German (98) with the best performing classifiers.

We also evaluated the performance of our methods: (i) on a set of Web pages written in Adobe Flash and (ii) as part of a language-focused crawler. In the first case, the content of the Web page is hard to extract and in the second page downloading pages of the ``wrong'' language constitutes a waste of bandwidth. In both settings the best classifiers have a high accuracy with an F1-measure between 95 (for English) and 98 (for Italian) for the Adobe Flash pages and a precision between 90 (for Italian) and 97 (for French) for the language-focused crawler.},
  author       = {Baykan, Eda and Weber, Ingmar and Henzinger, Monika H.},
  issn         = {1559-114X},
  journal      = {ACM Transactions on the Web},
  keywords     = {Computer Networks and Communications},
  number       = {1},
  publisher    = {Association for Computing Machinery},
  title        = {A Comprehensive Study of Techniques for {URL}-Based {Web} Page Language Classification},
  doi          = {10.1145/2435215.2435218},
  volume       = {7},
  year         = {2013},
}

@article{11673,
  abstract     = {Given only the URL of a Web page, can we identify its topic? We study this problem in detail by exploring a large number of different feature sets and algorithms on several datasets. We also show that the inherent overlap between topics and the sparsity of the information in URLs makes this a very challenging problem. Web page classification without a page's content is desirable when the content is not available at all, when a classification is needed before obtaining the content, or when classification speed is of utmost importance. For our experiments we used five different corpora comprising a total of about 3 million (URL, classification) pairs. We evaluated several techniques for feature generation and classification algorithms. The individual binary classifiers were then combined via boosting into metabinary classifiers. We achieve typical F-measure values between 80 and 85, and a typical precision of around 86. The precision can be pushed further over 90 while maintaining a typical level of recall between 30 and 40.},
  author       = {Baykan, Eda and Henzinger, Monika H. and Marian, Ludmila and Weber, Ingmar},
  issn         = {1559-114X},
  journal      = {ACM Transactions on the Web},
  keywords     = {Topic classification, URL, ODP},
  number       = {3},
  publisher    = {Association for Computing Machinery},
  title        = {A Comprehensive Study of Features and Algorithms for {URL}-Based Topic Classification},
  doi          = {10.1145/1993053.1993057},
  volume       = {5},
  year         = {2011},
}

