% Journal article, citation key 11673. The title is stored in Title Case with
% only the acronym brace-protected, so the bibliography style decides casing.
@article{11673,
  author       = {Baykan, Eda and Henzinger, Monika H. and Marian, Ludmila and Weber, Ingmar},
  title        = {A Comprehensive Study of Features and Algorithms for {URL}-Based Topic Classification},
  journal      = {ACM Transactions on the Web},
  volume       = {5},
  number       = {3},
  year         = {2011},
  publisher    = {Association for Computing Machinery},
  issn         = {1559-114X},
  doi          = {10.1145/1993053.1993057},
  keywords     = {Topic classification, URL, ODP},
  abstract     = {Given only the URL of a Web page, can we identify its topic? We study this problem in detail by exploring a large number of different feature sets and algorithms on several datasets. We also show that the inherent overlap between topics and the sparsity of the information in URLs makes this a very challenging problem. Web page classification without a page’s content is desirable when the content is not available at all, when a classification is needed before obtaining the content, or when classification speed is of utmost importance. For our experiments we used five different corpora comprising a total of about 3 million (URL, classification) pairs. We evaluated several techniques for feature generation and classification algorithms. The individual binary classifiers were then combined via boosting into metabinary classifiers. We achieve typical F-measure values between 80 and 85, and a typical precision of around 86. The precision can be pushed further over 90 while maintaining a typical level of recall between 30 and 40.},
}

