% Journal article: ACM Transactions on the Web 5(3), 2011. Title keeps only the acronym URL brace-protected.
@article{11673,
  abstract     = {Given only the URL of a Web page, can we identify its topic? We study this problem in detail by exploring a large number of different feature sets and algorithms on several datasets. We also show that the inherent overlap between topics and the sparsity of the information in URLs makes this a very challenging problem. Web page classification without a page's content is desirable when the content is not available at all, when a classification is needed before obtaining the content, or when classification speed is of utmost importance. For our experiments we used five different corpora comprising a total of about 3 million (URL, classification) pairs. We evaluated several techniques for feature generation and classification algorithms. The individual binary classifiers were then combined via boosting into metabinary classifiers. We achieve typical F-measure values between 80 and 85, and a typical precision of around 86. The precision can be pushed further over 90 while maintaining a typical level of recall between 30 and 40.},
  author       = {Baykan, Eda and Henzinger, Monika H. and Marian, Ludmila and Weber, Ingmar},
  issn         = {1559-114X},
  journal      = {ACM Transactions on the Web},
  keywords     = {Topic classification, URL, ODP},
  number       = {3},
  publisher    = {Association for Computing Machinery},
  title        = {A comprehensive study of features and algorithms for {URL}-based topic classification},
  doi          = {10.1145/1993053.1993057},
  volume       = {5},
  year         = {2011},
}

% Journal article: Computer Networks 33(1--6), 2000. Title keeps only the acronym URL brace-protected.
@article{11685,
  abstract     = {We consider the problem of sampling URLs uniformly at random from the Web. A tool for sampling URLs uniformly can be used to estimate various properties of Web pages, such as the fraction of pages in various Internet domains or written in various languages. Moreover, uniform URL sampling can be used to determine the sizes of various search engines relative to the entire Web. In this paper, we consider sampling approaches based on random walks of the Web graph. In particular, we suggest ways of improving sampling based on random walks to make the samples closer to uniform. We suggest a natural test bed based on random graphs for testing the effectiveness of our procedures. We then use our sampling approach to estimate the distribution of pages over various Internet domains and to estimate the coverage of various search engine indexes.},
  author       = {Henzinger, Monika H. and Heydon, Allan and Mitzenmacher, Michael and Najork, Marc},
  issn         = {1389-1286},
  journal      = {Computer Networks},
  keywords     = {URL sampling, Random walks, Internet domain distribution, Search engine size},
  number       = {1--6},
  pages        = {295--308},
  publisher    = {Elsevier},
  title        = {On near-uniform {URL} sampling},
  doi          = {10.1016/s1389-1286(00)00055-4},
  volume       = {33},
  year         = {2000},
}

