[{"date_created":"2022-07-27T13:48:11Z","date_published":"2011-07-01T00:00:00Z","article_type":"original","month":"07","language":[{"iso":"eng"}],"scopus_import":"1","publisher":"Association for Computing Machinery","publication":"ACM Transactions on the Web","issue":"3","type":"journal_article","day":"01","status":"public","intvolume":"         5","article_number":"15","doi":"10.1145/1993053.1993057","year":"2011","title":"A comprehensive study of features and algorithms for URL-based topic classification","article_processing_charge":"No","volume":5,"date_updated":"2022-09-12T08:46:56Z","quality_controlled":"1","oa_version":"None","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","extern":"1","publication_identifier":{"eissn":["1559-114X"],"issn":["1559-1131"]},"_id":"11673","citation":{"mla":"Baykan, Eda, et al. “A Comprehensive Study of Features and Algorithms for URL-Based Topic Classification.” <i>ACM Transactions on the Web</i>, vol. 5, no. 3, 15, Association for Computing Machinery, 2011, doi:<a href=\"https://doi.org/10.1145/1993053.1993057\">10.1145/1993053.1993057</a>.","ama":"Baykan E, Henzinger MH, Marian L, Weber I. A comprehensive study of features and algorithms for URL-based topic classification. <i>ACM Transactions on the Web</i>. 2011;5(3). doi:<a href=\"https://doi.org/10.1145/1993053.1993057\">10.1145/1993053.1993057</a>","ista":"Baykan E, Henzinger MH, Marian L, Weber I. 2011. A comprehensive study of features and algorithms for URL-based topic classification. ACM Transactions on the Web. 5(3), 15.","short":"E. Baykan, M.H. Henzinger, L. Marian, I. Weber, ACM Transactions on the Web 5 (2011).","apa":"Baykan, E., Henzinger, M. H., Marian, L., &#38; Weber, I. (2011). A comprehensive study of features and algorithms for URL-based topic classification. <i>ACM Transactions on the Web</i>. Association for Computing Machinery. <a href=\"https://doi.org/10.1145/1993053.1993057\">https://doi.org/10.1145/1993053.1993057</a>","ieee":"E. Baykan, M. H. Henzinger, L. Marian, and I. Weber, “A comprehensive study of features and algorithms for URL-based topic classification,” <i>ACM Transactions on the Web</i>, vol. 5, no. 3. Association for Computing Machinery, 2011.","chicago":"Baykan, Eda, Monika H Henzinger, Ludmila Marian, and Ingmar Weber. “A Comprehensive Study of Features and Algorithms for URL-Based Topic Classification.” <i>ACM Transactions on the Web</i>. Association for Computing Machinery, 2011. <a href=\"https://doi.org/10.1145/1993053.1993057\">https://doi.org/10.1145/1993053.1993057</a>."},"publication_status":"published","keyword":["Topic classification","URL","ODP"],"author":[{"first_name":"Eda","full_name":"Baykan, Eda","last_name":"Baykan"},{"id":"540c9bbd-f2de-11ec-812d-d04a5be85630","full_name":"Henzinger, Monika H","last_name":"Henzinger","orcid":"0000-0002-5008-6530","first_name":"Monika H"},{"first_name":"Ludmila","last_name":"Marian","full_name":"Marian, Ludmila"},{"first_name":"Ingmar","full_name":"Weber, Ingmar","last_name":"Weber"}],"abstract":[{"text":"Given only the URL of a Web page, can we identify its topic? We study this problem in detail by exploring a large number of different feature sets and algorithms on several datasets. We also show that the inherent overlap between topics and the sparsity of the information in URLs makes this a very challenging problem. Web page classification without a page’s content is desirable when the content is not available at all, when a classification is needed before obtaining the content, or when classification speed is of utmost importance. For our experiments we used five different corpora comprising a total of about 3 million (URL, classification) pairs. We evaluated several techniques for feature generation and classification algorithms. The individual binary classifiers were then combined via boosting into metabinary classifiers. We achieve typical F-measure values between 80 and 85, and a typical precision of around 86. The precision can be pushed further over 90 while maintaining a typical level of recall between 30 and 40.","lang":"eng"}]},{"abstract":[{"lang":"eng","text":"We consider the problem of sampling URLs uniformly at random from the Web. A tool for sampling URLs uniformly can be used to estimate various properties of Web pages, such as the fraction of pages in various Internet domains or written in various languages. Moreover, uniform URL sampling can be used to determine the sizes of various search engines relative to the entire Web. In this paper, we consider sampling approaches based on random walks of the Web graph. In particular, we suggest ways of improving sampling based on random walks to make the samples closer to uniform. We suggest a natural test bed based on random graphs for testing the effectiveness of our procedures. We then use our sampling approach to estimate the distribution of pages over various Internet domains and to estimate the coverage of various search engine indexes."}],"author":[{"first_name":"Monika H","last_name":"Henzinger","full_name":"Henzinger, Monika H","orcid":"0000-0002-5008-6530","id":"540c9bbd-f2de-11ec-812d-d04a5be85630"},{"last_name":"Heydon","full_name":"Heydon, Allan","first_name":"Allan"},{"last_name":"Mitzenmacher","full_name":"Mitzenmacher, Michael","first_name":"Michael"},{"first_name":"Marc","full_name":"Najork, Marc","last_name":"Najork"}],"keyword":["URL sampling","Random walks","Internet domain distribution","Search engine size"],"publication_status":"published","citation":{"ieee":"M. H. Henzinger, A. Heydon, M. Mitzenmacher, and M. Najork, “On near-uniform URL sampling,” <i>Computer Networks</i>, vol. 33, no. 1–6. Elsevier, pp. 295–308, 2000.","apa":"Henzinger, M. H., Heydon, A., Mitzenmacher, M., &#38; Najork, M. (2000). On near-uniform URL sampling. <i>Computer Networks</i>. Elsevier. <a href=\"https://doi.org/10.1016/s1389-1286(00)00055-4\">https://doi.org/10.1016/s1389-1286(00)00055-4</a>","chicago":"Henzinger, Monika H, Allan Heydon, Michael Mitzenmacher, and Marc Najork. “On Near-Uniform URL Sampling.” <i>Computer Networks</i>. Elsevier, 2000. <a href=\"https://doi.org/10.1016/s1389-1286(00)00055-4\">https://doi.org/10.1016/s1389-1286(00)00055-4</a>.","ama":"Henzinger MH, Heydon A, Mitzenmacher M, Najork M. On near-uniform URL sampling. <i>Computer Networks</i>. 2000;33(1-6):295-308. doi:<a href=\"https://doi.org/10.1016/s1389-1286(00)00055-4\">10.1016/s1389-1286(00)00055-4</a>","mla":"Henzinger, Monika H., et al. “On Near-Uniform URL Sampling.” <i>Computer Networks</i>, vol. 33, no. 1–6, Elsevier, 2000, pp. 295–308, doi:<a href=\"https://doi.org/10.1016/s1389-1286(00)00055-4\">10.1016/s1389-1286(00)00055-4</a>.","short":"M.H. Henzinger, A. Heydon, M. Mitzenmacher, M. Najork, Computer Networks 33 (2000) 295–308.","ista":"Henzinger MH, Heydon A, Mitzenmacher M, Najork M. 2000. On near-uniform URL sampling. Computer Networks. 33(1–6), 295–308."},"_id":"11685","extern":"1","publication_identifier":{"issn":["1389-1286"]},"user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","oa_version":"None","quality_controlled":"1","volume":33,"date_updated":"2022-09-12T09:09:13Z","article_processing_charge":"No","title":"On near-uniform URL sampling","doi":"10.1016/s1389-1286(00)00055-4","year":"2000","intvolume":"        33","status":"public","day":"01","type":"journal_article","issue":"1-6","publication":"Computer Networks","page":"295-308","publisher":"Elsevier","scopus_import":"1","language":[{"iso":"eng"}],"month":"06","date_published":"2000-06-01T00:00:00Z","article_type":"original","date_created":"2022-07-28T15:11:53Z"}]
