[{"type":"journal_article","language":[{"iso":"eng"}],"doi":"10.1145/1993053.1993057","publisher":"Association for Computing Machinery","year":"2011","date_published":"2011-07-01T00:00:00Z","author":[{"last_name":"Baykan","full_name":"Baykan, Eda","first_name":"Eda"},{"first_name":"Monika H","full_name":"Henzinger, Monika H","orcid":"0000-0002-5008-6530","last_name":"Henzinger","id":"540c9bbd-f2de-11ec-812d-d04a5be85630"},{"last_name":"Marian","first_name":"Ludmila","full_name":"Marian, Ludmila"},{"last_name":"Weber","full_name":"Weber, Ingmar","first_name":"Ingmar"}],"_id":"11673","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","article_processing_charge":"No","extern":"1","keyword":["Topic classification","URL","ODP"],"intvolume":"         5","date_created":"2022-07-27T13:48:11Z","article_number":"15","volume":5,"article_type":"original","month":"07","quality_controlled":"1","publication_identifier":{"issn":["1559-1131"],"eissn":["1559-114X"]},"status":"public","date_updated":"2022-09-12T08:46:56Z","oa_version":"None","publication_status":"published","day":"01","abstract":[{"lang":"eng","text":"Given only the URL of a Web page, can we identify its topic? We study this problem in detail by exploring a large number of different feature sets and algorithms on several datasets. We also show that the inherent overlap between topics and the sparsity of the information in URLs makes this a very challenging problem. Web page classification without a page’s content is desirable when the content is not available at all, when a classification is needed before obtaining the content, or when classification speed is of utmost importance. For our experiments we used five different corpora comprising a total of about 3 million (URL, classification) pairs. We evaluated several techniques for feature generation and classification algorithms. The individual binary classifiers were then combined via boosting into metabinary classifiers. We achieve typical F-measure values between 80 and 85, and a typical precision of around 86. The precision can be pushed further over 90 while maintaining a typical level of recall between 30 and 40."}],"issue":"3","scopus_import":"1","citation":{"chicago":"Baykan, Eda, Monika H Henzinger, Ludmila Marian, and Ingmar Weber. “A Comprehensive Study of Features and Algorithms for URL-Based Topic Classification.” <i>ACM Transactions on the Web</i>. Association for Computing Machinery, 2011. <a href=\"https://doi.org/10.1145/1993053.1993057\">https://doi.org/10.1145/1993053.1993057</a>.","ieee":"E. Baykan, M. H. Henzinger, L. Marian, and I. Weber, “A comprehensive study of features and algorithms for URL-based topic classification,” <i>ACM Transactions on the Web</i>, vol. 5, no. 3. Association for Computing Machinery, 2011.","short":"E. Baykan, M.H. Henzinger, L. Marian, I. Weber, ACM Transactions on the Web 5 (2011).","mla":"Baykan, Eda, et al. “A Comprehensive Study of Features and Algorithms for URL-Based Topic Classification.” <i>ACM Transactions on the Web</i>, vol. 5, no. 3, 15, Association for Computing Machinery, 2011, doi:<a href=\"https://doi.org/10.1145/1993053.1993057\">10.1145/1993053.1993057</a>.","apa":"Baykan, E., Henzinger, M. H., Marian, L., &#38; Weber, I. (2011). A comprehensive study of features and algorithms for URL-based topic classification. <i>ACM Transactions on the Web</i>. Association for Computing Machinery. <a href=\"https://doi.org/10.1145/1993053.1993057\">https://doi.org/10.1145/1993053.1993057</a>","ista":"Baykan E, Henzinger MH, Marian L, Weber I. 2011. A comprehensive study of features and algorithms for URL-based topic classification. ACM Transactions on the Web. 5(3), 15.","ama":"Baykan E, Henzinger MH, Marian L, Weber I. A comprehensive study of features and algorithms for URL-based topic classification. <i>ACM Transactions on the Web</i>. 2011;5(3). doi:<a href=\"https://doi.org/10.1145/1993053.1993057\">10.1145/1993053.1993057</a>"},"publication":"ACM Transactions on the Web","title":"A comprehensive study of features and algorithms for URL-based topic classification"}]
