diff --git a/ed1n/MapReduce-algorithms.bib b/ed1n/MapReduce-algorithms.bib index 377ec0b..edb15d6 100644 --- a/ed1n/MapReduce-algorithms.bib +++ b/ed1n/MapReduce-algorithms.bib @@ -1,1532 +1,1532 @@ -@INPROCEEDINGS{WangYi_etal_2009, - author = "Yi Wang and Hongjie Bai and Matt Stanton and Wen-Yen Chen and Edward Y. Chang", - title = "{PLDA}: Parallel Latent {Dirichlet} Allocation for Large-Scale Applications", - booktitle = "Proceedings of the Fifth International Conference on Algorithmic Aspects in Information and Management (AAIM 2009)", - address = "San Francisco, California", - year = 2009, - pages = "301--314", -} - -@INPROCEEDINGS{ChuCT_etal_2006, - author = "Cheng-Tao Chu and Sang Kyun Kim and Yi-An Lin and YuanYuan Yu and Gary Bradski and Andrew Ng and Kunle Olukotun", - title = "{Map-Reduce} for Machine Learning on Multicore", - booktitle = "Advances in Neural Information Processing Systems 19 (NIPS 2006)", - year = 2006, - address = "Vancouver, British Columbia, Canada", - pages = "281--288", -} - -@book{Owen_2010, - author={Sean Owen and Robin Anil}, - title={Mahout in Action}, - publisher={Manning Publications Co.}, - year=2010, - address={Greenwich, Connecticut} -} - -@inproceedings{Asuncion_2008, - author = {Arthur Asuncion and Padhraic Smyth and Max Welling}, - title = {Asynchronous Distributed Learning of Topic Models}, - booktitle = "Advances in Neural Information Processing Systems 21 (NIPS 2008)", - year = 2008, - address = "Vancouver, British Columbia, Canada", - pages = {81--88} -} - -@incollection{Bottou_2004, - author = {Bottou, L\'{e}on}, - title = {Stochastic Learning}, - booktitle = {Advanced Lectures on Machine Learning}, - pages = {146-168}, - publisher = {Springer Verlag}, - year = {2004}, - editor = {Bousquet, Olivier and von Luxburg, Ulrike}, - series = {Lecture Notes in Artificial Intelligence, LNAI~3176}, - address = {Berlin}, - url = {http://leon.bottou.org/papers/bottou-mlss-2004}, -} - -@INPROCEEDINGS{Feigenbaum_2004, - author = 
{Joan Feigenbaum and Sampath Kannan and Andrew Mcgregor and Siddharth Suri and Jian Zhang}, - title = {On Graph Problems in a Semi-Streaming Model}, - booktitle = {31st International Colloquium on Automata, Languages and Programming}, - year = {2004}, - pages = {531--543} -} - -@INPROCEEDINGS{Taura_2003, - author = {Kenjiro Taura and Toshio Endo and Kenji Kaneda and Akinori Yonezawa}, - title = {Phoenix: a Parallel Programming Model for Accommodating Dynamically Joining Resources}, - booktitle = {Proceedings of the Ninth ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming}, - year = {2003}, - pages = {216--229}, - publisher = {ACM} -} - -@InProceedings{Levenberg_2009, - author = {Levenberg, Abby and Osborne, Miles}, - title = {Stream-based Randomised Language Models for {SMT}}, - booktitle = {Proceedings of the 2009 Conference on Empirical Methods in Natural Language Processing}, - year = {2009}, - address = {Singapore}, - pages = {756--764}, -} - -@inproceedings{Levenberg_2010, - Address = "Los Angeles, California", - Author = "Abby Levenberg and Chris Callison-Burch and Miles Osborne", - booktitle = "Proceedings of the 11th Annual Conference of the North American Chapter of the Association for Computational Linguistics (NAACL HLT 2010)", - Title = {Stream-based Translation Models for Statistical Machine Translation}, - Year = 2010 -} - -@inproceedings{Petrovic_2010, - Address = "Los Angeles, California", - Author = "Sasa Petrovic and Miles Osborne and Victor Lavrenko", - booktitle = "Proceedings of the 11th Annual Conference of the North American Chapter of the Association for Computational Linguistics (NAACL HLT 2010)", - Title = {Streaming First Story Detection with application to {Twitter}}, - Year = 2010 -} - -Sasa Petrovic, Miles Osborne and Victor Lavrenko. Streaming First Story Detection with application to Twitter. NAACL, Los Angeles, USA. June 2010. 
- -@inproceedings{Alon_1996, - author = {Alon, Noga and Matias, Yossi and Szegedy, Mario}, - title = {The space complexity of approximating the frequency moments}, - booktitle = {Proceedings of the 28th Annual ACM Symposium on Theory of Computing (STOC '96)}, - year = {1996}, - isbn = {0-89791-785-5}, - pages = {20--29}, - address = {Philadelphia, Pennsylvania}, -} - -@UNPUBLISHED{Smith_2004, - author = "Noah Smith", - title = {Log-linear models}, - note = "http://www.cs.cmu.edu/~nasmith/papers/smith.tut04.pdf", - year = 2004 - } - -@article{Brown_1993, - author = {Brown, Peter F. and Della Pietra, Vincent J. and Della Pietra, Stephen A. and Mercer, Robert L. }, - citeulike-article-id = {1286336}, - journal = {Computational Linguistics}, - number = {2}, - pages = {263--311}, - publisher = {MIT Press}, - title = {The Mathematics of Statistical Machine Translation: Parameter Estimation}, - volume = {19}, - year = {1993} -} - - -@BOOK{Brants_2010, - author = "Thorsten Brants and Peng Xu", - title = "Distributed Language Models ", - publisher = "Morgan \& Claypool Publishers", - year = 2010, -} - -@inproceedings{Callison_Burch_2009, - author = {Callison-Burch, Chris and Koehn, Philipp and Monz, Christof and Schroeder, Josh}, - title = {Findings of the 2009 workshop on statistical machine translation}, - booktitle = {Proceedings of the Fourth Workshop on Statistical Machine Translation (StatMT '09)}, - year = {2009}, - pages = {1--28}, - address = {Athens, Greece}, - } - -@article{Gao_2010, - Author={Qin Gao and Stephan Vogel}, - title={Training phrase-based machine translation models on the cloud: Open source machine translation toolkit {Chaski}}, - journal={The Prague Bulletin of Mathematical Linguistics}, - Volume={93}, - year=2010, - pages={37--46} -} - -@inproceedings{Koehn_2003, - Address = "Edmonton, Alberta, Canada", - Author = "Philipp Koehn and Franz J. 
Och and Daniel Marcu", - booktitle = "Proceedings of the 2003 Human Language Technology Conference of the North American Chapter of the Association for Computational Linguistics (HLT/NAACL 2003)", - Doi = {http://dx.doi.org/10.3115/1073445.1073462}, - Pages = {48--54}, - Title = {Statistical phrase-based translation}, - Year = {2003}} - - -@inproceedings{Sha_2003, - author = {Sha, Fei and Pereira, Fernando}, - pages = {134--141}, - title = {Shallow parsing with conditional random fields}, - booktitle = "Proceedings of the 2003 Human Language Technology Conference of the North American Chapter of the Association for Computational Linguistics (\mbox{HLT/NAACL} 2003)", - address = "Edmonton, Alberta, Canada", - year = 2003, -} - -@inproceedings{Malouf_2002, - author = {Malouf, Robert}, - title = {A comparison of algorithms for maximum entropy parameter estimation}, - booktitle = "Proceedings of the Sixth Conference on Natural Language Learning (CoNLL-2002)", - year = {2002}, - pages = {49--55}, - address = "Taipei, Taiwan" - } - -@ARTICLE{LBFGS, - author = {Dong C. Liu and Jorge Nocedal and Dong C. Liu and Jorge Nocedal}, - title = {On the limited memory {B}{F}{G}{S} method for large scale optimization}, - journal = {Mathematical Programming B}, - year = {1989}, - volume = {45}, - number = {3}, - pages = {503--528} -} - - -@inproceedings{Lafferty_2001, - author = {Lafferty, John D. 
and McCallum, Andrew and Pereira, Fernando}, - title = "Conditional Random Fields: Probabilistic Models for Segmenting and Labeling Sequence Data", - booktitle = {Proceedings of the Eighteenth International Conference on Machine Learning (ICML '01)}, - year = {2001}, - isbn = {1-55860-778-1}, - pages = {282--289}, - address = {San Francisco, California}, - } - -@conference{Nigam_1999, - title={{Using maximum entropy for text classification}}, - author={Kamal Nigam and John Lafferty and Andrew McCallum}, - booktitle={Proceedings of the IJCAI-99 Workshop on Machine Learning for Information Filtering}, - pages={61--67}, - year={1999}, - address ="Stockholm, Sweden" -} - -@inproceedings{Liang_2009, - title = {Online {EM} for Unsupervised Models}, - author = {Percy Liang and Dan Klein}, - booktitle = {North American Association for Computational Linguistics (NAACL)}, - year = {2009}, -} - -@article{Och_2003, - Author = {Franz J. Och and Hermann Ney}, - Journal = {Computational Linguistics}, - Number = {1}, - Pages = {19--51}, - Title = {A systematic comparison of various statistical alignment models}, - Volume = {29}, - Year = {2003} -} - -@conference{Seymore_1999, - title={Learning hidden {Markov} model structure for information extraction}, - author="Kristie Seymore and Andrew Mccallum and Ronald Rosenfeld", - booktitle={Proceedings of the \mbox{AAAI-99} Workshop on Machine Learning for Information Extraction}, - pages={37--42}, - year={1999}, - address = "Orlando, Florida" -} - -@book{Koehn_2009, - title={Statistical Machine Translation}, - author={Philipp Koehn}, - publisher = "Cambridge University Press", - address = "Cambridge, England", - year={2010}, -} - -@article{Lopez_2008, - author={Adam Lopez}, - title={Statistical Machine Translation}, - journal={ACM Computing Surveys}, - volume={40}, - number={3}, - article={8}, - pages={1--49}, - year=2008 -} - -@book{Ross_1996, - title={Stochastic processes}, - author={Ross, Sheldon M.}, - year={1996}, - 
publisher={Wiley}, - address={New York} -} - -@inproceedings{Cutting_1992, - author = {Cutting, Doug and Kupiec, Julian and Pedersen, Jan and Sibun, Penelope}, - title = {A practical part-of-speech tagger}, - booktitle = {Proceedings of the Third Conference on Applied Natural Language Processing}, - year = {1992}, - pages = {133--140}, - address = {Trento, Italy}, -} - -@article{Stanke_2003, - address = {Institut f\"{u}r Mikrobiologie und Genetik, Abteilung Bioinformatik, Universit\"{a}t G\"{o}ttingen, G\"{o}ttingen, Germany. mstanke@gwdg.de}, - author = {Stanke, Mario and Waack, Stephan}, - citeulike-article-id = {2002786}, - citeulike-linkout-0 = {http://view.ncbi.nlm.nih.gov/pubmed/14534192}, - citeulike-linkout-1 = {http://www.hubmed.org/display.cgi?uids=14534192}, - issn = {1367-4811}, - journal = {Bioinformatics}, - keywords = {algorithm, gene-prediction}, - month = {October}, - posted-at = {2007-11-28 10:30:57}, - priority = {2}, - title = {Gene prediction with a hidden {Markov} model and a new intron submodel}, - url = {http://view.ncbi.nlm.nih.gov/pubmed/14534192}, - volume = {19 Suppl 2}, - year = {2003}, - pages = "ii215--225", -} - -@INPROCEEDINGS{Abouzeid_etal_VLDB2009, - author = "Azza Abouzeid and Kamil Bajda-Pawlikowski and Daniel Abadi and Avi Silberschatz and Alexander Rasin", - title = "{HadoopDB}: An Architectural Hybrid of {MapReduce} and {DBMS} Technologies for Analytical Workloads", - booktitle = "Proceedings of the 35th International Conference on Very Large Data Base (VLDB 2009)", - address = "Lyon, France", - year = 2009, - pages = "922--933", -} - -@INPROCEEDINGS{Amdahl_1967, - author = "Gene Amdahl", - title = "Validity of the Single Processor Approach to Achieving Large-Scale Computing Capabilities", - booktitle = "Proceedings of the AFIPS Spring Joint Computer Conference", - year = 1967, - pages = "483--485", -} - -@INPROCEEDINGS{Ananthanarayanan_etal_2009, - author = "Rajagopal Ananthanarayanan and Karan Gupta and Prashant Pandey and 
Himabindu Pucha and Prasenjit Sarkar and Mansi Shah and Renu Tewari", - title = "Cloud Analytics: Do We \emph{Really} Need to Reinvent the Storage Stack?", - booktitle = "Proceedings of the 2009 Workshop on Hot Topics in Cloud Computing (HotCloud 09)", - address = "San Diego, California", - year = 2009, -} - -@TECHREPORT{Armbrust_etal_2009, - author = "Michael Armbrust and Armando Fox and Rean Griffith and Anthony D. Joseph and Randy H. Katz and Andrew Konwinski and Gunho Lee and David A. Patterson and Ariel Rabkin and Ion Stoica and Matei Zaharia", - title = "Above the Clouds: A {Berkeley} View of Cloud Computing", - number = "UCB/EECS-2009-28", - institution = "Electrical Engineering and Computer Sciences, University of California at Berkeley", - year = 2009, -} - -@INPROCEEDINGS{asuncion08asynchronous, - title = {Asynchronous Distributed Learning of Topic Models.}, - author = {Arthur Asuncion and Padhraic Smyth and Max Welling}, - booktitle = {NIPS}, - publisher = {MIT Press}, - year = {2008} -} - -@article{Dempster_Laird_Rubin_1977, - abstract = {A broadly applicable algorithm for computing maximum likelihood estimates from incomplete data is presented at various levels of generality. Theory showing the monotone behaviour of the likelihood and convergence of the algorithm is derived. Many examples are sketched, including missing value situations, applications to grouped, censored or truncated data, finite mixture models, variance component estimation, hyperparameter estimation, iteratively reweighted least squares and factor analysis.}, - author = {Dempster, Arthur P. and Laird, Nan M. and Rubin, Donald B.}, - doi = {10.2307/2984875}, - issn = {00359246}, - journal = {Journal of the Royal Statistical Society. 
Series B (Methodological)}, - number = {1}, - pages = {1--38}, - publisher = {Blackwell Publishing for the Royal Statistical Society}, - title = {Maximum Likelihood from Incomplete Data via the {EM} Algorithm}, - volume = {39}, - year = {1977} -} - -@book{Jelinek_1997, - author = {Jelinek, Frederick}, - title = {Statistical methods for speech recognition}, - year = {1997}, - isbn = {0-262-10066-5}, - publisher = {MIT Press}, - address = {Cambridge, Massachusetts}, -} - -@conference{Vogel_1996, - title={{HMM-based word alignment in statistical translation}}, - author={Vogel, Stephan and Ney, Hermann and Tillmann, Christoph}, - booktitle = "Proceedings of the 16th International Conference on Computational Linguistics (COLING 1996)", - pages={836--841}, - year={1996}, - address = "Copenhagen, Denmark" -} - -@conference{Hassan_2005, - title={Stock Market Forecasting Using Hidden {Markov} Models: A New Approach}, - author={Hassan, Md. Rafiul and Nath, Baikunth}, - booktitle={Proceedings of the 5th International Conference on Intelligent Systems Design and Applications (ISDA '05)}, - pages={192--196}, - year={2005}, - address = "Wroclaw, Poland" -} - -@INCOLLECTION{Rabiner_1990, - author = {Rabiner, Lawrence R.}, - title = {A tutorial on hidden {Markov} models and selected applications in speech recognition}, - booktitle = {Readings in Speech Recognition}, - year = {1990}, - isbn = {1-55860-124-4}, - pages = {267--296}, - publisher = {Morgan Kaufmann Publishers}, - address = {San Francisco, California}, - } - - ------- - -@ARTICLE{Albert_Barabasi_2002, - author = "{R\'{e}ka} Albert and {Albert-L\'{a}szl\'{o}} {Barab\'{a}si}", - title = "Statistical Mechanics of Complex Networks", - journal = "Reviews of Modern Physics", - volume = 74, - pages = "47--97", - year = 2002, -} - -@TECHREPORT{Alvaro_etal_2009, - author = "Peter Alvaro and Tyson Condie and Neil Conway and Khaled Elmeleegy and Joseph M. Hellerstein and Russell C. 
Sears", - title = "{BOOM}: Data-Centric Programming in the Datacenter", - number = "UCB/EECS-2009-98", - institution = "Electrical Engineering and Computer Sciences, University of California at Berkeley", - year = 2009, -} - -@INPROCEEDINGS{Anderson_etal_SOSP1995, - author = "Thomas Anderson and Michael Dahlin and Jeanna Neefe and David Patterson and Drew Roselli and Randolph Wang", - title = "Serverless Network File Systems", - booktitle = "Proceedings of the 15th ACM Symposium on Operating Systems Principles (SOSP 1995)", - year = 1995, - pages = "109--126", - address = "Copper Mountain Resort, Colorado", -} - -@ARTICLE{Anh_Moffat_2005, - author = "Vo Ngoc Anh and Alistair Moffat", - title = "Inverted index compression using word-aligned binary codes", - journal = "Information Retrieval", - volume = 8, - number = 1, - year = 2005, - pages = "151--166", -} - -@INPROCEEDINGS{Baeza-Yates_etal_2005, - author = "Ricardo Baeza-Yates and Carlos Castillo and Vicente {L\'{o}pez}", - title = "{PageRank} Increase under Different Collusion Topologies", - booktitle = "Proceedings of the First International Workshop on Adversarial Information Retrieval on the Web (AIRWeb 2005)", - year = 2005, - address = "Chiba, Japan", - pages = "17--24", -} - -@INPROCEEDINGS{Baeza-Yates_etal_SIGIR2007, - author = "Ricardo Baeza-Yates and Aristides Gionis and Flavio Junqueira and Vanessa Murdock and Vassilis Plachouras and Fabrizio Silvestri", - title = "The Impact of Caching on Search Engines", - booktitle = "Proceedings of the 30th Annual International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR 2007)", - address = "Amsterdam, The Netherlands", - year = 2007, - pages = "183--190", -} - -@INPROCEEDINGS{Baeza-Yates_etal_2007, - author = "Ricardo Baeza-Yates and Carlos Castillo and Flavio Junqueira and Vassilis Plachouras and Fabrizio Silvestri", - title = "Challenges on Distributed Web Retrieval", - booktitle = "Proceedings of the IEEE 23rd International 
Conference on Data Engineering (ICDE 2007)", - address = "Istanbul, Turkey", - year = 2007, - pages = "6--20", -} - -@INPROCEEDINGS{Banko01, - author = "Michele Banko and Eric Brill", - title = "Scaling to Very Very Large Corpora for Natural Language Disambiguation", - booktitle = "Proceedings of the 39th Annual Meeting of the Association for Computational Linguistics (ACL 2001)", - year = 2001, - pages = "26--33", - address = "Toulouse, France", -} - -@INPROCEEDINGS{Barham_etal_2003, - author = "Paul Barham and Boris Dragovic and Keir Fraser and Steven Hand and Tim Harris and Alex Ho and Rolf Neugebauer and Ian Pratt and Andrew Warfield", - title = "Xen and the Art of Virtualization", - booktitle = "Proceedings of the 19th ACM Symposium on Operating Systems Principles (SOSP 2003)", - year = 2003, - address = "Bolton Landing, New York", - pages = "164--177", -} - -@ARTICLE{Barroso03, - author = "Luiz {Andr\'{e}} Barroso and Jeffrey Dean and Urs {H\"{o}lzle}", - title = "Web Search for a Planet: The {Google} Cluster Architecture", - journal = "IEEE Micro", - volume = 23, - number = 2, - pages = "22--28", - year = 2003, -} - -@ARTICLE{Barroso_Holzle_2007, - author = "Luiz {Andr\'{e}} Barroso and Urs {H\"{o}lzle}", - title = "The Case for Energy-Proportional Computing", - journal = "Computer", - volume = 40, - number = 12, - pages = "33--37", - year = 2007, -} - -@BOOK{Barroso_Holzle_2009, - author = "Luiz {Andr\'{e}} Barroso and Urs {H\"{o}lzle}", - title = "The Datacenter as a Computer: An Introduction to the Design of Warehouse-Scale Machines", - publisher = "Morgan \& Claypool Publishers", - year = 2009, -} - -@INPROCEEDINGS{Becla_Wang_2005, - author = "Jacek Becla and Daniel L. 
Wang", - title = "Lessons Learned from Managing a Petabyte", - booktitle = "Proceedings of the Second Biennial Conference on Innovative Data Systems Research (CIDR 2005)", - year = 2005, - address = "Asilomar, California", -} - -@TECHREPORT{Becla_etal_2006, - author = "Jacek Becla and Andrew Hanushevsky and Sergei Nikolaev and Ghaleb Abdulla and Alex Szalay and Maria Nieto-Santisteban and Ani Thakar and Jim Gray", - title = "Designing a Multi-petabyte Database for {LSST}", - type = "SLAC Publications", - number = "SLAC-PUB-12292", - institution = "Stanford Linear Accelerator Center", - year = 2006, - month = "May", -} - -@ARTICLE{Bell_etal_2009, - author = "Gordon Bell and Tony Hey and Alex Szalay", - title = "Beyond the Data Deluge", - journal = "Science", - volume = 323, - number = 5919, - pages = "1297--1298", - year = 2009, -} - -@ARTICLE{Bianchini_etal_2005, - author = "Monica Bianchini and Marco Gori and Franco Scarselli", - title = "Inside {PageRank}", - journal = "ACM Transactions on Internet Technology", - volume = 5, - number = 1, - pages = "92--128", - year = 2005, -} - -@BOOK{Borges_1999, - author = "Jorge Luis Borges", - title = "Collected Fictions (translated by {Andrew} {Hurley})", - publisher = "Penguin", - year = 1999, -} - -@INPROCEEDINGS{Brants_etal_EMNLP2007, - author = "Thorsten Brants and Ashok C. Popat and Peng Xu and Franz J. 
Och and Jeffrey Dean", - title = "Large Language Models in Machine Translation", - booktitle = "Proceedings of the 2007 Joint Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning", - year = 2007, - pages = "858--867", - address = "Prague, Czech Republic", -} - -@INPROCEEDINGS{Brill_etal_TREC2001, - author = "Eric Brill and Jimmy Lin and Michele Banko and Susan Dumais and Andrew Ng", - title = "Data-Intensive Question Answering", - booktitle = "Proceedings of the Tenth Text REtrieval Conference (TREC 2001)", - year = 2001, - pages = "393--400", - address = "Gaithersburg, Maryland", -} - -@BOOK{Brooks_1995, - author = "Frederick P. Brooks", - title = "The Mythical Man-Month: Essays on Software Engineering, Anniversary Edition", - publisher = "Addison-Wesley", - address = "Reading, Massachusetts", - year = 1995, -} - -@BOOK{Buttcher_etal_2010, - author = "Stefan {B\"{u}ttcher} and Charles L. A. Clarke and Gordon V. Cormack", - title = "Information Retrieval: Implementing and Evaluating Search Engines", - publisher = "MIT Press", - address = "Cambridge, Massachusetts", - year = 2010, -} - -@ARTICLE{Buyya_etal_2009, - author = "Rajkumar Buyya and Chee Shin Yeo and Srikumar Venugopal and James Broberg and Ivona Brandic", - title = "Cloud Computing and Emerging {IT} Platforms: Vision, Hype, and Reality for Delivering Computing as the 5th Utility", - journal = "Future Generation Computer Systems", - volume = 25, - number = 6, - pages = "599--616", - year = 2009, -} - -@ARTICLE{Cabrera_Long_1991, - author = "Luis-Felipe Cabrera and Darrell D. E. Long", - title = "{Swift}: Using Distributed Disk Striping to Provide High {I/O} Data Rates", - journal = "Computer Systems", - volume = 4, - number = 4, - pages = "405--436", - year = 1991, -} - -@INPROCEEDINGS{ChangFay_etal_OSDI2006, - author = "Fay Chang and Jeffrey Dean and Sanjay Ghemawat and Wilson C. Hsieh and Deborah A. 
Wallach and Michael Burrows and Tushar Chandra and Andrew Fikes and Robert Gruber", - title = "{Bigtable}: A Distributed Storage System for Structured Data", - booktitle = "Proceedings of the 7th Symposium on Operating System Design and Implementation (OSDI 2006)", - address = "Seattle, Washington", - year = 2006, - pages = "205--218", -} - -@INPROCEEDINGS{Chen_Goodman_ACL1996, - author = "Stanley F. Chen and Joshua Goodman", - title = "An Empirical Study of Smoothing Techniques for Language Modeling", - booktitle = "Proceedings of the 34th Annual Meeting of the Association for Computational Linguistics (ACL 1996)", - year = 1996, - pages = "310--318", - address = "Santa Cruz, California", -} - -@ARTICLE{Church_Hanks_1990, - author = "Kenneth W. Church and Patrick Hanks", - title = "Word Association Norms, Mutual Information, and Lexicography", - journal = "Computational Linguistics", - volume = 16, - number = 1, - pages = "22--29", - year = 1990, -} - -@ARTICLE{CohenJonathan_2009, - author = "Jonathan Cohen", - title = "Graph Twiddling in a {MapReduce} World", - journal = "Computing in Science and Engineering", - volume = 11, - number = 4, - pages = "29--41", - year = 2009, -} - -@INPROCEEDINGS{CooperBrian_etal_2010, - author = "Brian F. Cooper and Adam Silberstein and Erwin Tam and Raghu Ramakrishnan and Russell Sears", - title = "Benchmarking Cloud Serving Systems with {YCSB}", - booktitle = "Proceedings of the First ACM Symposium on Cloud Computing (ACM SOCC 2010)", - address = "Indianapolis, Indiana", - year = 2010, -} - -@BOOK{CLR, - author = "Thomas H. Cormen and Charles E. Leiserson and Ronald L. Rivest", - title = "Introduction to Algorithms", - publisher = "MIT Press", - address = "Cambridge, Massachusetts", - year = 1990, -} - -@BOOK{Croft_etal_2009, - author = "W. 
Bruce Croft and Donald Meztler and Trevor Strohman", - title = "Search Engines: Information Retrieval in Practice", - publisher = "Addison-Wesley", - address = "Reading, Massachusetts", - year = 2009, -} - -@ARTICLE{Culler_etal_1993, - author = "David Culler and Richard Karp and David Patterson and Abhijit Sahay and Klaus Erik Schauser and Eunice Santos and Ramesh Subramonian and Thorsten von Eicken", - title = "{LogP}: Towards a Realistic Model of Parallel Computation", - journal = "ACM SIGPLAN Notices", - volume = 28, - number = 7, - year = 1993, - pages = "1--12", -} - -@INPROCEEDINGS{Dean_Ghemawat_OSDI2004, - author = "Jeffrey Dean and Sanjay Ghemawat", - title = "{MapReduce}: Simplified Data Processing on Large Clusters", - booktitle = "Proceedings of the 6th Symposium on Operating System Design and Implementation (OSDI 2004)", - address = "San Francisco, California", - year = 2004, - pages = "137--150", -} - -@ARTICLE{Dean_Ghemawat_CACM2008, - author = "Jeffrey Dean and Sanjay Ghemawat", - title = "{MapReduce}: Simplified Data Processing on Large Clusters", - journal = "Communications of the ACM", - volume = 51, - number = 1, - pages = "107--113", - year = 2008, -} - -@ARTICLE{Dean_Ghemawat_CACM2010, - author = "Jeffrey Dean and Sanjay Ghemawat", - title = "{MapReduce}: A Flexible Data Processing Tool", - journal = "Communications of the ACM", - volume = 53, - number = 1, - pages = "72--77", - year = 2010, -} - -@INPROCEEDINGS{DeCandia_etal_2007, - author = "Giuseppe DeCandia and Deniz Hastorun and Madan Jampani and Gunavardhan Kakulapati and Avinash Lakshman and Alex Pilchin and Swami Sivasubramanian and Peter Vosshall and Werner Vogels", - title = "Dynamo: {Amazon's} Highly Available Key-Value Store", - booktitle = "Proceedings of the 21st ACM Symposium on Operating Systems Principles (SOSP 2007)", - year = 2007, - address = "Stevenson, Washington", - pages = "205--220", -} - -@ARTICLE{DeWitt_etal_1984, - author = "David J. DeWitt and Randy H. 
Katz and Frank Olken and Leonard D. Shapiro and Michael R. Stonebraker and David Wood", - title = "Implementation Techniques for Main Memory Database Systems", - journal = "ACM SIGMOD Record", - volume = 14, - number = 2, - pages = "1--8", - year = 1984, -} - -@ARTICLE{DeWitt_Gray_CACM1992, - author = "David J. DeWitt and Jim Gray", - title = "Parallel Database Systems: The Future of High Performance Database Systems", - journal = "Communications of the ACM", - volume = 35, - number = 6, - pages = "85--98", - year = 1992, -} - -@ARTICLE{Dredze_etal_2009, - author = "Mark Dredze and Alex Kulesza and Koby Crammer", - title = "Multi-Domain Learning by Confidence-Weighted Parameter Combination", - journal = "Machine Learning", - volume = 79, - numbers = "1--2", - pages = "123--149", - year = 2010, -} - -@INPROCEEDINGS{Dumais_etal_SIGIR2002, - author = "Susan Dumais and Michele Banko and Eric Brill and Jimmy Lin and Andrew Ng", - title = "{Web} Question Answering: {Is} More Always Better?", - booktitle = "Proceedings of the 25th Annual International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR 2002)", - year = 2002, - pages = "291--298", - address = "Tampere, Finland", -} - -@INPROCEEDINGS{Dyer_etal_2008, - author = "Chris Dyer and Aaron Cordova and Alex Mont and Jimmy Lin", - title = "Fast, Easy, and Cheap: Construction of Statistical Machine Translation Models with {MapReduce}", - booktitle = "Proceedings of the Third Workshop on Statistical Machine Translation at ACL 2008", - address = "Columbus, Ohio", - year = 2008, - pages = "199--207", -} - -@INCOLLECTION{Firth_1957, - author = "John R. 
Firth", - title = "A Synopsis of Linguistic Theory 1930--55", - booktitle = "Studies in Linguistic Analysis, Special Volume of the Philological Society", - pages = "1--32", - address = "Oxford", - publisher = "Blackwell", - year = 1957 -} - -@INPROCEEDINGS{Ghemawat_etal_SOSP2003, - author = "Sanjay Ghemawat and Howard Gobioff and Shun-Tak Leung", - title = "The {Google} {File} {System}", - booktitle = "Proceedings of the 19th ACM Symposium on Operating Systems Principles (SOSP 2003)", - year = 2003, - address = "Bolton Landing, New York", - pages = "29--43" -} - -@ARTICLE{Gilbert_Lynch_2002, - author = "Seth Gilbert and Nancy Lynch", - title = "{Brewer's} {Conjecture} and the Feasibility of Consistent, Available, Partition-Tolerant Web Services", - journal = "ACM SIGACT News", - volume = 33, - number = 2, - pages = "51--59", - year = 2002, -} - -@INPROCEEDINGS{Garcia-Molina_etal_2005, - author = "{Zolt\'{a}n} {Gy\"{o}ngyi} and Hector Garcia-Molina", - title = "Web Spam Taxonomy", - booktitle = "Proceedings of the First International Workshop on Adversarial Information Retrieval on the Web (AIRWeb 2005)", - year = 2005, - address = "Chiba, Japan", - pages = "39--47", -} - -@ARTICLE{Girvan02, - author = "Michelle Girvan and Mark E. J. Newman", - title = "Community Structure in Social and Biological Networks", - journal = "Proceedings of the National Academy of Science", - volume = 99, - number = 12, - pages = "7821--7826", - year = 2002, -} - -@BOOK{Grama_etal_2003, - author = "Ananth Grama and Anshul Gupta and George Karypis and Vipin Kumar", - title = "Introduction to Parallel Computing", - publisher = "Addison-Wesley", - address = "Reading, Massachusetts", - year = 2003, -} - -@ARTICLE{Granovetter73, - author = "Mark S. Granovetter", - title = "The Strength of Weak Ties", - journal = "The American Journal of Sociology", - volume = 78, - number = 6, - pages = "1360--1380", - year = 1973, -} - -@ARTICLE{Granovetter83, - author = "Mark S. 
Granovetter", - title = "The Strength of Weak Ties: A Network Theory Revisited", - journal = "Sociological Theory", - volume = 1, - pages = "201--233", - year = 1983, -} - -@BOOK{Hage_1996, - author = "Per Hage and Frank Harary", - title = "Island Networks: Communication, Kinship, and Classification Structures in {Oceania}", - publisher = "Cambridge University Press", - address = "Cambridge, England", - year = 1996 -} - -@ARTICLE{Halevy_etal_2009, - author = "Alon Halevy and Peter Norvig and Fernando Pereira", - title = "The Unreasonable Effectiveness of Data", - journal = "Communications of the ACM", - volume = 24, - number = 2, - pages = "8--12", - year = 2009, -} - -@INPROCEEDINGS{Hamilton_2007, - author = "James Hamilton", - title = "On Designing and Deploying {Internet}-Scale Services", - booktitle = "Proceedings of the 21st Large Installation System Administration Conference (LISA '07)", - year = 2007, - address = "Dallas, Texas", - pages = "233--244" -} - -@INPROCEEDINGS{Hamilton_2009, - author = "James Hamilton", - title = "{Cooperative} {Expendable} {Micro-Slice} {Servers} {(CEMS)}: Low Cost, Low Power Servers for {Internet}-Scale Services", - booktitle = "Proceedings of the Fourth Biennial Conference on Innovative Data Systems Research (CIDR 2009)", - year = 2009, - address = "Asilomar, California", -} - -@INCOLLECTION{Hammerbacher_2009, - title = "Information Platforms and the Rise of the Data Scientist", - author = "Jeff Hammerbacher", - editor = "Toby Segaran and Jeff Hammerbacher", - booktitle = "Beautiful Data", - publisher = "O'Reilly", - address = "Sebastopol, California", - year = 2009, - pages = "73--84", -} - -@BOOK{Harris_1968, - author = "Zelig S. Harris", - title = "Mathematical Structures of Language", - address = "New York", - publisher = "Wiley", - year = 1968 -} - -@INPROCEEDINGS{HeB_etal_2008, - author = "Bingsheng He and Wenbin Fang and Qiong Luo and Naga K. 
Govindaraju and Tuyong Wang", - title = "{Mars}: A {MapReduce} Framework on Graphics Processors", - booktitle = "Proceedings of the 17th International Conference on Parallel Architectures and Compilation Techniques (PACT 2008)", - year = 2008, - address = "Toronto, Ontario, Canada", - pages = "260--269", -} - -@BOOK{Hey_etal_2009, - author = "Tony Hey and Stewart Tansley and Kristin Tolle", - title = "The Fourth Paradigm: Data-Intensive Scientific Discovery", - publisher = "Microsoft Research", - address = "Redmond, Washington", - year = 2009, -} - -@INCOLLECTION{Hey_etal_2009-Gray, - title = "{Jim} {Gray} on {eScience}: A Transformed Scientific Method", - author = "Tony Hey and Stewart Tansley and Kristin Tolle", - editor = "Tony Hey and Stewart Tansley and Kristin Tolle", - booktitle = "The Fourth Paradigm: Data-Intensive Scientific Discovery", - publisher = "Microsoft Research", - address = "Redmond, Washington", - year = 2009, -} - -@ARTICLE{Howard_etal_1988, - author = "John Howard and Michael Kazar and Sherri Menees and David Nichols and Mahadev Satyanarayanan and Robert Sidebotham and Michael West", - title = "Scale and Performance in a Distributed File System", - journal = "ACM Transactions on Computer Systems", - volume = 6, - number = 1, - pages = "51--81", - year = 1988, -} - -@INPROCEEDINGS{Isard_etal_2007, - author = "Michael Isard and Mihai Budiu and Yuan Yu and Andrew Birrell and Dennis Fetterly", - title = "Dryad: Distributed Data-Parallel Programs from Sequential Building Blocks", - booktitle = "Proceedings of the ACM SIGOPS/EuroSys European Conference on Computer Systems 2007 (EuroSys 2007)", - address = "Lisbon, Portugal", - year = 2007, - pages = "59--72", -} - -@BOOK{JaJa_1992, - author = "Joseph JaJa", - title = "An Introduction to Parallel Algorithms", - publisher = "Addison-Wesley", - address = "Reading, Massachusetts", - year = 1992, -} - -@ARTICLE{JacobsAdam_2009, - author = "Adam Jacobs", - title = "The Pathologies of Big Data", - journal 
= "ACM Queue", - volume = 7, - number = 6, - year = 2009, -} - -@BOOK{Jurafsky_Martin_2009, - author = "Daniel Jurafsky and James H. Martin", - title = "Speech and Language Processing", - publisher = "Pearson", - address = "Upper Saddle River, New Jersey", - year = 2009, -} - -@TECHREPORT{KangU_etal_2008, - author = "U Kang and Charalampos Tsourakakis and Ana Paula Appel and Christos Faloutsos and Jure Leskovec", - title = "{HADI}: Fast Diameter Estimation and Mining in Massive Graphs with {Hadoop}", - number = "CMU-ML-08-117", - institution = "School of Computer Science, Carnegie Mellon University", - year = 2008, -} - -@INPROCEEDINGS{KangU_etal_2009, - author = "U Kang and Charalampos E. Tsourakakis and Christos Faloutsos", - title = "{PEGASUS}: A Peta-Scale Graph Mining System---Implementation and Observations", - booktitle = "Proceedings of the 2009 Ninth IEEE International Conference on Data Mining (ICDM 2009)", - year = 2009, - address = "Miami, Floria", - pages = "229--238", -} - -@INPROCEEDINGS{Karloff_etal_2010, - author = "Howard Karloff and Siddharth Suri and Sergei Vassilvitskii", - title = "A Model of Computation for {MapReduce}", - booktitle = "Proceedings of the 21st Annual ACM-SIAM Symposium on Discrete Algorithms (SODA 2010)", - year = 2010, - address = "Austin, Texas", -} - -@INPROCEEDINGS{KimballA_etal_2008, - author = "Aaron Kimball and Sierra Michels-Slettvet and Christophe Bisciglia", - title = "Cluster Computing for {Web}-Scale Data Processing", - booktitle = "Proceedings of the 39th ACM Technical Symposium on Computer Science Education (SIGCSE 2008)", - address = "Portland, Oregon", - year = 2008, - pages = "116--120", -} - -@ARTICLE{Kleinberg_JACM1999, - author = "Jon M. 
Kleinberg", - title = "Authoritative Sources in a Hyperlinked Environment", - journal = "Journal of the ACM", - volume = 46, - number = 5, - pages = "604--632", - year = 1999, -} - -@ARTICLE{Lempel_Moran_TOIS2001, - author = "Ronny Lempel and Shlomo Moran", - title = "{SALSA}: The {Stochastic} {Approach} for {Link-Structure} {Analysis}", - journal = "ACM Transactions on Information Systems", - volume = 19, - number = 2, - pages = "131--160", - year = 2001, -} - -@ARTICLE{Leventhal_2009, - author = "Adam Leventhal", - title = "Triple-Parity {RAID} and Beyond", - journal = "ACM Queue", - volume = 7, - number = 11, - year = 2009, -} - -@ARTICLE{Lin_TOIS2007, - author = "Jimmy Lin", - title = "An Exploration of the Principles Underlying Redundancy-Based Factoid Question Answering", - journal = "ACM Transactions on Information Systems", - volume = 27, - number = 2, - pages = "1--55", - year = "2007", -} - -@INPROCEEDINGS{Lin_TeachCL2008, - author = "Jimmy Lin", - title = "Exploring Large-Data Issues in the Curriculum: A Case Study with {MapReduce}", - booktitle = "Proceedings of the Third Workshop on Issues in Teaching Computational Linguistics (TeachCL-08) at ACL 2008", - address = "Columbus, Ohio", - year = 2008, - pages = "54--61", -} - -@INPROCEEDINGS{Lin_EMNLP2008, - author = "Jimmy Lin", - title = "Scalable Language Processing Algorithms for the Masses: A Case Study in Computing Word Co-occurrence Matrices with {MapReduce}", - booktitle = "Proceedings of the 2008 Conference on Empirical Methods in Natural Language Processing (EMNLP 2008)", - address = "Honolulu, Hawaii", - year = 2008, - pages = "419--428", -} - -@TECHREPORT{Lin_etal_TR2009, - author = "Jimmy Lin and Anand Bahety and Shravya Konda and Samantha Mahindrakar", - title = "Low-Latency, High-Throughput Access to Static Global Resources within the {Hadoop} Framework", - number = "HCIL-2009-01", - institution = "University of Maryland", - address = "College Park, Maryland", - month = "January", - year = 
2009, -} - -@INPROCEEDINGS{Malewicz_etal_2009, - author = "Grzegorz Malewicz and Matthew H. Austern and Aart J. C. Bik and James C. Dehnert and Ilan Horn and Naty Leiser and Grzegorz Czajkowski", - title = "{Pregel}: A System for Large-Scale Graph Processing", - booktitle = "Proceedings of the 28th ACM Symposium on Principles of Distributed Computing (PODC 2009)", - address = "Calgary, Alberta, Canada", - year = 2009, - pages = "6", -} - -@INPROCEEDINGS{Malewicz_etal_SIGMOD2010, - author = "Grzegorz Malewicz and Matthew H. Austern and Aart J. C. Bik and James C. Dehnert and Ilan Horn and Naty Leiser and Grzegorz Czajkowski", - title = "{Pregel}: A System for Large-Scale Graph Processing", - booktitle = "Proceedings of the 2010 ACM SIGMOD International Conference on Management of Data", - address = "Indianapolis, Indiana", - year = 2010, -} - -@BOOK{Manning_Schutze_1999, - author = "Christopher D. Manning and Hinrich {Sch\"{u}tze}", - title = "Foundations of Statistical Natural Language Processing", - publisher = "MIT Press", - address = "Cambridge, Massachusetts", - year = 1999, -} - -@BOOK{Manning_etal_2008, - author = "Christopher D. Manning and Prabhakar Raghavan and Hinrich {Sch\"{u}tze}", - title = "An Introduction to Information Retrieval", - publisher = "Cambridge University Press", - address = "Cambridge, England", - year = 2008, -} - -@ARTICLE{Mardis_2008, - author = "Elaine R. Mardis", - title = "The Impact of Next-Generation Sequencing Technology on Genetics", - journal = "Trends in Genetics", - volume = 24, - number = 3, - pages = "133--141", - year = 2008, -} - -@ARTICLE{McCool_2008, - author = "Michael D. McCool", - title = "Scalable Programming Models for Massively Multicore Processors", - journal = "Proceedings of the IEEE", - volume = 96, - number = 5, - pages = "816--831", - year = 2008, -} - -@ARTICLE{McKusick_Quinlan_2009, - author = "Marshall K. 
McKusick and Sean Quinlan", - title = "{GFS}: Evolution on Fast-forward", - journal = "ACM Queue", - volume = 7, - number = 7, - year = 2009, -} - -@ARTICLE{Mellor-Crummey_etal_2001, - author = "John Mellor-Crummey and David Whalley and Ken Kennedy", - title = "Improving Memory Hierarchy Performance for Irregular Applications Using Data and Computation Reorderings", - journal = "International Journal of Parallel Programming", - volume = 29, - number = 3, - pages = "217--247", - year = 2001, -} - -@INPROCEEDINGS{Metzler_etal_2009, - author = "Donald Metzler and Jasmine Novak and Hang Cui and Srihari Reddy", - title = "Building Enriched Document Representations Using Aggregated Anchor Text", - booktitle = "Proceedings of the 32nd Annual International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR 2009)", - year = 2009, - pages = "219--226", -} - -@INPROCEEDINGS{MillerD99, - author = "David R. H. Miller and Tim Leek and Richard M. Schwartz", - title = "A Hidden {Markov} Model Information Retrieval System", - booktitle = "Proceedings of the 22nd Annual International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR 1999)", - address = "Berkeley, California", - year = 1999, - pages = "214--221", -} - -@INPROCEEDINGS{Moffat_etal_SIGIR2006, - author = "Alistair Moffat and William Webber and Justin Zobel", - title = "Load Balancing for Term-Distributed Parallel Retrieval", - booktitle = "Proceedings of the 29th Annual International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR 2006)", - address = "Seattle, Washington", - year = 2006, - pages = "348--355", -} - -@INPROCEEDINGS{Nurmi_etal_2009, - author = "Daniel Nurmi and Rich Wolski and Chris Grzegorczyk and Graziano Obertelli and Sunil Soman and Lamia Youseff and Dmitrii Zagorodnov", - title = "The {Eucalyptus} Open-Source Cloud-Computing System", - booktitle = "Proceedings of the 9th IEEE/ACM International Symposium on 
Cluster Computing and the Grid", - year = 2009, - address = "Washington, D.C.", - pages = "124--131", -} - -@INPROCEEDINGS{Olston_etal_SIGMOD2008, - author = "Christopher Olston and Benjamin Reed and Utkarsh Srivastava and Ravi Kumar and Andrew Tomkins", - title = "{Pig} {Latin}: A Not-So-Foreign Language for Data Processing", - booktitle = "Proceedings of the 2008 ACM SIGMOD International Conference on Management of Data", - address = "Vancouver, British Columbia, Canada", - year = 2008, - pages = "1099--1110", -} - -@article{Olston_Najork_2010, - author = "Christopher Olston and Marc Najork", - title = "Web Crawling", - journal = "Foundations and Trends in Information Retrieval", - volume = 4, - number = 3, - pages = "175--246", - year = 2010 -} - -@ARTICLE{Olukotun_Hammond_2005, - author = "Kunle Olukotun and Lance Hammond", - title = "The Future of Microprocessors", - journal = "ACM Queue", - volume = 3, - number = 7, - pages = "27--34", - year = 2005, -} - -@ARTICLE{Pang_Lee_2008, - author = "Bo Pang and Lillian Lee", - title = "Opinion Mining and Sentiment Analysis", - journal = "Foundations and Trends in Information Retrieval", - volume = 2, - number = "1--2", - pages = "1--135", - year = 2008, -} - -@TECHREPORT{Page_etal_1999, - author = "Lawrence Page and Sergey Brin and Rajeev Motwani and Terry Winograd", - title = "The {PageRank} Citation Ranking: Bringing Order to the {Web}", - type = "Stanford Digital Library Working Paper", - number = "SIDL-WP-1999-0120", - institution = "Stanford University", - year = 1999, -} - -@ARTICLE{Patterson_CACM2008, - author = "David A. Patterson", - title = "The Data Center is the Computer", - journal = "Communications of the ACM", - volume = 52, - number = 1, - pages = "105", - year = 2008, -} - -@INPROCEEDINGS{Pavlo_etal_SIGMOD2009, - author = "Andrew Pavlo and Erik Paulson and Alexander Rasin and Daniel J. Abadi and David J. 
DeWitt and Samuel Madden and Michael Stonebraker", - title = "A Comparison of Approaches to Large-Scale Data Analysis", - booktitle = "Proceedings of the 35th ACM SIGMOD International Conference on Management of Data", - year = 2009, - pages = "165--178", - address = "Providence, Rhode Island", -} - -@ARTICLE{Pike_etal_2005, - author = "Rob Pike and Sean Dorward and Robert Griesemer and Sean Quinlan", - title = "Interpreting the Data: Parallel Analysis with {Sawzall}", - journal = "Scientific Programming Journal", - volume = 13, - number = 4, - pages = "277--298", - year = 2005, -} - -@INPROCEEDINGS{Pinheiro_etal_2007, - author = "Eduardo Pinheiro and Wolf-Dietrich Weber and Luiz {Andr\'{e}} Barroso", - title = "Failure Trends in a Large Disk Drive Population", - booktitle = "Proceedings of the 5th USENIX Conference on File and Storage Technologies (FAST 2007)", - year = 2008, - address = "San Jose, California", -} - -@article{qiACS09, - author = "Xiaoguang Qi and Brian D. Davison", - title = "Web Page Classification: Features and Algorithms", - journal = "ACM Computing Surveys", - volume = "41", - number = "2", - year = "2009" -} - -@ARTICLE{Rafique_etal_2009, - author = "M. Mustafa Rafique and Benjamin Rose and Ali R. Butt and Dimitrios S. 
Nikolopoulos", - title = "Supporting {MapReduce} on Large-Scale Asymmetric Multi-Core Clusters", - journal = "ACM Operating Systems Review", - volume = 43, - number = 2, - pages = "25--34", - year = 2009, -} - -@INPROCEEDINGS{Ranger_etal_2007, - author = "Colby Ranger and Ramanan Raghuraman and Arun Penmetsa and Gary Bradski and Christos Kozyrakis", - title = "Evaluating {MapReduce} for Multi-core and Multiprocessor Systems", - booktitle = "Proceedings of the 13th International Symposium on High-Performance Computer Architecture (HPCA 2007)", - address = "Phoenix, Arizona", - year = 2007, - pages = "205--218", -} - -@INPROCEEDINGS{Rao_Yarowsky_2009, - author = "Delip Rao and David Yarowsky", - title = "Ranking and Semi-supervised Classification on Large Scale Graphs Using {Map-Reduce}", - booktitle = "Proceedings of the \mbox{ACL/IJCNLP} 2009 Workshop on Graph-Based Methods for Natural Language Processing (TextGraphs-4)", - year = 2009, - address = "Singapore", -} - -@ARTICLE{Rappa_2004, - author = "Michael A. Rappa", - title = "The Utility Business Model and the Future of Computing Services", - journal = "IBM Systems Journal", - volume = 34, - number = 1, - pages = "32--42", - year = 2004, -} - -@INPROCEEDINGS{Sandholm_Lai_2009, - author = "Thomas Sandholm and Kevin Lai", - title = "{MapReduce} Optimization Using Regulated Dynamic Prioritization", - booktitle = "Proceedings of the Eleventh International Joint Conference on Measurement and Modeling of Computer Systems (\mbox{SIGMETRICS} '09)", - address = "Seattle, Washington", - year = 2009, - pages = "299--310", -} - -@PHDTHESIS{Schatz_2010, - author = "Michael Schatz", - title = "High Performance Computing for {DNA} Sequence Alignment and Assembly", - school = "University of Maryland, College Park", - year = 2010, -} - -@INPROCEEDINGS{Schneider_DeWitt_SIGMOD1989, - author = "Donovan A. Schneider and David J. 
DeWitt", - title = "A Performance Evaluation of Four Parallel Join Algorithms in a Shared-Nothing Multiprocessor Environment", - booktitle = "Proceedings of the 1989 ACM SIGMOD International Conference on Management of Data", - address = "Portland, Oregon", - year = 1989, - pages = "110--121", -} - -@INPROCEEDINGS{Schmuck_Haskin_2002, - author = "Frank Schmuck and Roger Haskin", - title = "{GPFS}: A Shared-Disk File System for Large Computing Clusters", - booktitle = "Proceedings of the First USENIX Conference on File and Storage Technologies", - year = 2002, - pages = "231--244", - address = "Monterey, California", -} - -@INPROCEEDINGS{Schroeder_etal_2009, - author = "Bianca Schroeder and Eduardo Pinheiro and Wolf-Dietrich Weber", - title = "{DRAM} Errors in the Wild: A Large-Scale Field Study", - booktitle = "Proceedings of the Eleventh International Joint Conference on Measurement and Modeling of Computer Systems (\mbox{SIGMETRICS} '09)", - year = 2009, - address = "Seattle, Washington", - pages = "193--204", -} - -@ARTICLE{Schutze_CL1998, - author = "Hinrich Sch{\"{u}}tze", - title = "Automatic Word Sense Discrimination", - journal = "Computational Linguistics", - volume = 24, - number = 1, - pages = "97--123", - year = 1998, -} - -@ARTICLE{Schutze_Pedersen_IPM1997, - author = "Hinrich Sch{\"{u}}tze and Jan O. 
Pedersen", - title = "A Cooccurrence-Based Thesaurus and Two Applications to Information Retrieval", - journal = "Information Processing and Management", - volume = 33, - number = 3, - pages = "307--318", - year = 1998, -} - -@BOOK{Sekine_Ranchhod_2009, - author = "Satoshi Sekine and Elisabete Ranchhod", - title = "Named Entities: Recognition, Classification and Use", - publisher = "John Benjamins", - address = "Amsterdam, The Netherlands", - year = 2009, -} - -@INCOLLECTION{Southan_Cameron_2009, - title = "Beyond the Tsunami: Developing the Infrastructure to Deal with Life Sciences Data", - author = "Christopher Southan and Graham Cameron", - editor = "Tony Hey and Stewart Tansley and Kristin Tolle", - booktitle = "The Fourth Paradigm: Data-Intensive Scientific Discovery", - publisher = "Microsoft Research", - address = "Redmond, Washington", - year = 2009, -} - -@ARTICLE{Stonebraker_etal_CACM2010, - author = "Michael Stonebraker and Daniel Abadi and David J. DeWitt and Sam Madden and Erik Paulson and Andrew Pavlo and Alexander Rasin", - title = "{MapReduce} and Parallel {DBMSs}: Friends or Foes?", - journal = "Communications of the ACM", - volume = 53, - number = 1, - pages = "64--71", - year = 2010, -} - -@ARTICLE{Szalay_etal_2000, - author = "Alexander S. Szalay and Peter Z. Kunszt and Ani Thakar and Jim Gray and Don Slutz and Robert J. Brunner", - title = "Designing and Mining Multi-Terabyte Astronomy Archives: The {Sloan} {Digital} {Sky} {Survey}", - journal = "SIGMOD Record", - volume = 29, - number = 2, - pages = "451--462", - year = 2000, -} - -@TECHREPORT{Tantisiriroj_etal_2008, - author = "Wittawat Tantisiriroj and Swapnil Patil and Garth Gibson", - title = "Data-intensive File systems for {Internet} Services: A Rose By Any Other Name\ldots", - number = "CMU-PDL-08-114", - institution = "Parallel Data Laboratory, Carnegie Mellon University", - year = 2008, -} - -@INPROCEEDINGS{Thekkath_etal_SOSP1997, - author = "Chandramohan A. 
Thekkath and Timothy Mann and Edward K. Lee", - title = "{Frangipani}: A Scalable Distributed File System", - booktitle = "Proceedings of the 16th ACM Symposium on Operating Systems Principles (SOSP 1997)", - year = 1997, - pages = "224--237", - address = "Saint-Malo, France", -} - -@ARTICLE{Valiant_CACM1990, - author = "Leslie G. Valiant", - title = "A Bridging Model for Parallel Computation", - journal = "Communications of the ACM", - volume = 33, - number = 8, - pages = "103--111", - year = 1990, -} - -@ARTICLE{Vaquero_etal_2009, - author = "Luis M. Vaquero and Luis Rodero-Merino and Juan Caceres and Maik Lindner", - title = "A Break in the Clouds: Towards a Cloud Definition", - journal = "ACM SIGCOMM Computer Communication Review", - volume = 39, - number = 1, - pages = "50--55", - year = 2009, -} - -@ARTICLE{Watts_Strogatz_1998, - author = "Duncan J. Watts and Steven H. Strogatz", - title = "Collective Dynamics of `Small-World' Networks", - journal = "Nature", - volume = 393, - pages = "440--442", - year = 1998, -} - -@INPROCEEDINGS{Wen_Vishkin_2008, - author = "Xingzhi Wen and Uzi Vishkin", - title = "{FPGA}-Based Prototype of a {PRAM-On-Chip} Processor", - booktitle = "Proceedings of the 5th Conference on Computing Frontiers", - address = "Ischia, Italy", - year = 2008, - pages = "55--66", -} - -@ARTICLE{Wigner_1960, - author = "Eugene Wigner", - title = "The Unreasonable Effectiveness of Mathematics in the Natural Sciences", - journal = "Communications in Pure and Applied Mathematics", - volume = 13, - number = 1, - pages = "1--14", - year = 1960, -} - -@BOOK{Witten_etal_1999, - author = "Ian H. Witten and Alistair Moffat and Timothy C. 
Bell", - title = "Managing Gigabytes: Compressing and Indexing Documents and Images", - publisher = "Morgan Kaufmann Publishing", - address = "San Francisco, California", - year = 1999, -} - -@BOOK{White_2009, - title = "{Hadoop}: The Definitive Guide", - author = "Tom White", - publisher = "O'Reilly", - address = "Sebastopol, California", - year = 2009, -} - -@ARTICLE{Xu_Croft_TOIS1998, - author = "Jinxi Xu and W. Bruce Croft", - title = "Corpus-Based Stemming Using Cooccurrence of Word Variants", - journal = "ACM Transactions on Information Systems", - volume = 16, - number = 1, - pages = "61--81", - year = 1998, -} - -@ARTICLE{XuR_Wunsch_2005b, - author = "Rui Xu and Donald Wunsch II", - title = "Survey of Clustering Algorithms", - journal = "IEEE Transactions on Neural Networks", - volume = 16, - number = 3, - pages = "645--678", - year = 2005, -} - -@INPROCEEDINGS{YangHungchih_etal_SIGMOD2007, - author = "Hung-chih Yang and Ali Dasdan and Ruey-Lung Hsiao and D. Stott Parker", - title = "{Map-Reduce-Merge}: Simplified Relational Data Processing on Large Clusters", - booktitle = "Proceedings of the 2007 ACM SIGMOD International Conference on Management of Data", - address = "Beijing, China", - year = 2007, - pages = "1029--1040", -} - -@INPROCEEDINGS{YuYuan_etal_OSDI2008, - author = "Yuan Yu and Michael Isard and Dennis Fetterly and Mihai Budiu and {\'{U}lfar} Erlingsson and Pradeep Kumar Gunda and Jon Currey", - title = "{DryadLINQ}: A System for General-Purpose Distributed Data-Parallel Computing Using a High-Level Language", - booktitle = "Proceedings of the 8th Symposium on Operating System Design and Implementation (OSDI 2008)", - address = "San Diego, California", - year = 2008, - pages = "1--14", -} - -@INPROCEEDINGS{Zaharia_etal_OSDI2008, - author = "Matei Zaharia and Andy Konwinski and Anthony D. 
Joseph and Randy Katz and Ion Stoica", - title = "Improving {MapReduce} Performance in Heterogeneous Environments", - booktitle = "Proceedings of the 8th Symposium on Operating System Design and Implementation (OSDI 2008)", - address = "San Diego, California", - year = 2008, - pages = "29--42", -} - -@TECHREPORT{Zaharia_etal_2009, - author = "Matei Zaharia and Dhruba Borthakur and Joydeep Sen Sarma and Khaled Elmeleegy and Scott Shenker and Ion Stoica", - title = "Job Scheduling for Multi-User {MapReduce} Clusters", - number = "UCB/EECS-2009-55", - institution = "Electrical Engineering and Computer Sciences, University of California at Berkeley", - year = 2009, -} - -@ARTICLE{Zobel_Moffat_2006, - author = "Justin Zobel and Alistair Moffat", - title = "Inverted Files for Text Search Engines", - journal = "ACM Computing Surveys", - volume = 38, - number = 6, - pages = "1--56", - year = 2006, -} +@INPROCEEDINGS{WangYi_etal_2009, + author = "Yi Wang and Hongjie Bai and Matt Stanton and Wen-Yen Chen and Edward Y. 
Chang", + title = "{PLDA}: Parallel Latent {Dirichlet} Allocation for Large-Scale Applications", + booktitle = "Proceedings of the Fifth International Conference on Algorithmic Aspects in Information and Management (AAIM 2009)", + address = "San Francisco, California", + year = 2009, + pages = "301--314", +} + +@INPROCEEDINGS{ChuCT_etal_2006, + author = "Cheng-Tao Chu and Sang Kyun Kim and Yi-An Lin and YuanYuan Yu and Gary Bradski and Andrew Ng and Kunle Olukotun", + title = "{Map-Reduce} for Machine Learning on Multicore", + booktitle = "Advances in Neural Information Processing Systems 19 (NIPS 2006)", + year = 2006, + address = "Vancouver, British Columbia, Canada", + pages = "281--288", +} + +@book{Owen_2010, + author={Sean Owen and Robin Anil}, + title={Mahout in Action}, + publisher={Manning Publications Co.}, + year=2010, + address={Greenwich, Connecticut} +} + +@inproceedings{Asuncion_2008, + author = {Arthur Asuncion and Padhraic Smyth and Max Welling}, + title = {Asynchronous Distributed Learning of Topic Models}, + booktitle = "Advances in Neural Information Processing Systems 21 (NIPS 2008)", + year = 2008, + address = "Vancouver, British Columbia, Canada", + pages = {81--88} +} + +@incollection{Bottou_2004, + author = {Bottou, L\'{e}on}, + title = {Stochastic Learning}, + booktitle = {Advanced Lectures on Machine Learning}, + pages = {146--168}, + publisher = {Springer Verlag}, + year = {2004}, + editor = {Bousquet, Olivier and von Luxburg, Ulrike}, + series = {Lecture Notes in Artificial Intelligence, LNAI~3176}, + address = {Berlin}, + url = {http://leon.bottou.org/papers/bottou-mlss-2004}, +} + +@INPROCEEDINGS{Feigenbaum_2004, + author = {Joan Feigenbaum and Sampath Kannan and Andrew McGregor and Siddharth Suri and Jian Zhang}, + title = {On Graph Problems in a Semi-Streaming Model}, + booktitle = {31st International Colloquium on Automata, Languages and Programming}, + year = {2004}, + pages = {531--543} +} + +@INPROCEEDINGS{Taura_2003, + author = 
{Kenjiro Taura and Toshio Endo and Kenji Kaneda and Akinori Yonezawa}, + title = {Phoenix: a Parallel Programming Model for Accommodating Dynamically Joining Resources}, + booktitle = {Proceedings of the Ninth ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming}, + year = {2003}, + pages = {216--229}, + publisher = {ACM} +} + +@InProceedings{Levenberg_2009, + author = {Levenberg, Abby and Osborne, Miles}, + title = {Stream-based Randomised Language Models for {SMT}}, + booktitle = {Proceedings of the 2009 Conference on Empirical Methods in Natural Language Processing}, + year = {2009}, + address = {Singapore}, + pages = {756--764}, +} + +@inproceedings{Levenberg_2010, + Address = "Los Angeles, California", + Author = "Abby Levenberg and Chris Callison-Burch and Miles Osborne", + booktitle = "Proceedings of the 11th Annual Conference of the North American Chapter of the Association for Computational Linguistics (NAACL HLT 2010)", + Title = {Stream-based Translation Models for Statistical Machine Translation}, + Year = 2010 +} + +@inproceedings{Petrovic_2010, + Address = "Los Angeles, California", + Author = "Sasa Petrovic and Miles Osborne and Victor Lavrenko", + booktitle = "Proceedings of the 11th Annual Conference of the North American Chapter of the Association for Computational Linguistics (NAACL HLT 2010)", + Title = {Streaming First Story Detection with application to {Twitter}}, + Year = 2010 +} + +Sasa Petrovic, Miles Osborne and Victor Lavrenko. Streaming First Story Detection with application to Twitter. NAACL, Los Angeles, USA. June 2010. 
+ +@inproceedings{Alon_1996, + author = {Alon, Noga and Matias, Yossi and Szegedy, Mario}, + title = {The space complexity of approximating the frequency moments}, + booktitle = {Proceedings of the 28th Annual ACM Symposium on Theory of Computing (STOC '96)}, + year = {1996}, + isbn = {0-89791-785-5}, + pages = {20--29}, + address = {Philadelphia, Pennsylvania}, +} + +@UNPUBLISHED{Smith_2004, + author = "Noah Smith", + title = {Log-linear models}, + note = "http://www.cs.cmu.edu/~nasmith/papers/smith.tut04.pdf", + year = 2004 + } + +@article{Brown_1993, + author = {Brown, Peter F. and Della Pietra, Vincent J. and Della Pietra, Stephen A. and Mercer, Robert L. }, + citeulike-article-id = {1286336}, + journal = {Computational Linguistics}, + number = {2}, + pages = {263--311}, + publisher = {MIT Press}, + title = {The Mathematics of Statistical Machine Translation: Parameter Estimation}, + volume = {19}, + year = {1993} +} + + +@BOOK{Brants_2010, + author = "Thorsten Brants and Peng Xu", + title = "Distributed Language Models ", + publisher = "Morgan \& Claypool Publishers", + year = 2010, +} + +@inproceedings{Callison_Burch_2009, + author = {Callison-Burch, Chris and Koehn, Philipp and Monz, Christof and Schroeder, Josh}, + title = {Findings of the 2009 workshop on statistical machine translation}, + booktitle = {Proceedings of the Fourth Workshop on Statistical Machine Translation (StatMT '09)}, + year = {2009}, + pages = {1--28}, + address = {Athens, Greece}, + } + +@article{Gao_2010, + Author={Qin Gao and Stephan Vogel}, + title={Training phrase-based machine translation models on the cloud: Open source machine translation toolkit {Chaski}}, + journal={The Prague Bulletin of Mathematical Linguistics}, + Volume={93}, + year=2010, + pages={37--46} +} + +@inproceedings{Koehn_2003, + Address = "Edmonton, Alberta, Canada", + Author = "Philipp Koehn and Franz J. 
Och and Daniel Marcu", + booktitle = "Proceedings of the 2003 Human Language Technology Conference of the North American Chapter of the Association for Computational Linguistics (HLT/NAACL 2003)", + Doi = {10.3115/1073445.1073462}, + Pages = {48--54}, + Title = {Statistical phrase-based translation}, + Year = {2003}} + + +@inproceedings{Sha_2003, + author = {Sha, Fei and Pereira, Fernando}, + pages = {134--141}, + title = {Shallow parsing with conditional random fields}, + booktitle = "Proceedings of the 2003 Human Language Technology Conference of the North American Chapter of the Association for Computational Linguistics (\mbox{HLT/NAACL} 2003)", + address = "Edmonton, Alberta, Canada", + year = 2003, +} + +@inproceedings{Malouf_2002, + author = {Malouf, Robert}, + title = {A comparison of algorithms for maximum entropy parameter estimation}, + booktitle = "Proceedings of the Sixth Conference on Natural Language Learning (CoNLL-2002)", + year = {2002}, + pages = {49--55}, + address = "Taipei, Taiwan" + } + +@ARTICLE{LBFGS, + author = {Dong C. Liu and Jorge Nocedal}, + title = {On the limited memory {BFGS} method for large scale optimization}, + journal = {Mathematical Programming B}, + year = {1989}, + volume = {45}, + number = {3}, + pages = {503--528} +} + + +@inproceedings{Lafferty_2001, + author = {Lafferty, John D. 
and McCallum, Andrew and Pereira, Fernando}, + title = "Conditional Random Fields: Probabilistic Models for Segmenting and Labeling Sequence Data", + booktitle = {Proceedings of the Eighteenth International Conference on Machine Learning (ICML '01)}, + year = {2001}, + isbn = {1-55860-778-1}, + pages = {282--289}, + address = {San Francisco, California}, + } + +@conference{Nigam_1999, + title={Using maximum entropy for text classification}, + author={Kamal Nigam and John Lafferty and Andrew McCallum}, + booktitle={Proceedings of the IJCAI-99 Workshop on Machine Learning for Information Filtering}, + pages={61--67}, + year={1999}, + address ="Stockholm, Sweden" +} + +@inproceedings{Liang_2009, + title = {Online {EM} for Unsupervised Models}, + author = {Percy Liang and Dan Klein}, + booktitle = {North American Association for Computational Linguistics (NAACL)}, + year = {2009}, +} + +@article{Och_2003, + Author = {Franz J. Och and Hermann Ney}, + Journal = {Computational Linguistics}, + Number = {1}, + Pages = {19--51}, + Title = {A systematic comparison of various statistical alignment models}, + Volume = {29}, + Year = {2003} +} + +@conference{Seymore_1999, + title={Learning hidden {Markov} model structure for information extraction}, + author="Kristie Seymore and Andrew McCallum and Ronald Rosenfeld", + booktitle={Proceedings of the \mbox{AAAI-99} Workshop on Machine Learning for Information Extraction}, + pages={37--42}, + year={1999}, + address = "Orlando, Florida" +} + +@book{Koehn_2009, + title={Statistical Machine Translation}, + author={Philipp Koehn}, + publisher = "Cambridge University Press", + address = "Cambridge, England", + year={2010}, +} + +@article{Lopez_2008, + author={Adam Lopez}, + title={Statistical Machine Translation}, + journal={ACM Computing Surveys}, + volume={40}, + number={3}, + article={8}, + pages={1--49}, + year=2008 +} + +@book{Ross_1996, + title={Stochastic processes}, + author={Ross, Sheldon M.}, + year={1996}, + 
publisher={Wiley}, + address={New York} +} + +@inproceedings{Cutting_1992, + author = {Cutting, Doug and Kupiec, Julian and Pedersen, Jan and Sibun, Penelope}, + title = {A practical part-of-speech tagger}, + booktitle = {Proceedings of the Third Conference on Applied Natural Language Processing}, + year = {1992}, + pages = {133--140}, + address = {Trento, Italy}, +} + +@article{Stanke_2003, + address = {Institut f\"{u}r Mikrobiologie und Genetik, Abteilung Bioinformatik, Universit\"{a}t G\"{o}ttingen, G\"{o}ttingen, Germany. mstanke@gwdg.de}, + author = {Stanke, Mario and Waack, Stephan}, + citeulike-article-id = {2002786}, + citeulike-linkout-0 = {http://view.ncbi.nlm.nih.gov/pubmed/14534192}, + citeulike-linkout-1 = {http://www.hubmed.org/display.cgi?uids=14534192}, + issn = {1367-4811}, + journal = {Bioinformatics}, + keywords = {algorithm, gene-prediction}, + month = {October}, + posted-at = {2007-11-28 10:30:57}, + priority = {2}, + title = {Gene prediction with a hidden {Markov} model and a new intron submodel}, + url = {http://view.ncbi.nlm.nih.gov/pubmed/14534192}, + volume = {19 Suppl 2}, + year = {2003}, + pages = "ii215--225", +} + +@INPROCEEDINGS{Abouzeid_etal_VLDB2009, + author = "Azza Abouzeid and Kamil Bajda-Pawlikowski and Daniel Abadi and Avi Silberschatz and Alexander Rasin", + title = "{HadoopDB}: An Architectural Hybrid of {MapReduce} and {DBMS} Technologies for Analytical Workloads", + booktitle = "Proceedings of the 35th International Conference on Very Large Data Base (VLDB 2009)", + address = "Lyon, France", + year = 2009, + pages = "922--933", +} + +@INPROCEEDINGS{Amdahl_1967, + author = "Gene Amdahl", + title = "Validity of the Single Processor Approach to Achieving Large-Scale Computing Capabilities", + booktitle = "Proceedings of the AFIPS Spring Joint Computer Conference", + year = 1967, + pages = "483--485", +} + +@INPROCEEDINGS{Ananthanarayanan_etal_2009, + author = "Rajagopal Ananthanarayanan and Karan Gupta and Prashant Pandey and 
Himabindu Pucha and Prasenjit Sarkar and Mansi Shah and Renu Tewari", + title = "Cloud Analytics: Do We \emph{Really} Need to Reinvent the Storage Stack?", + booktitle = "Proceedings of the 2009 Workshop on Hot Topics in Cloud Computing (HotCloud 09)", + address = "San Diego, California", + year = 2009, +} + +@TECHREPORT{Armbrust_etal_2009, + author = "Michael Armbrust and Armando Fox and Rean Griffith and Anthony D. Joseph and Randy H. Katz and Andrew Konwinski and Gunho Lee and David A. Patterson and Ariel Rabkin and Ion Stoica and Matei Zaharia", + title = "Above the Clouds: A {Berkeley} View of Cloud Computing", + number = "UCB/EECS-2009-28", + institution = "Electrical Engineering and Computer Sciences, University of California at Berkeley", + year = 2009, +} + +@INPROCEEDINGS{asuncion08asynchronous, + title = {Asynchronous Distributed Learning of Topic Models.}, + author = {Arthur Asuncion and Padhraic Smyth and Max Welling}, + booktitle = {NIPS}, + publisher = {MIT Press}, + year = {2008} +} + +@article{Dempster_Laird_Rubin_1977, + abstract = {A broadly applicable algorithm for computing maximum likelihood estimates from incomplete data is presented at various levels of generality. Theory showing the monotone behaviour of the likelihood and convergence of the algorithm is derived. Many examples are sketched, including missing value situations, applications to grouped, censored or truncated data, finite mixture models, variance component estimation, hyperparameter estimation, iteratively reweighted least squares and factor analysis.}, + author = {Dempster, Arthur P. and Laird, Nan M. and Rubin, Donald B.}, + doi = {10.2307/2984875}, + issn = {00359246}, + journal = {Journal of the Royal Statistical Society. 
Series B (Methodological)}, + number = {1}, + pages = {1--38}, + publisher = {Blackwell Publishing for the Royal Statistical Society}, + title = {Maximum Likelihood from Incomplete Data via the {EM} Algorithm}, + volume = {39}, + year = {1977} +} + +@book{Jelinek_1997, + author = {Jelinek, Frederick}, + title = {Statistical methods for speech recognition}, + year = {1997}, + isbn = {0-262-10066-5}, + publisher = {MIT Press}, + address = {Cambridge, Massachusetts}, +} + +@conference{Vogel_1996, + title={{HMM-based word alignment in statistical translation}}, + author={Vogel, Stephan and Ney, Hermann and Tillmann, Christoph}, + booktitle = "Proceedings of the 16th International Conference on Computational Linguistics (COLING 1996)", + pages={836--841}, + year={1996}, + address = "Copenhagen, Denmark" +} + +@conference{Hassan_2005, + title={Stock Market Forecasting Using Hidden {Markov} Models: A New Approach}, + author={Hassan, Md. Rafiul and Nath, Baikunth}, + booktitle={Proceedings of the 5th International Conference on Intelligent Systems Design and Applications (ISDA '05)}, + pages={192--196}, + year={2005}, + address = "Wroclaw, Poland" +} + +@INCOLLECTION{Rabiner_1990, + author = {Rabiner, Lawrence R.}, + title = {A tutorial on hidden {Markov} models and selected applications in speech recognition}, + booktitle = {Readings in Speech Recognition}, + year = {1990}, + isbn = {1-55860-124-4}, + pages = {267--296}, + publisher = {Morgan Kaufmann Publishers}, + address = {San Francisco, California}, + } + + +------ + +@ARTICLE{Albert_Barabasi_2002, + author = "{R\'{e}ka} Albert and {Albert-L\'{a}szl\'{o}} {Barab\'{a}si}", + title = "Statistical Mechanics of Complex Networks", + journal = "Reviews of Modern Physics", + volume = 74, + pages = "47--97", + year = 2002, +} + +@TECHREPORT{Alvaro_etal_2009, + author = "Peter Alvaro and Tyson Condie and Neil Conway and Khaled Elmeleegy and Joseph M. Hellerstein and Russell C. 
Sears", + title = "{BOOM}: Data-Centric Programming in the Datacenter", + number = "UCB/EECS-2009-98", + institution = "Electrical Engineering and Computer Sciences, University of California at Berkeley", + year = 2009, +} + +@INPROCEEDINGS{Anderson_etal_SOSP1995, + author = "Thomas Anderson and Michael Dahlin and Jeanna Neefe and David Patterson and Drew Roselli and Randolph Wang", + title = "Serverless Network File Systems", + booktitle = "Proceedings of the 15th ACM Symposium on Operating Systems Principles (SOSP 1995)", + year = 1995, + pages = "109--126", + address = "Copper Mountain Resort, Colorado", +} + +@ARTICLE{Anh_Moffat_2005, + author = "Vo Ngoc Anh and Alistair Moffat", + title = "Inverted index compression using word-aligned binary codes", + journal = "Information Retrieval", + volume = 8, + number = 1, + year = 2005, + pages = "151--166", +} + +@INPROCEEDINGS{Baeza-Yates_etal_2005, + author = "Ricardo Baeza-Yates and Carlos Castillo and Vicente {L\'{o}pez}", + title = "{PageRank} Increase under Different Collusion Topologies", + booktitle = "Proceedings of the First International Workshop on Adversarial Information Retrieval on the Web (AIRWeb 2005)", + year = 2005, + address = "Chiba, Japan", + pages = "17--24", +} + +@INPROCEEDINGS{Baeza-Yates_etal_SIGIR2007, + author = "Ricardo Baeza-Yates and Aristides Gionis and Flavio Junqueira and Vanessa Murdock and Vassilis Plachouras and Fabrizio Silvestri", + title = "The Impact of Caching on Search Engines", + booktitle = "Proceedings of the 30th Annual International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR 2007)", + address = "Amsterdam, The Netherlands", + year = 2007, + pages = "183--190", +} + +@INPROCEEDINGS{Baeza-Yates_etal_2007, + author = "Ricardo Baeza-Yates and Carlos Castillo and Flavio Junqueira and Vassilis Plachouras and Fabrizio Silvestri", + title = "Challenges on Distributed Web Retrieval", + booktitle = "Proceedings of the IEEE 23rd International 
Conference on Data Engineering (ICDE 2007)", + address = "Istanbul, Turkey", + year = 2007, + pages = "6--20", +} + +@INPROCEEDINGS{Banko01, + author = "Michele Banko and Eric Brill", + title = "Scaling to Very Very Large Corpora for Natural Language Disambiguation", + booktitle = "Proceedings of the 39th Annual Meeting of the Association for Computational Linguistics (ACL 2001)", + year = 2001, + pages = "26--33", + address = "Toulouse, France", +} + +@INPROCEEDINGS{Barham_etal_2003, + author = "Paul Barham and Boris Dragovic and Keir Fraser and Steven Hand and Tim Harris and Alex Ho and Rolf Neugebauer and Ian Pratt and Andrew Warfield", + title = "Xen and the Art of Virtualization", + booktitle = "Proceedings of the 19th ACM Symposium on Operating Systems Principles (SOSP 2003)", + year = 2003, + address = "Bolton Landing, New York", + pages = "164--177", +} + +@ARTICLE{Barroso03, + author = "Luiz {Andr\'{e}} Barroso and Jeffrey Dean and Urs {H\"{o}lzle}", + title = "Web Search for a Planet: The {Google} Cluster Architecture", + journal = "IEEE Micro", + volume = 23, + number = 2, + pages = "22--28", + year = 2003, +} + +@ARTICLE{Barroso_Holzle_2007, + author = "Luiz {Andr\'{e}} Barroso and Urs {H\"{o}lzle}", + title = "The Case for Energy-Proportional Computing", + journal = "Computer", + volume = 40, + number = 12, + pages = "33--37", + year = 2007, +} + +@BOOK{Barroso_Holzle_2009, + author = "Luiz {Andr\'{e}} Barroso and Urs {H\"{o}lzle}", + title = "The Datacenter as a Computer: An Introduction to the Design of Warehouse-Scale Machines", + publisher = "Morgan \& Claypool Publishers", + year = 2009, +} + +@INPROCEEDINGS{Becla_Wang_2005, + author = "Jacek Becla and Daniel L. 
Wang", + title = "Lessons Learned from Managing a Petabyte", + booktitle = "Proceedings of the Second Biennial Conference on Innovative Data Systems Research (CIDR 2005)", + year = 2005, + address = "Asilomar, California", +} + +@TECHREPORT{Becla_etal_2006, + author = "Jacek Becla and Andrew Hanushevsky and Sergei Nikolaev and Ghaleb Abdulla and Alex Szalay and Maria Nieto-Santisteban and Ani Thakar and Jim Gray", + title = "Designing a Multi-petabyte Database for {LSST}", + type = "SLAC Publications", + number = "SLAC-PUB-12292", + institution = "Stanford Linear Accelerator Center", + year = 2006, + month = "May", +} + +@ARTICLE{Bell_etal_2009, + author = "Gordon Bell and Tony Hey and Alex Szalay", + title = "Beyond the Data Deluge", + journal = "Science", + volume = 323, + number = 5919, + pages = "1297--1298", + year = 2009, +} + +@ARTICLE{Bianchini_etal_2005, + author = "Monica Bianchini and Marco Gori and Franco Scarselli", + title = "Inside {PageRank}", + journal = "ACM Transactions on Internet Technology", + volume = 5, + number = 1, + pages = "92--128", + year = 2005, +} + +@BOOK{Borges_1999, + author = "Jorge Luis Borges", + title = "Collected Fictions (translated by {Andrew} {Hurley})", + publisher = "Penguin", + year = 1999, +} + +@INPROCEEDINGS{Brants_etal_EMNLP2007, + author = "Thorsten Brants and Ashok C. Popat and Peng Xu and Franz J. 
Och and Jeffrey Dean", + title = "Large Language Models in Machine Translation", + booktitle = "Proceedings of the 2007 Joint Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning", + year = 2007, + pages = "858--867", + address = "Prague, Czech Republic", +} + +@INPROCEEDINGS{Brill_etal_TREC2001, + author = "Eric Brill and Jimmy Lin and Michele Banko and Susan Dumais and Andrew Ng", + title = "Data-Intensive Question Answering", + booktitle = "Proceedings of the Tenth Text REtrieval Conference (TREC 2001)", + year = 2001, + pages = "393--400", + address = "Gaithersburg, Maryland", +} + +@BOOK{Brooks_1995, + author = "Frederick P. Brooks", + title = "The Mythical Man-Month: Essays on Software Engineering, Anniversary Edition", + publisher = "Addison-Wesley", + address = "Reading, Massachusetts", + year = 1995, +} + +@BOOK{Buttcher_etal_2010, + author = "Stefan {B\"{u}ttcher} and Charles L. A. Clarke and Gordon V. Cormack", + title = "Information Retrieval: Implementing and Evaluating Search Engines", + publisher = "MIT Press", + address = "Cambridge, Massachusetts", + year = 2010, +} + +@ARTICLE{Buyya_etal_2009, + author = "Rajkumar Buyya and Chee Shin Yeo and Srikumar Venugopal and James Broberg and Ivona Brandic", + title = "Cloud Computing and Emerging {IT} Platforms: Vision, Hype, and Reality for Delivering Computing as the 5th Utility", + journal = "Future Generation Computer Systems", + volume = 25, + number = 6, + pages = "599--616", + year = 2009, +} + +@ARTICLE{Cabrera_Long_1991, + author = "Luis-Felipe Cabrera and Darrell D. E. Long", + title = "{Swift}: Using Distributed Disk Striping to Provide High {I/O} Data Rates", + journal = "Computer Systems", + volume = 4, + number = 4, + pages = "405--436", + year = 1991, +} + +@INPROCEEDINGS{ChangFay_etal_OSDI2006, + author = "Fay Chang and Jeffrey Dean and Sanjay Ghemawat and Wilson C. Hsieh and Deborah A. 
Wallach and Michael Burrows and Tushar Chandra and Andrew Fikes and Robert Gruber", + title = "{Bigtable}: A Distributed Storage System for Structured Data", + booktitle = "Proceedings of the 7th Symposium on Operating System Design and Implementation (OSDI 2006)", + address = "Seattle, Washington", + year = 2006, + pages = "205--218", +} + +@INPROCEEDINGS{Chen_Goodman_ACL1996, + author = "Stanley F. Chen and Joshua Goodman", + title = "An Empirical Study of Smoothing Techniques for Language Modeling", + booktitle = "Proceedings of the 34th Annual Meeting of the Association for Computational Linguistics (ACL 1996)", + year = 1996, + pages = "310--318", + address = "Santa Cruz, California", +} + +@ARTICLE{Church_Hanks_1990, + author = "Kenneth W. Church and Patrick Hanks", + title = "Word Association Norms, Mutual Information, and Lexicography", + journal = "Computational Linguistics", + volume = 16, + number = 1, + pages = "22--29", + year = 1990, +} + +@ARTICLE{CohenJonathan_2009, + author = "Jonathan Cohen", + title = "Graph Twiddling in a {MapReduce} World", + journal = "Computing in Science and Engineering", + volume = 11, + number = 4, + pages = "29--41", + year = 2009, +} + +@INPROCEEDINGS{CooperBrian_etal_2010, + author = "Brian F. Cooper and Adam Silberstein and Erwin Tam and Raghu Ramakrishnan and Russell Sears", + title = "Benchmarking Cloud Serving Systems with {YCSB}", + booktitle = "Proceedings of the First ACM Symposium on Cloud Computing (ACM SOCC 2010)", + address = "Indianapolis, Indiana", + year = 2010, +} + +@BOOK{CLRS, + author = "Thomas H. Cormen and Charles E. Leiserson and Ronald L. Rivest and Clifford Stein", + title = "Introduction to Algorithms", + publisher = "MIT Press", + address = "Cambridge, Massachusetts", + year = 1990, +} + +@BOOK{Croft_etal_2009, + author = "W.
Bruce Croft and Donald Metzler and Trevor Strohman", + title = "Search Engines: Information Retrieval in Practice", + publisher = "Addison-Wesley", + address = "Reading, Massachusetts", + year = 2009, +} + +@ARTICLE{Culler_etal_1993, + author = "David Culler and Richard Karp and David Patterson and Abhijit Sahay and Klaus Erik Schauser and Eunice Santos and Ramesh Subramonian and Thorsten von Eicken", + title = "{LogP}: Towards a Realistic Model of Parallel Computation", + journal = "ACM SIGPLAN Notices", + volume = 28, + number = 7, + year = 1993, + pages = "1--12", +} + +@INPROCEEDINGS{Dean_Ghemawat_OSDI2004, + author = "Jeffrey Dean and Sanjay Ghemawat", + title = "{MapReduce}: Simplified Data Processing on Large Clusters", + booktitle = "Proceedings of the 6th Symposium on Operating System Design and Implementation (OSDI 2004)", + address = "San Francisco, California", + year = 2004, + pages = "137--150", +} + +@ARTICLE{Dean_Ghemawat_CACM2008, + author = "Jeffrey Dean and Sanjay Ghemawat", + title = "{MapReduce}: Simplified Data Processing on Large Clusters", + journal = "Communications of the ACM", + volume = 51, + number = 1, + pages = "107--113", + year = 2008, +} + +@ARTICLE{Dean_Ghemawat_CACM2010, + author = "Jeffrey Dean and Sanjay Ghemawat", + title = "{MapReduce}: A Flexible Data Processing Tool", + journal = "Communications of the ACM", + volume = 53, + number = 1, + pages = "72--77", + year = 2010, +} + +@INPROCEEDINGS{DeCandia_etal_2007, + author = "Giuseppe DeCandia and Deniz Hastorun and Madan Jampani and Gunavardhan Kakulapati and Avinash Lakshman and Alex Pilchin and Swami Sivasubramanian and Peter Vosshall and Werner Vogels", + title = "Dynamo: {Amazon's} Highly Available Key-Value Store", + booktitle = "Proceedings of the 21st ACM Symposium on Operating Systems Principles (SOSP 2007)", + year = 2007, + address = "Stevenson, Washington", + pages = "205--220", +} + +@ARTICLE{DeWitt_etal_1984, + author = "David J. DeWitt and Randy H.
Katz and Frank Olken and Leonard D. Shapiro and Michael R. Stonebraker and David Wood", + title = "Implementation Techniques for Main Memory Database Systems", + journal = "ACM SIGMOD Record", + volume = 14, + number = 2, + pages = "1--8", + year = 1984, +} + +@ARTICLE{DeWitt_Gray_CACM1992, + author = "David J. DeWitt and Jim Gray", + title = "Parallel Database Systems: The Future of High Performance Database Systems", + journal = "Communications of the ACM", + volume = 35, + number = 6, + pages = "85--98", + year = 1992, +} + +@ARTICLE{Dredze_etal_2009, + author = "Mark Dredze and Alex Kulesza and Koby Crammer", + title = "Multi-Domain Learning by Confidence-Weighted Parameter Combination", + journal = "Machine Learning", + volume = 79, + number = "1--2", + pages = "123--149", + year = 2010, +} + +@INPROCEEDINGS{Dumais_etal_SIGIR2002, + author = "Susan Dumais and Michele Banko and Eric Brill and Jimmy Lin and Andrew Ng", + title = "{Web} Question Answering: {Is} More Always Better?", + booktitle = "Proceedings of the 25th Annual International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR 2002)", + year = 2002, + pages = "291--298", + address = "Tampere, Finland", +} + +@INPROCEEDINGS{Dyer_etal_2008, + author = "Chris Dyer and Aaron Cordova and Alex Mont and Jimmy Lin", + title = "Fast, Easy, and Cheap: Construction of Statistical Machine Translation Models with {MapReduce}", + booktitle = "Proceedings of the Third Workshop on Statistical Machine Translation at ACL 2008", + address = "Columbus, Ohio", + year = 2008, + pages = "199--207", +} + +@INCOLLECTION{Firth_1957, + author = "John R.
Firth", + title = "A Synopsis of Linguistic Theory 1930--55", + booktitle = "Studies in Linguistic Analysis, Special Volume of the Philological Society", + pages = "1--32", + address = "Oxford", + publisher = "Blackwell", + year = 1957 +} + +@INPROCEEDINGS{Ghemawat_etal_SOSP2003, + author = "Sanjay Ghemawat and Howard Gobioff and Shun-Tak Leung", + title = "The {Google} {File} {System}", + booktitle = "Proceedings of the 19th ACM Symposium on Operating Systems Principles (SOSP 2003)", + year = 2003, + address = "Bolton Landing, New York", + pages = "29--43" +} + +@ARTICLE{Gilbert_Lynch_2002, + author = "Seth Gilbert and Nancy Lynch", + title = "{Brewer's} {Conjecture} and the Feasibility of Consistent, Available, Partition-Tolerant Web Services", + journal = "ACM SIGACT News", + volume = 33, + number = 2, + pages = "51--59", + year = 2002, +} + +@INPROCEEDINGS{Garcia-Molina_etal_2005, + author = "{Zolt\'{a}n} {Gy\"{o}ngyi} and Hector Garcia-Molina", + title = "Web Spam Taxonomy", + booktitle = "Proceedings of the First International Workshop on Adversarial Information Retrieval on the Web (AIRWeb 2005)", + year = 2005, + address = "Chiba, Japan", + pages = "39--47", +} + +@ARTICLE{Girvan02, + author = "Michelle Girvan and Mark E. J. Newman", + title = "Community Structure in Social and Biological Networks", + journal = "Proceedings of the National Academy of Sciences", + volume = 99, + number = 12, + pages = "7821--7826", + year = 2002, +} + +@BOOK{Grama_etal_2003, + author = "Ananth Grama and Anshul Gupta and George Karypis and Vipin Kumar", + title = "Introduction to Parallel Computing", + publisher = "Addison-Wesley", + address = "Reading, Massachusetts", + year = 2003, +} + +@ARTICLE{Granovetter73, + author = "Mark S. Granovetter", + title = "The Strength of Weak Ties", + journal = "The American Journal of Sociology", + volume = 78, + number = 6, + pages = "1360--1380", + year = 1973, +} + +@ARTICLE{Granovetter83, + author = "Mark S.
Granovetter", + title = "The Strength of Weak Ties: A Network Theory Revisited", + journal = "Sociological Theory", + volume = 1, + pages = "201--233", + year = 1983, +} + +@BOOK{Hage_1996, + author = "Per Hage and Frank Harary", + title = "Island Networks: Communication, Kinship, and Classification Structures in {Oceania}", + publisher = "Cambridge University Press", + address = "Cambridge, England", + year = 1996 +} + +@ARTICLE{Halevy_etal_2009, + author = "Alon Halevy and Peter Norvig and Fernando Pereira", + title = "The Unreasonable Effectiveness of Data", + journal = "IEEE Intelligent Systems", + volume = 24, + number = 2, + pages = "8--12", + year = 2009, +} + +@INPROCEEDINGS{Hamilton_2007, + author = "James Hamilton", + title = "On Designing and Deploying {Internet}-Scale Services", + booktitle = "Proceedings of the 21st Large Installation System Administration Conference (LISA '07)", + year = 2007, + address = "Dallas, Texas", + pages = "233--244" +} + +@INPROCEEDINGS{Hamilton_2009, + author = "James Hamilton", + title = "{Cooperative} {Expendable} {Micro-Slice} {Servers} {(CEMS)}: Low Cost, Low Power Servers for {Internet}-Scale Services", + booktitle = "Proceedings of the Fourth Biennial Conference on Innovative Data Systems Research (CIDR 2009)", + year = 2009, + address = "Asilomar, California", +} + +@INCOLLECTION{Hammerbacher_2009, + title = "Information Platforms and the Rise of the Data Scientist", + author = "Jeff Hammerbacher", + editor = "Toby Segaran and Jeff Hammerbacher", + booktitle = "Beautiful Data", + publisher = "O'Reilly", + address = "Sebastopol, California", + year = 2009, + pages = "73--84", +} + +@BOOK{Harris_1968, + author = "Zelig S. Harris", + title = "Mathematical Structures of Language", + address = "New York", + publisher = "Wiley", + year = 1968 +} + +@INPROCEEDINGS{HeB_etal_2008, + author = "Bingsheng He and Wenbin Fang and Qiong Luo and Naga K.
Govindaraju and Tuyong Wang", + title = "{Mars}: A {MapReduce} Framework on Graphics Processors", + booktitle = "Proceedings of the 17th International Conference on Parallel Architectures and Compilation Techniques (PACT 2008)", + year = 2008, + address = "Toronto, Ontario, Canada", + pages = "260--269", +} + +@BOOK{Hey_etal_2009, + author = "Tony Hey and Stewart Tansley and Kristin Tolle", + title = "The Fourth Paradigm: Data-Intensive Scientific Discovery", + publisher = "Microsoft Research", + address = "Redmond, Washington", + year = 2009, +} + +@INCOLLECTION{Hey_etal_2009-Gray, + title = "{Jim} {Gray} on {eScience}: A Transformed Scientific Method", + author = "Tony Hey and Stewart Tansley and Kristin Tolle", + editor = "Tony Hey and Stewart Tansley and Kristin Tolle", + booktitle = "The Fourth Paradigm: Data-Intensive Scientific Discovery", + publisher = "Microsoft Research", + address = "Redmond, Washington", + year = 2009, +} + +@ARTICLE{Howard_etal_1988, + author = "John Howard and Michael Kazar and Sherri Menees and David Nichols and Mahadev Satyanarayanan and Robert Sidebotham and Michael West", + title = "Scale and Performance in a Distributed File System", + journal = "ACM Transactions on Computer Systems", + volume = 6, + number = 1, + pages = "51--81", + year = 1988, +} + +@INPROCEEDINGS{Isard_etal_2007, + author = "Michael Isard and Mihai Budiu and Yuan Yu and Andrew Birrell and Dennis Fetterly", + title = "Dryad: Distributed Data-Parallel Programs from Sequential Building Blocks", + booktitle = "Proceedings of the ACM SIGOPS/EuroSys European Conference on Computer Systems 2007 (EuroSys 2007)", + address = "Lisbon, Portugal", + year = 2007, + pages = "59--72", +} + +@BOOK{JaJa_1992, + author = "Joseph JaJa", + title = "An Introduction to Parallel Algorithms", + publisher = "Addison-Wesley", + address = "Reading, Massachusetts", + year = 1992, +} + +@ARTICLE{JacobsAdam_2009, + author = "Adam Jacobs", + title = "The Pathologies of Big Data", + journal 
= "ACM Queue", + volume = 7, + number = 6, + year = 2009, +} + +@BOOK{Jurafsky_Martin_2009, + author = "Daniel Jurafsky and James H. Martin", + title = "Speech and Language Processing", + publisher = "Pearson", + address = "Upper Saddle River, New Jersey", + year = 2009, +} + +@TECHREPORT{KangU_etal_2008, + author = "U Kang and Charalampos Tsourakakis and Ana Paula Appel and Christos Faloutsos and Jure Leskovec", + title = "{HADI}: Fast Diameter Estimation and Mining in Massive Graphs with {Hadoop}", + number = "CMU-ML-08-117", + institution = "School of Computer Science, Carnegie Mellon University", + year = 2008, +} + +@INPROCEEDINGS{KangU_etal_2009, + author = "U Kang and Charalampos E. Tsourakakis and Christos Faloutsos", + title = "{PEGASUS}: A Peta-Scale Graph Mining System---Implementation and Observations", + booktitle = "Proceedings of the 2009 Ninth IEEE International Conference on Data Mining (ICDM 2009)", + year = 2009, + address = "Miami, Florida", + pages = "229--238", +} + +@INPROCEEDINGS{Karloff_etal_2010, + author = "Howard Karloff and Siddharth Suri and Sergei Vassilvitskii", + title = "A Model of Computation for {MapReduce}", + booktitle = "Proceedings of the 21st Annual ACM-SIAM Symposium on Discrete Algorithms (SODA 2010)", + year = 2010, + address = "Austin, Texas", +} + +@INPROCEEDINGS{KimballA_etal_2008, + author = "Aaron Kimball and Sierra Michels-Slettvet and Christophe Bisciglia", + title = "Cluster Computing for {Web}-Scale Data Processing", + booktitle = "Proceedings of the 39th ACM Technical Symposium on Computer Science Education (SIGCSE 2008)", + address = "Portland, Oregon", + year = 2008, + pages = "116--120", +} + +@ARTICLE{Kleinberg_JACM1999, + author = "Jon M.
Kleinberg", + title = "Authoritative Sources in a Hyperlinked Environment", + journal = "Journal of the ACM", + volume = 46, + number = 5, + pages = "604--632", + year = 1999, +} + +@ARTICLE{Lempel_Moran_TOIS2001, + author = "Ronny Lempel and Shlomo Moran", + title = "{SALSA}: The {Stochastic} {Approach} for {Link-Structure} {Analysis}", + journal = "ACM Transactions on Information Systems", + volume = 19, + number = 2, + pages = "131--160", + year = 2001, +} + +@ARTICLE{Leventhal_2009, + author = "Adam Leventhal", + title = "Triple-Parity {RAID} and Beyond", + journal = "ACM Queue", + volume = 7, + number = 11, + year = 2009, +} + +@ARTICLE{Lin_TOIS2007, + author = "Jimmy Lin", + title = "An Exploration of the Principles Underlying Redundancy-Based Factoid Question Answering", + journal = "ACM Transactions on Information Systems", + volume = 25, + number = 2, + pages = "1--55", + year = "2007", +} + +@INPROCEEDINGS{Lin_TeachCL2008, + author = "Jimmy Lin", + title = "Exploring Large-Data Issues in the Curriculum: A Case Study with {MapReduce}", + booktitle = "Proceedings of the Third Workshop on Issues in Teaching Computational Linguistics (TeachCL-08) at ACL 2008", + address = "Columbus, Ohio", + year = 2008, + pages = "54--61", +} + +@INPROCEEDINGS{Lin_EMNLP2008, + author = "Jimmy Lin", + title = "Scalable Language Processing Algorithms for the Masses: A Case Study in Computing Word Co-occurrence Matrices with {MapReduce}", + booktitle = "Proceedings of the 2008 Conference on Empirical Methods in Natural Language Processing (EMNLP 2008)", + address = "Honolulu, Hawaii", + year = 2008, + pages = "419--428", +} + +@TECHREPORT{Lin_etal_TR2009, + author = "Jimmy Lin and Anand Bahety and Shravya Konda and Samantha Mahindrakar", + title = "Low-Latency, High-Throughput Access to Static Global Resources within the {Hadoop} Framework", + number = "HCIL-2009-01", + institution = "University of Maryland", + address = "College Park, Maryland", + month = "January", + year =
2009, +} + +@INPROCEEDINGS{Malewicz_etal_2009, + author = "Grzegorz Malewicz and Matthew H. Austern and Aart J. C. Bik and James C. Dehnert and Ilan Horn and Naty Leiser and Grzegorz Czajkowski", + title = "{Pregel}: A System for Large-Scale Graph Processing", + booktitle = "Proceedings of the 28th ACM Symposium on Principles of Distributed Computing (PODC 2009)", + address = "Calgary, Alberta, Canada", + year = 2009, + pages = "6", +} + +@INPROCEEDINGS{Malewicz_etal_SIGMOD2010, + author = "Grzegorz Malewicz and Matthew H. Austern and Aart J. C. Bik and James C. Dehnert and Ilan Horn and Naty Leiser and Grzegorz Czajkowski", + title = "{Pregel}: A System for Large-Scale Graph Processing", + booktitle = "Proceedings of the 2010 ACM SIGMOD International Conference on Management of Data", + address = "Indianapolis, Indiana", + year = 2010, +} + +@BOOK{Manning_Schutze_1999, + author = "Christopher D. Manning and Hinrich {Sch\"{u}tze}", + title = "Foundations of Statistical Natural Language Processing", + publisher = "MIT Press", + address = "Cambridge, Massachusetts", + year = 1999, +} + +@BOOK{Manning_etal_2008, + author = "Christopher D. Manning and Prabhakar Raghavan and Hinrich {Sch\"{u}tze}", + title = "An Introduction to Information Retrieval", + publisher = "Cambridge University Press", + address = "Cambridge, England", + year = 2008, +} + +@ARTICLE{Mardis_2008, + author = "Elaine R. Mardis", + title = "The Impact of Next-Generation Sequencing Technology on Genetics", + journal = "Trends in Genetics", + volume = 24, + number = 3, + pages = "133--141", + year = 2008, +} + +@ARTICLE{McCool_2008, + author = "Michael D. McCool", + title = "Scalable Programming Models for Massively Multicore Processors", + journal = "Proceedings of the IEEE", + volume = 96, + number = 5, + pages = "816--831", + year = 2008, +} + +@ARTICLE{McKusick_Quinlan_2009, + author = "Marshall K. 
McKusick and Sean Quinlan", + title = "{GFS}: Evolution on Fast-forward", + journal = "ACM Queue", + volume = 7, + number = 7, + year = 2009, +} + +@ARTICLE{Mellor-Crummey_etal_2001, + author = "John Mellor-Crummey and David Whalley and Ken Kennedy", + title = "Improving Memory Hierarchy Performance for Irregular Applications Using Data and Computation Reorderings", + journal = "International Journal of Parallel Programming", + volume = 29, + number = 3, + pages = "217--247", + year = 2001, +} + +@INPROCEEDINGS{Metzler_etal_2009, + author = "Donald Metzler and Jasmine Novak and Hang Cui and Srihari Reddy", + title = "Building Enriched Document Representations Using Aggregated Anchor Text", + booktitle = "Proceedings of the 32nd Annual International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR 2009)", + year = 2009, + pages = "219--226", +} + +@INPROCEEDINGS{MillerD99, + author = "David R. H. Miller and Tim Leek and Richard M. Schwartz", + title = "A Hidden {Markov} Model Information Retrieval System", + booktitle = "Proceedings of the 22nd Annual International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR 1999)", + address = "Berkeley, California", + year = 1999, + pages = "214--221", +} + +@INPROCEEDINGS{Moffat_etal_SIGIR2006, + author = "Alistair Moffat and William Webber and Justin Zobel", + title = "Load Balancing for Term-Distributed Parallel Retrieval", + booktitle = "Proceedings of the 29th Annual International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR 2006)", + address = "Seattle, Washington", + year = 2006, + pages = "348--355", +} + +@INPROCEEDINGS{Nurmi_etal_2009, + author = "Daniel Nurmi and Rich Wolski and Chris Grzegorczyk and Graziano Obertelli and Sunil Soman and Lamia Youseff and Dmitrii Zagorodnov", + title = "The {Eucalyptus} Open-Source Cloud-Computing System", + booktitle = "Proceedings of the 9th IEEE/ACM International Symposium on 
Cluster Computing and the Grid", + year = 2009, + address = "Washington, D.C.", + pages = "124--131", +} + +@INPROCEEDINGS{Olston_etal_SIGMOD2008, + author = "Christopher Olston and Benjamin Reed and Utkarsh Srivastava and Ravi Kumar and Andrew Tomkins", + title = "{Pig} {Latin}: A Not-So-Foreign Language for Data Processing", + booktitle = "Proceedings of the 2008 ACM SIGMOD International Conference on Management of Data", + address = "Vancouver, British Columbia, Canada", + year = 2008, + pages = "1099--1110", +} + +@article{Olston_Najork_2010, + author = "Christopher Olston and Marc Najork", + title = "Web Crawling", + journal = "Foundations and Trends in Information Retrieval", + volume = 4, + number = 3, + pages = "175--246", + year = 2010 +} + +@ARTICLE{Olukotun_Hammond_2005, + author = "Kunle Olukotun and Lance Hammond", + title = "The Future of Microprocessors", + journal = "ACM Queue", + volume = 3, + number = 7, + pages = "27--34", + year = 2005, +} + +@ARTICLE{Pang_Lee_2008, + author = "Bo Pang and Lillian Lee", + title = "Opinion Mining and Sentiment Analysis", + journal = "Foundations and Trends in Information Retrieval", + volume = 2, + number = "1--2", + pages = "1--135", + year = 2008, +} + +@TECHREPORT{Page_etal_1999, + author = "Lawrence Page and Sergey Brin and Rajeev Motwani and Terry Winograd", + title = "The {PageRank} Citation Ranking: Bringing Order to the {Web}", + type = "Stanford Digital Library Working Paper", + number = "SIDL-WP-1999-0120", + institution = "Stanford University", + year = 1999, +} + +@ARTICLE{Patterson_CACM2008, + author = "David A. Patterson", + title = "The Data Center is the Computer", + journal = "Communications of the ACM", + volume = 51, + number = 1, + pages = "105", + year = 2008, +} + +@INPROCEEDINGS{Pavlo_etal_SIGMOD2009, + author = "Andrew Pavlo and Erik Paulson and Alexander Rasin and Daniel J. Abadi and David J.
DeWitt and Samuel Madden and Michael Stonebraker", + title = "A Comparison of Approaches to Large-Scale Data Analysis", + booktitle = "Proceedings of the 35th ACM SIGMOD International Conference on Management of Data", + year = 2009, + pages = "165--178", + address = "Providence, Rhode Island", +} + +@ARTICLE{Pike_etal_2005, + author = "Rob Pike and Sean Dorward and Robert Griesemer and Sean Quinlan", + title = "Interpreting the Data: Parallel Analysis with {Sawzall}", + journal = "Scientific Programming Journal", + volume = 13, + number = 4, + pages = "277--298", + year = 2005, +} + +@INPROCEEDINGS{Pinheiro_etal_2007, + author = "Eduardo Pinheiro and Wolf-Dietrich Weber and Luiz {Andr\'{e}} Barroso", + title = "Failure Trends in a Large Disk Drive Population", + booktitle = "Proceedings of the 5th USENIX Conference on File and Storage Technologies (FAST 2007)", + year = 2007, + address = "San Jose, California", +} + +@article{qiACS09, + author = "Xiaoguang Qi and Brian D. Davison", + title = "Web Page Classification: Features and Algorithms", + journal = "ACM Computing Surveys", + volume = "41", + number = "2", + year = "2009" +} + +@ARTICLE{Rafique_etal_2009, + author = "M. Mustafa Rafique and Benjamin Rose and Ali R. Butt and Dimitrios S.
Nikolopoulos", + title = "Supporting {MapReduce} on Large-Scale Asymmetric Multi-Core Clusters", + journal = "ACM Operating Systems Review", + volume = 43, + number = 2, + pages = "25--34", + year = 2009, +} + +@INPROCEEDINGS{Ranger_etal_2007, + author = "Colby Ranger and Ramanan Raghuraman and Arun Penmetsa and Gary Bradski and Christos Kozyrakis", + title = "Evaluating {MapReduce} for Multi-core and Multiprocessor Systems", + booktitle = "Proceedings of the 13th International Symposium on High-Performance Computer Architecture (HPCA 2007)", + address = "Phoenix, Arizona", + year = 2007, + pages = "205--218", +} + +@INPROCEEDINGS{Rao_Yarowsky_2009, + author = "Delip Rao and David Yarowsky", + title = "Ranking and Semi-supervised Classification on Large Scale Graphs Using {Map-Reduce}", + booktitle = "Proceedings of the \mbox{ACL/IJCNLP} 2009 Workshop on Graph-Based Methods for Natural Language Processing (TextGraphs-4)", + year = 2009, + address = "Singapore", +} + +@ARTICLE{Rappa_2004, + author = "Michael A. Rappa", + title = "The Utility Business Model and the Future of Computing Services", + journal = "IBM Systems Journal", + volume = 43, + number = 1, + pages = "32--42", + year = 2004, +} + +@INPROCEEDINGS{Sandholm_Lai_2009, + author = "Thomas Sandholm and Kevin Lai", + title = "{MapReduce} Optimization Using Regulated Dynamic Prioritization", + booktitle = "Proceedings of the Eleventh International Joint Conference on Measurement and Modeling of Computer Systems (\mbox{SIGMETRICS} '09)", + address = "Seattle, Washington", + year = 2009, + pages = "299--310", +} + +@PHDTHESIS{Schatz_2010, + author = "Michael Schatz", + title = "High Performance Computing for {DNA} Sequence Alignment and Assembly", + school = "University of Maryland, College Park", + year = 2010, +} + +@INPROCEEDINGS{Schneider_DeWitt_SIGMOD1989, + author = "Donovan A. Schneider and David J.
DeWitt", + title = "A Performance Evaluation of Four Parallel Join Algorithms in a Shared-Nothing Multiprocessor Environment", + booktitle = "Proceedings of the 1989 ACM SIGMOD International Conference on Management of Data", + address = "Portland, Oregon", + year = 1989, + pages = "110--121", +} + +@INPROCEEDINGS{Schmuck_Haskin_2002, + author = "Frank Schmuck and Roger Haskin", + title = "{GPFS}: A Shared-Disk File System for Large Computing Clusters", + booktitle = "Proceedings of the First USENIX Conference on File and Storage Technologies", + year = 2002, + pages = "231--244", + address = "Monterey, California", +} + +@INPROCEEDINGS{Schroeder_etal_2009, + author = "Bianca Schroeder and Eduardo Pinheiro and Wolf-Dietrich Weber", + title = "{DRAM} Errors in the Wild: A Large-Scale Field Study", + booktitle = "Proceedings of the Eleventh International Joint Conference on Measurement and Modeling of Computer Systems (\mbox{SIGMETRICS} '09)", + year = 2009, + address = "Seattle, Washington", + pages = "193--204", +} + +@ARTICLE{Schutze_CL1998, + author = "Hinrich Sch{\"{u}}tze", + title = "Automatic Word Sense Discrimination", + journal = "Computational Linguistics", + volume = 24, + number = 1, + pages = "97--123", + year = 1998, +} + +@ARTICLE{Schutze_Pedersen_IPM1997, + author = "Hinrich Sch{\"{u}}tze and Jan O. 
Pedersen", + title = "A Cooccurrence-Based Thesaurus and Two Applications to Information Retrieval", + journal = "Information Processing and Management", + volume = 33, + number = 3, + pages = "307--318", + year = 1997, +} + +@BOOK{Sekine_Ranchhod_2009, + author = "Satoshi Sekine and Elisabete Ranchhod", + title = "Named Entities: Recognition, Classification and Use", + publisher = "John Benjamins", + address = "Amsterdam, The Netherlands", + year = 2009, +} + +@INCOLLECTION{Southan_Cameron_2009, + title = "Beyond the Tsunami: Developing the Infrastructure to Deal with Life Sciences Data", + author = "Christopher Southan and Graham Cameron", + editor = "Tony Hey and Stewart Tansley and Kristin Tolle", + booktitle = "The Fourth Paradigm: Data-Intensive Scientific Discovery", + publisher = "Microsoft Research", + address = "Redmond, Washington", + year = 2009, +} + +@ARTICLE{Stonebraker_etal_CACM2010, + author = "Michael Stonebraker and Daniel Abadi and David J. DeWitt and Sam Madden and Erik Paulson and Andrew Pavlo and Alexander Rasin", + title = "{MapReduce} and Parallel {DBMSs}: Friends or Foes?", + journal = "Communications of the ACM", + volume = 53, + number = 1, + pages = "64--71", + year = 2010, +} + +@ARTICLE{Szalay_etal_2000, + author = "Alexander S. Szalay and Peter Z. Kunszt and Ani Thakar and Jim Gray and Don Slutz and Robert J. Brunner", + title = "Designing and Mining Multi-Terabyte Astronomy Archives: The {Sloan} {Digital} {Sky} {Survey}", + journal = "SIGMOD Record", + volume = 29, + number = 2, + pages = "451--462", + year = 2000, +} + +@TECHREPORT{Tantisiriroj_etal_2008, + author = "Wittawat Tantisiriroj and Swapnil Patil and Garth Gibson", + title = "Data-intensive File systems for {Internet} Services: A Rose By Any Other Name\ldots", + number = "CMU-PDL-08-114", + institution = "Parallel Data Laboratory, Carnegie Mellon University", + year = 2008, +} + +@INPROCEEDINGS{Thekkath_etal_SOSP1997, + author = "Chandramohan A. 
Thekkath and Timothy Mann and Edward K. Lee", + title = "{Frangipani}: A Scalable Distributed File System", + booktitle = "Proceedings of the 16th ACM Symposium on Operating Systems Principles (SOSP 1997)", + year = 1997, + pages = "224--237", + address = "Saint-Malo, France", +} + +@ARTICLE{Valiant_CACM1990, + author = "Leslie G. Valiant", + title = "A Bridging Model for Parallel Computation", + journal = "Communications of the ACM", + volume = 33, + number = 8, + pages = "103--111", + year = 1990, +} + +@ARTICLE{Vaquero_etal_2009, + author = "Luis M. Vaquero and Luis Rodero-Merino and Juan Caceres and Maik Lindner", + title = "A Break in the Clouds: Towards a Cloud Definition", + journal = "ACM SIGCOMM Computer Communication Review", + volume = 39, + number = 1, + pages = "50--55", + year = 2009, +} + +@ARTICLE{Watts_Strogatz_1998, + author = "Duncan J. Watts and Steven H. Strogatz", + title = "Collective Dynamics of `Small-World' Networks", + journal = "Nature", + volume = 393, + pages = "440--442", + year = 1998, +} + +@INPROCEEDINGS{Wen_Vishkin_2008, + author = "Xingzhi Wen and Uzi Vishkin", + title = "{FPGA}-Based Prototype of a {PRAM-On-Chip} Processor", + booktitle = "Proceedings of the 5th Conference on Computing Frontiers", + address = "Ischia, Italy", + year = 2008, + pages = "55--66", +} + +@ARTICLE{Wigner_1960, + author = "Eugene Wigner", + title = "The Unreasonable Effectiveness of Mathematics in the Natural Sciences", + journal = "Communications on Pure and Applied Mathematics", + volume = 13, + number = 1, + pages = "1--14", + year = 1960, +} + +@BOOK{Witten_etal_1999, + author = "Ian H. Witten and Alistair Moffat and Timothy C. 
Bell", + title = "Managing Gigabytes: Compressing and Indexing Documents and Images", + publisher = "Morgan Kaufmann Publishing", + address = "San Francisco, California", + year = 1999, +} + +@BOOK{White_2009, + title = "{Hadoop}: The Definitive Guide", + author = "Tom White", + publisher = "O'Reilly", + address = "Sebastopol, California", + year = 2009, +} + +@ARTICLE{Xu_Croft_TOIS1998, + author = "Jinxi Xu and W. Bruce Croft", + title = "Corpus-Based Stemming Using Cooccurrence of Word Variants", + journal = "ACM Transactions on Information Systems", + volume = 16, + number = 1, + pages = "61--81", + year = 1998, +} + +@ARTICLE{XuR_Wunsch_2005b, + author = "Rui Xu and Donald Wunsch II", + title = "Survey of Clustering Algorithms", + journal = "IEEE Transactions on Neural Networks", + volume = 16, + number = 3, + pages = "645--678", + year = 2005, +} + +@INPROCEEDINGS{YangHungchih_etal_SIGMOD2007, + author = "Hung-chih Yang and Ali Dasdan and Ruey-Lung Hsiao and D. Stott Parker", + title = "{Map-Reduce-Merge}: Simplified Relational Data Processing on Large Clusters", + booktitle = "Proceedings of the 2007 ACM SIGMOD International Conference on Management of Data", + address = "Beijing, China", + year = 2007, + pages = "1029--1040", +} + +@INPROCEEDINGS{YuYuan_etal_OSDI2008, + author = "Yuan Yu and Michael Isard and Dennis Fetterly and Mihai Budiu and {\'{U}lfar} Erlingsson and Pradeep Kumar Gunda and Jon Currey", + title = "{DryadLINQ}: A System for General-Purpose Distributed Data-Parallel Computing Using a High-Level Language", + booktitle = "Proceedings of the 8th Symposium on Operating System Design and Implementation (OSDI 2008)", + address = "San Diego, California", + year = 2008, + pages = "1--14", +} + +@INPROCEEDINGS{Zaharia_etal_OSDI2008, + author = "Matei Zaharia and Andy Konwinski and Anthony D. 
Joseph and Randy Katz and Ion Stoica", + title = "Improving {MapReduce} Performance in Heterogeneous Environments", + booktitle = "Proceedings of the 8th Symposium on Operating System Design and Implementation (OSDI 2008)", + address = "San Diego, California", + year = 2008, + pages = "29--42", +} + +@TECHREPORT{Zaharia_etal_2009, + author = "Matei Zaharia and Dhruba Borthakur and Joydeep Sen Sarma and Khaled Elmeleegy and Scott Shenker and Ion Stoica", + title = "Job Scheduling for Multi-User {MapReduce} Clusters", + number = "UCB/EECS-2009-55", + institution = "Electrical Engineering and Computer Sciences, University of California at Berkeley", + year = 2009, +} + +@ARTICLE{Zobel_Moffat_2006, + author = "Justin Zobel and Alistair Moffat", + title = "Inverted Files for Text Search Engines", + journal = "ACM Computing Surveys", + volume = 38, + number = 2, + pages = "1--56", + year = 2006, +} diff --git a/ed1n/chapter4-indexing.tex b/ed1n/chapter4-indexing.tex index 1bee2b9..ad10684 100644 --- a/ed1n/chapter4-indexing.tex +++ b/ed1n/chapter4-indexing.tex @@ -587,7 +587,7 @@ \section{Index Compression} detour into compression techniques, particularly for coding integers. Compression, in general, can be characterized as either \emph{lossless} -or \emph{lossy}:\ it's fairly obvious that loseless compression is +or \emph{lossy}:\ it's fairly obvious that lossless compression is required in this context. To start, it is important to understand that all compression techniques represent a time--space tradeoff. 
That is, we reduce the amount of space on disk necessary to store diff --git a/ed1n/chapter5-graphs.tex b/ed1n/chapter5-graphs.tex index 4695d42..d3a2bcc 100644 --- a/ed1n/chapter5-graphs.tex +++ b/ed1n/chapter5-graphs.tex @@ -215,8 +215,8 @@ \section{Parallel Breadth-First Search} As a refresher and also to serve as a point of comparison, Dijkstra's algorithm is shown in Algorithm~\ref{algorithm:chapter-graphs:Dijkstra}, -adapted from Cormen, Leiserson, and Rivest's classic algorithms -textbook~\cite{CLR} (often simply known as \emph{CLR}). The input to +adapted from Cormen, Leiserson, Rivest, and Stein's classic algorithms +textbook~\cite{CLRS} (often simply known as \emph{CLRS}). The input to the algorithm is a directed, connected graph $G=(V,E)$ represented with adjacency lists, $w$ containing edge distances such that $w(u,v) \geq 0$, and the source node $s$. The algorithm begins by first @@ -281,7 +281,7 @@ \section{Parallel Breadth-First Search} A sample trace of the algorithm running on a simple graph is shown in Figure~\ref{figure:chapter-graphs:Dijkstra-example} (example also adapted -from \emph{CLR}). We start out in (a) with $n_1$ having a +from \emph{CLRS}). We start out in (a) with $n_1$ having a distance of zero (since it's the source) and all other nodes having a distance of $\infty$. In the first iteration (a), $n_1$ is selected as the node to expand (indicated by the thicker border). After the