diff --git a/.gitignore b/.gitignore index 364d3f76f6427..01cccf5507345 100644 --- a/.gitignore +++ b/.gitignore @@ -52,6 +52,9 @@ missing py-compile release stamp-h1 +systemd/ceph-osd@.service +systemd/ceph-rgw.tmpfiles.d +systemd/Makefile vgcore.* # specific local dir files @@ -64,7 +67,7 @@ vgcore.* /ceph-[0-9]*/ # M4 Macro directory -m4/ +/m4/ # where is this from? web/*.html diff --git a/.gitmodule_mirrors b/.gitmodule_mirrors index 2a926cbb7f1b3..0e10b99a9a601 100644 --- a/.gitmodule_mirrors +++ b/.gitmodule_mirrors @@ -2,10 +2,10 @@ # Only used by autobuild-ceph. ceph-object-corpus git://apt-mirror.front.sepia.ceph.com/ceph-object-corpus.git src/civetweb git://apt-mirror.front.sepia.ceph.com/civetweb.git -src/libs3 git://apt-mirror.front.sepia.ceph.com/libs3.git src/mongoose git://apt-mirror.front.sepia.ceph.com/mongoose.git src/leveldb git://apt-mirror.front.sepia.ceph.com/leveldb.git src/erasure-code/jerasure/jerasure git://apt-mirror.front.sepia.ceph.com/jerasure.git src/erasure-code/jerasure/gf-complete git://apt-mirror.front.sepia.ceph.com/gf-complete.git src/rocksdb git://apt-mirror.front.sepia.ceph.com/rocksdb.git ceph-erasure-code-corpus git://apt-mirror.front.sepia.ceph.com/ceph-erasure-code-corpus.git +src/gmock git://apt-mirror.front.sepia.ceph.com/gmock.git diff --git a/.gitmodules b/.gitmodules index acd9d513d14e6..e7c52a91baffe 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,12 +1,9 @@ [submodule "ceph-object-corpus"] path = ceph-object-corpus - url = git://ceph.com/git/ceph-object-corpus.git -[submodule "src/libs3"] - path = src/libs3 - url = git://github.com/ceph/libs3.git + url = https://github.com/ceph/ceph-object-corpus.git [submodule "src/civetweb"] path = src/civetweb - url = git://github.com/ceph/civetweb + url = https://github.com/ceph/civetweb [submodule "src/erasure-code/jerasure/jerasure"] path = src/erasure-code/jerasure/jerasure url = https://github.com/ceph/jerasure.git @@ -17,9 +14,12 @@ branch = v2-ceph [submodule "src/rocksdb"] path = src/rocksdb - url = git://github.com/ceph/rocksdb + url = https://github.com/ceph/rocksdb ignore = dirty [submodule "ceph-erasure-code-corpus"] path = ceph-erasure-code-corpus url = https://github.com/ceph/ceph-erasure-code-corpus.git - +[submodule "src/gmock"] + path = src/gmock + url = https://github.com/ceph/gmock.git + branch = ceph-release-1.7.0 diff --git a/.mailmap b/.mailmap index 9771c1194f87a..3e8b2788dda53 100644 --- a/.mailmap +++ b/.mailmap @@ -5,8 +5,10 @@ # See .organizationmap for organization affiliation # See .peoplemap for unique list of people # -Ailing Zhang +# Abhishek Lekshmanan +Ahoussi Armand delco225 +Ailing Zhang Alexandre Marangone Alexandre Maragone Alexandre Marangone Alexandre Oliva @@ -14,18 +16,28 @@ Alexandre Oliva Alex Elder Alex Elder Alex Elder +Alexis Normand smagtony@gmail.com Alfredo Deza Andreas Peters +Andrew Bartlett Andrew Bartlett Andrew Leung anwleung +Andy Allan +Anis Ayari Anols Aristoteles Neto Ashish Chandra +Baptiste Veuillez +Billy Olsen +Boris Ranto branto1 Brian Rak devicenull Caleb Miles caleb miles Caleb Miles caleb miles Caleb Miles Caleb Miles -Casey Marshall rsdio Carlos Maltzahn carlosm +Casey Marshall rsdio +Chendi Xue Chendi.Xue +Chendi Xue Chendi Xue Cheng Cheng cchengleo +Chris Holcombe cholcombe973 Christian Brunner Christian Marie Christophe Courtaut Kri5 @@ -33,93 +45,118 @@ Colin P. McCabe Colin P. 
McCabe Dan Chai danchai Dan Mick -Dan Mick +Dan Mick +Dan Mick Danny Al-Gaaf David Moreau Simard Dmitry Smirnov +Dmitry Yatsushkevich Dominik Hannen Dominik Hannen Eric Mourgaya Erwin, Brock A Esteban Molina-Estolano eestolan -François Lafont François Lafont +Federico Gimenez fgimenez +Feng Wang cyclonew Florent Bautista FlorentCoppint Florent Flament -Federico Gimenez fgimenez +Florian Coste nairolf21 +Florian Marsylle +François Lafont Gary Lowell Gary Lowelll Gary Lowell Gary Lowell +Gaurav Kumar Garg Gerben Meijer Greg Farnum +Greg Farnum +Greg Farnum Greg Farnum Greg Farnu Greg Farnum Greg Farnum Greg Farnum -Greg Farnum Greg Farnum Greg Farnum Gregory Farnum Greg Farnum Reviewed-by: Greg Farnum Guang Yang Guang Yang Guang Yang Guilhem Lettron -Feng Wang cyclonew -Huamin Chen rootfs +Haomai Wang +Hazem Amara Henry C Chang Holger Macht +Huamin Chen rootfs Huang Jun huang jun Huang Jun huangjun Ilya Dryomov +Ilja Slepnev +Ismael Serrano +Jean-Charles Lopez jeanchlopez Jiang Heng jiangheng Jiantao He hejiantao5 -João Eduardo Luís +Jian Wen João Eduardo Luís Joao Eduardo Luis João Eduardo Luís João Eduardo Luís Joao Luis +João Eduardo Luís João Eduardo Luís João Eduardo Luís Joao Eduardo Luis João Eduardo Luís Joao Eduardo Luis +Joao Eduardo Luis Joe Buck +John Spray John Spray john John Spray -John Spray +Johnu George John Wilkins John Wilkins John Wilkins -John Wilkins -John Wilkins John Wilkins -Johnu George +John Wilkins +John Wilkins +Josh Durgin Josh During Josh Durgin Josh Durgin Kacper Kowalik Kacper Kowalik (Xarthisius) +Kefu Chai +Kiseleva Alyona Ved-vampir Laszlo Boszormenyi Laszlo Boszormenyi (GCS) Lluis Pamies-Juarez +Loic Dachary +Loic Dachary Loic Dachary Loic Dachary Loïc Dachary -Loic Dachary +Ma Jianpeng Jianpeng Ma Ma Jianpeng Ma Jianpeng Ma, Jianpeng -Ma Jianpeng Jianpeng Ma Marco Garcês Marco Garcês Mark Nelson +Matt Benjamin Matthew Roy Matthew Roy Matthew Wodrich Michael Riederer dynamike67 Michael Rodriguez Michael Rodriguez +Mykola Golub Mykola Golub +Nathan Cutler Neil Levine +Ning Yao Noah Watkins Noah Watkins +Neha Ummareddy nehaummareddy Pascal de Bruijn Pascal de Bruijn | Unilogic Networks B.V Patience Warnick -Patrick McGarry Patrick McGarry scuttlemonkey +Patrick McGarry Pavan Rallabhandi Pavan Rallabhandi +Pete Zaitcev +Riccardo Ferretti rferrett Roald J. van Loon +Robert Jansen +Robin Dehu Ron Allred rallred Ross Turk Ross Turk -Riccardo Ferretti rferrett Sage Weil Sage Weil Sage Weil @@ -131,45 +168,57 @@ Sage Weil Sage Weil Sage Weil Sage Weil +Sage Weil Reviewed-by: Sage Weil Sage Weil Sage Weil Sahid Orentino Ferdjaoui Sam Lang -Samuel Just -Samuel Just Samuel Just +Samuel Just +Samuel Just Samuel Just Sam Just Samuel Just Samuel Just Samuel Just Samuel Just -Samuel Just -Scott A. Brandt sbrandt +Samuel Just Sandon Van Ness SandonV +Sandon Van Ness +Scott A. Brandt sbrandt Sebastien Han Sebastien Ponce Sebastien Ponce -Shu, Xinxin xinxinsh +Sergey Arkhipov 9seconds +Shanggao Qiu qiushanggao Shu, Xinxin xinxin shu +Shu, Xinxin xinxinsh Stephen F Taylor Sushma Gurram sushma Swami Reddy M Ranga Swami Reddy Swami Reddy Sylvain Munaut +Takeshi Miyamae t-miyamae Tamil Muthamizhan tamil Tamil Muthamizhan Tamil Muthamizhan Thomas Bechtold +Thomas Cantin ThomasCantin +Thomas Johnson Tommi Virtanen Tommi Virtanen Tommi Virtanen +Travis Rhoden Tyler Brekke Concubidated Volker Assmann Volker Assmann +Varada Kari Walter Huf Walter J. 
Huf Wang, Yaguang ywang19 Warren Usui wusui Wei Luo luowei Wido den Hollander Xan Peng xan +Xavier Roche +Xie Rui <875016668@qq.com> Jerry7X <875016668@qq.com> +Xingyi Wu Yan, Zheng Yan, Zheng Zheng Yan Yan, Zheng Zheng, Yan @@ -183,10 +232,13 @@ Yehuda Sadeh Yehuda Sadeh Yehuda Sadeh Yehuda Sadeh +Yehuda Sadeh Yehuda Sadeh Yongyue Sun Abioy Yuan Zhou +Zhi (David) Zhang +Zhi (David) Zhang Zhi Z Zhang Zhiqiang Wang Signed-off-by: Zhiqiang Wang +Zhiqiang Wang Wang, Zhiqiang Zhiqiang Wang Zhiqiang Wang Zhiqiang Wang Zhiqiang Wang -Chendi Xue Chendi Xue diff --git a/.organizationmap b/.organizationmap index 24da7b956a9ff..3b64ad1b28a63 100644 --- a/.organizationmap +++ b/.organizationmap @@ -23,27 +23,35 @@ # 9 27 TCloud Computing # 10 22 GNU # +Acaleph Alistair Israel +Alcatel Lucent Joseph McDonald Anchor Hosting Christian Marie Anchor Hosting Sharif Olorin ArtiBit Rutger ter Borg Bayan Mohammad Salehe BCI Marco Garcês Bigpoint.com Moritz Möller +Canonical Billy Olsen Canonical Chris Glass Canonical James Page Canonical Jonathan Davies Carnegie Mellon University Jan Harkes +Catalyst Andrew Bartlett CCM Benchmark Laurent Barbe CERN Andreas Peters CERN Dan van der Ster +CERN Hervé Rousseau +CERN Joaquim Rocha CERN Sebastien Ponce Choopa, LLC Adam Twardowski -CISCO Kai Zhang CISCO Johnu George +CISCO Kai Zhang Cloudwatt Christophe Courtaut Cloudwatt Florent Flament Cloudwatt Loic Dachary Cloudwatt Sahid Orentino Ferdjaoui +CohortFS, LLC Matt Benjamin +CohortFS, LLC Casey Bodley Commerce Guys Nikola Kotur Corvisa LLC Walter Huf Credit Mutuel Arkea Eric Mourgaya @@ -72,6 +80,10 @@ DreamHost Wesley Spikes eNovance Babu Shanmugam eNovance Sebastien Han EPAM Andrey Kuznetsov +Exalead +Fairbanks Robert Jansen +Fujitsu Piotr Dałek +Fujitsu Takeshi Miyamae GameServers.com Brian Rak Gentoo Kacper Kowalik Gentoo Robin H. Johnson @@ -84,9 +96,13 @@ Hastexo Florian Haas HGST Kevin Dalley HGST Lluis Pamies-Juarez Hostplex Hosting Andras Elso +HP Blaine Gardner +Igalia Javier M. Mellid +Imagination Technologies Ltd. Alistair Strachan iNic Bjørnar Ness Inktank Alexandre Marangone Inktank Alex Elder +Inktank Alfredo Deza Inktank Caleb Miles Inktank Dan Mick Inktank David Zafman @@ -117,23 +133,35 @@ Inktank Tommi Virtanen Inktank Tyler Brekke Inktank Warren Usui Inktank Yehuda Sadeh +Inktank Yuri Weinstein +Intel Chendi Xue Intel Ma Jianpeng Intel Shu, Xinxin Intel Wang, Yaguang -Intel Yan, Zheng Intel Xiaoxi Chen -Intel Zhiqiang Wang +Intel Yan, Zheng Intel Yuan Zhou -Intel Chendi Xue +Intel Zhiqiang Wang Iron Systems Inc. Harpreet Dhillon IT Refined Ron Allred IWeb David Moreau Simard -Keeper Technology Wyllys Ingersoll Karlsruhe Institute of Technology Daniel J. 
Hofmann +Keeper Technology Wyllys Ingersoll Lebanon Evangelical School Jonathan Dieter +Linaro Yazen Ghannam +Linaro Steve Capper Los Alamos National Laboratory Esteban Molina-Estolano +Mellanox Vu Pham Mirantis Andrew Woodward +Mirantis Dmitry Yatsushkevich +Mirantis Dmytro Iurchenko +Mirantis Kiseleva Alyona +Mirantis Mykola Golub +Mirantis Radoslaw Zarzynski MIT Computer Science and Artificial Intelligence Laboratory Stephen Jahl +MSys Technologies Rajesh Nambiar +Nebula Chris Holcombe +Nebula Anton Aksola Opower Derrick Schneider Pacific Northwest National Laboratory Brown, David M JR Pacific Northwest National Laboratory Erwin, Brock A @@ -141,111 +169,188 @@ Pacific Northwest National Laboratory Evan Felix Scott Devoid Piston Cloud Computing Mike Lundy Pogoapp Paul Meserve -Red Hat Alexandre Marangone +Red Hat Alexandre Marangone Red Hat Alex Elder Red Hat Alfredo Deza Red Hat Boris Ranto Red Hat Dan Mick Red Hat David Zafman +Red Hat Douglas Fuller Red Hat Federico Simoncelli Red Hat Gary Lowell Red Hat Greg Farnum +Red Hat Gregory Meno +Red Hat Haïkel Guémar Red Hat Huamin Chen Red Hat Ilya Dryomov Red Hat Jason Dillaman +Red Hat Jean-Charles Lopez Red Hat João Eduardo Luís Red Hat John Spray Red Hat John Wilkins Red Hat Josh Durgin Red Hat JuanJose 'JJ' Galvez +Red Hat Kefu Chai Red Hat Ken Dreyer -Red Hat Loic Dachary Red Hat Loic Dachary +Red Hat Loic Dachary Red Hat Luis Pabón Red Hat Mark Nelson Red Hat Neil Levine +Red Hat Nilamdyuti Goswami Red Hat Noah Watkins +Red Hat Orit Wasserman Red Hat Patrick McGarry +Red Hat Pete Zaitcev +Red Hat Petr Machata Red Hat Ross Turk Red Hat Sage Weil Red Hat Sahid Orentino Ferdjaoui Red Hat Sam Lang Red Hat Samuel Just Red Hat Sandon Van Ness +Red Hat Shylesh Kumar Red Hat Tamil Muthamizhan Red Hat Tom Callaway +Red Hat Travis Rhoden Red Hat Tyler Brekke Red Hat Venky Shankar +Red Hat Vasu Kulkarni Red Hat Warren Usui Red Hat Yan, Zheng Red Hat Yehuda Sadeh +Red Hat Yuri Weinstein Reliance Jio Infocomm Ltd. Abhishek Lekshmanan Reliance Jio Infocomm Ltd. Ashish Chandra Reliance Jio Infocomm Ltd. Swami Reddy Roald van Loon Consultancy Roald J. van Loon +Ruijie Networks Ning Yao Sandia National Laboratories Jim Schutt SanDisk Allen Samuels SanDisk Anand Bhat SanDisk Pavan Rallabhandi +SanDisk Raju Kurunkad +SanDisk Shishir Gowda SanDisk Somnath Roy SanDisk Sushma Gurram +SanDisk Varada Kari Science & Technology Facilities Council George Ryall SendFaster Christopher O'Connell Spectra Logic Alan Somers +SUSE Adam Spiers SUSE David Disseldorp SUSE Hannes Reinecke SUSE Holger Macht +SUSE Joao Eduardo Luis +SUSE Karl Eichwalder +SUSE Nathan Cutler SUSE Owen Synge SUSE Thorsten Behrens +SUSE Tim Serong SWITCH Jens-Christian Fischer SWITCH Simon Leinen TCloud Computing CC Lien TCloud Computing Henry C Chang TCloud Computing Herb Shiu TCloud Computing Paul Chiang +Telecom Bretagne Ahoussi Armand +Telecom Bretagne Baptiste Veuillez +Telecom Bretagne Hazem Amara +Telecom Bretagne Thomas Cantin The Linux Box Adam C. 
Emerson +The Linux Box Ali Maredia The Linux Box Casey Bodley The Linux Box Matt Benjamin The University of Arizona James Ryan Cresawn Ubuntu Kylin Li Wang -Unaffiliated Yongyue Sun +Ubuntu Kylin Min Chen +Ubuntu Kylin MingXin Liu +Ubuntu Kylin Yunchuan Wen Unaffiliated Accela Zhao Unaffiliated Ailing Zhang +Unaffiliated Alexis Normand +Unaffiliated Andy Allan +Unaffiliated Anis Ayari +Unaffiliated Armando Segnini Unaffiliated BJ Lougee +Unaffiliated Bosse Klykken Unaffiliated Cheng Cheng -Unaffiliated Daniel Schepler +Unaffiliated Christos Stavrakakis Unaffiliated Colin Mattson Unaffiliated Dan Chai +Unaffiliated Daniel Schepler +Unaffiliated David Anderson +Unaffiliated Ding Dinghua Unaffiliated Dominik Hannen +Unaffiliated Dongmao Zhang Unaffiliated Erik Logtenberg +Unaffiliated Fabio Alessandro Locati Unaffiliated Federico Gimenez +Unaffiliated Feng He Unaffiliated Florent Bautista +Unaffiliated Florian Coste +Unaffiliated Florian Marsylle Unaffiliated François Lafont +Unaffiliated Frank Yu +Unaffiliated Jon Bernard +Unaffiliated Gaurav Kumar Garg Unaffiliated Huang Jun -Unaffiliated Jiang Heng +Unaffiliated Haomai Wang +Unaffiliated Henry Chang +Unaffiliated Ian Kelling +Unaffiliated Ilja Slepnev +Unaffiliated Ismael Serrano Unaffiliated Janne Grunau +Unaffiliated Javier Guerra +Unaffiliated Jiang Heng Unaffiliated Jiantao He +Unaffiliated Jian Wen +Unaffiliated Karel Striegel +Unaffiliated Kefu Chai +Unaffiliated Kernel Neophyte +Unaffiliated Ketor Meng Unaffiliated Kevin Cox +Unaffiliated Kim Vandry Unaffiliated koleosfuscus Unaffiliated Laurent Guerby +Unaffiliated Lee Revell +Unaffiliated Matt Richards Unaffiliated Mehdi Abaakouk Unaffiliated Michael Nelson Unaffiliated Michael Riederer +Unaffiliated Michal Jarzabek +Unaffiliated Neha Ummareddy Unaffiliated (no author) <(no author)@29311d96-e01e-0410-9327-a35deaab8ce9> +Unaffiliated Robin Dehu +Unaffiliated Rohan Mars Unaffiliated Roman Haritonov +Unaffiliated Sergey Arkhipov +Unaffiliated Shanggao Qiu Unaffiliated Shawn Edwards +Unaffiliated Simon Guinot Unaffiliated Stephen F Taylor Unaffiliated Steve Stock +Unaffiliated Tim Freund +Unaffiliated Thomas Johnson +Unaffiliated Vartika Rai +Unaffiliated Vicente Cheng +Unaffiliated Viktor Suprun Unaffiliated Volker Voigt Unaffiliated VRan Liu Unaffiliated William A. Kennington III -Unaffiliated xan +Unaffiliated Xingyi Wu +Unaffiliated Xan Peng +Unaffiliated Xie Rui <875016668@qq.com> +Unaffiliated Xinze Chi +Unaffiliated Xiong Yiliang Unaffiliated Yann Dupont +Unaffiliated Yongyue Sun +Unaffiliated Zhicheng Wei +Unaffiliated Zhe Zhang Unilogic Networks B.V Pascal de Bruijn +UnitedStack Dong Yuan UnitedStack Guangliang Zhao -UnitedStack Haomai Wang UnitedStack Kun Huang -UnitedStack Dong Yuan UnitedStack Rongze Zhu University of California, Santa Cruz Adam Crume University of California, Santa Cruz Andrew Leung @@ -256,11 +361,12 @@ University of California, Santa Cruz Joe Buck Noah Watkins University of California, Santa Cruz Riccardo Ferretti University of California, Santa Cruz Scott A. Brandt -University of Mississippi Derek Yarnell -University of Mississippi Liam Monahan -University of Mississippi Padraig O'Sullivan +University of Maryland Derek Yarnell +University of Maryland Liam Monahan +University of Maryland Padraig O'Sullivan University of Utah Xing Lin VRT Stuart Longland +Warpnet B.V. Gerhard Muntingh Web Drive Aristoteles Neto Whatever Sylvain Munaut Wido 42on Wido den Hollander @@ -268,9 +374,11 @@ X-ION Mouad Benchchaoui X-ION Stephan Renatus Yahoo! Guang Yang Yahoo! 
Haifeng Liu +Yahoo! Lei Dong Yahoo! Ray Lv Yahoo! Wei Luo Yahoo! Xihui He +Yahoo! Zhi (David) Zhang YouScribe Guilhem Lettron # # Local Variables: diff --git a/.peoplemap b/.peoplemap index 5406afbc67ce0..c29508a322df1 100644 --- a/.peoplemap +++ b/.peoplemap @@ -16,12 +16,13 @@ # git log --pretty='%aN <%aE>' $range | git -c mailmap.file=.peoplemap check-mailmap --stdin | sort | uniq | sed -e 's/\(.*\) \(<.*\)/\2 \1/' | uniq --skip-field=1 --all-repeated | sed -e 's/\(.*>\) \(.*\)/\2 \1/' # Alexandre Marangone Alexandre Marangone -Alfredo Deza Alfreda Deza +Alfredo Deza Alfredo Deza Dan Mick Dan Mick David Zafman David Zafman Greg Farnum Greg Farnum Ilya Dryomov Ilya Dryomov -João Eduardo Luís João Eduardo Luís +Joao Eduardo Luis João Eduardo Luís +Joao Eduardo Luis João Eduardo Luís John Spray John Spray John Wilkins John Wilkins Josh Durgin Josh Durgin @@ -34,14 +35,12 @@ Noah Watkins Noah Watkins Patrick McGarry Patrick McGarry Ross Turk Ross Turk Sage Weil Sage Weil +Sahid Orentino Ferdjaoui Sahid Orentino Ferdjaoui Samuel Just Samuel Just Sandon Van Ness Sandon Van Ness Tamil Muthamizhan Tamil Muthamizhan Tyler Brekke Tyler Brekke Warren Usui Warren Usui -Yehuda Sadeh Yehuda Sadeh -John Wilkins John Wilkins Yan, Zheng Yan, Zheng -Sahid Orentino Ferdjaoui Sahid Orentino Ferdjaoui -João Eduardo Luís João Eduardo Luís -Ilya Dryomov Ilya Dryomov +Yehuda Sadeh Yehuda Sadeh +Yuri Weinstein Yuri Weinstein diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000000000..c28362d55bfb3 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,248 @@ +cmake_minimum_required(VERSION 2.8.11) + +project(Ceph) +set(VERSION "0.90") + +if (NOT (CMAKE_MAJOR_VERSION LESS 3)) + # Tweak policies (this one disables "missing" dependency warning) + cmake_policy(SET CMP0046 OLD) +endif(NOT (CMAKE_MAJOR_VERSION LESS 3)) + +set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/modules/") + +include_directories( + ${PROJECT_BINARY_DIR}/src/include + ${OFED_PREFIX}/include + ${LEVELDB_PREFIX}/include + ${PROJECT_SOURCE_DIR}/src +) + +link_directories( + ${OFED_PREFIX}/lib + ${LEVELDB_PREFIX}/lib +) + +#Check Includes +include(CheckIncludeFiles) +include(CheckIncludeFileCXX) +include(CheckFunctionExists) + +CHECK_FUNCTION_EXISTS(fallocate CEPH_HAVE_FALLOCATE) +CHECK_FUNCTION_EXISTS(posix_fadvise HAVE_POSIX_FADVISE) +CHECK_FUNCTION_EXISTS(posix_fallocate HAVE_POSIX_FALLOCATE) +CHECK_FUNCTION_EXISTS(syncfs HAVE_SYS_SYNCFS) +CHECK_FUNCTION_EXISTS(sync_file_range HAVE_SYNC_FILE_RANGE) +CHECK_INCLUDE_FILES("arpa/inet.h" HAVE_ARPA_INET_H) +CHECK_INCLUDE_FILES("boost/random/discrete_distribution.hpp" HAVE_BOOST_RANDOM_DISCRETE_DISTRIBUTION) +CHECK_INCLUDE_FILES("dirent.h" HAVE_DIRENT_H) +CHECK_INCLUDE_FILES("dlfcn.h" HAVE_DLFCN_H) +CHECK_INCLUDE_FILES("inttypes.h" HAVE_INTTYPES_H) +CHECK_INCLUDE_FILES("linux/types.h" HAVE_LINUX_TYPES_H) +CHECK_INCLUDE_FILES("leveldb/filter_policy.h" HAVE_LEVELDB_FILTER_POLICY) +CHECK_INCLUDE_FILES("memory.h" HAVE_MEMORY_H) +CHECK_INCLUDE_FILES("ndir.h" HAVE_NDIR_H) +CHECK_INCLUDE_FILES("netdb.h" HAVE_NETDB_H) +CHECK_INCLUDE_FILES("netinet/in.h" HAVE_NETINET_IN_H) +CHECK_INCLUDE_FILES("stdint.h" HAVE_STDINT_H) +CHECK_INCLUDE_FILES("stdlib.h" HAVE_STDLIB_H) +CHECK_INCLUDE_FILES("arpa/inet.h" HAVE_ARPA_INET_H) +CHECK_INCLUDE_FILES("strings.h" HAVE_STRINGS_H) +CHECK_INCLUDE_FILES("string.h" HAVE_STRING_H) +CHECK_INCLUDE_FILES("syslog.h" HAVE_SYSLOG_H) +CHECK_INCLUDE_FILES("sys/dir.h" HAVE_SYS_DIR_H) +CHECK_INCLUDE_FILES("sys/file.h" HAVE_SYS_FILE_H) 
+CHECK_INCLUDE_FILES("sys/ioctl.h" HAVE_SYS_IOCTL_H)
+CHECK_INCLUDE_FILES("sys/mount.h" HAVE_SYS_MOUNT_H)
+CHECK_INCLUDE_FILES("sys/ndir.h" HAVE_SYS_NDIR_H)
+CHECK_INCLUDE_FILES("sys/param.h" HAVE_SYS_PARAM_H)
+CHECK_INCLUDE_FILES("sys/socket.h" HAVE_SYS_SOCKET_H)
+CHECK_INCLUDE_FILES("sys/statvfs.h" HAVE_SYS_STATVFS_H)
+CHECK_INCLUDE_FILES("sys/stat.h" HAVE_SYS_STAT_H)
+CHECK_INCLUDE_FILES("sys/time.h" HAVE_SYS_TIME_H)
+CHECK_INCLUDE_FILES("sys/types.h" HAVE_SYS_TYPES_H)
+CHECK_INCLUDE_FILES("sys/vfs.h" HAVE_SYS_VFS_H)
+CHECK_INCLUDE_FILES("sys/wait.h" HAVE_SYS_WAIT_H)
+CHECK_INCLUDE_FILES("sys/xattr.h" HAVE_SYS_XATTR_H)
+CHECK_INCLUDE_FILES("unistd.h" HAVE_UNISTD_H)
+CHECK_INCLUDE_FILES("utime.h" HAVE_UTIME_H)
+CHECK_INCLUDE_FILES("${CMAKE_SOURCE_DIR}/src/include/fiemap.h" HAVE_FIEMAP_H)
+CHECK_INCLUDE_FILES("expat.h" HAVE_EXPAT_H)
+CHECK_INCLUDE_FILES("fuse/fuse_lowlevel.h" HAVE_FUSE_LOWLEVEL_H)
+CHECK_INCLUDE_FILES("curl/curl.h" HAVE_CURL_H)
+CHECK_INCLUDE_FILES("fuse/fuse.h" HAVE_FUSE_H)
+CHECK_INCLUDE_FILES("google/profiler.h" HAVE_PROFILER_H)
+CHECK_INCLUDE_FILES("libedit/vis.h" HAVE_LIBEDIT_VIS_H)
+CHECK_INCLUDE_FILES("fcgi_config.h" HAVE_FASTCGI_CONFIG_H)
+CHECK_INCLUDE_FILES("fastcgi.h" HAVE_FASTCGI_H)
+CHECK_INCLUDE_FILES("fcgiapp.h" FASTCGI_FASTCGI_APP_DIR)
+CHECK_INCLUDE_FILES("fcgimisc.h" HAVE_FASTCGI_MISC_H)
+CHECK_INCLUDE_FILES("fcgio.h" HAVE_FASTCGIO_H)
+CHECK_INCLUDE_FILES("fcgios.h" FASTCGI_FASTCGIOS_DIR)
+CHECK_INCLUDE_FILES("fcgi_stdio.h" HAVE_FASTCGI_STDIO_H)
+CHECK_INCLUDE_FILES("openssl/ssl.h" HAVE_SSL_H)
+CHECK_INCLUDE_FILES("snappy.h" HAVE_SNAPPY_H)
+CHECK_INCLUDE_FILES("uuid/uuid.h" HAVE_UUID_H)
+CHECK_INCLUDE_FILES("atomic_ops.h" HAVE_ATOMIC_OPS)
+CHECK_INCLUDE_FILES("keyutils.h" HAVE_KEYUTILS_H)
+
+include(CheckSymbolExists)
+CHECK_SYMBOL_EXISTS(__u8 "sys/types.h;linux/types.h" HAVE___U8)
+CHECK_SYMBOL_EXISTS(__u64 "sys/types.h;linux/types.h" HAVE___U64)
+CHECK_SYMBOL_EXISTS(__s64 "sys/types.h;linux/types.h" HAVE___S64)
+
+set(CEPH_MAN_DIR "share/man" CACHE STRING "Install location for man pages (relative to prefix).")
+
+option(ENABLE_SHARED "build shared libraries" ON)
+if(${ENABLE_SHARED})
+  set(CEPH_SHARED SHARED)
+else(${ENABLE_SHARED})
+  set(CEPH_SHARED STATIC)
+endif(${ENABLE_SHARED})
+
+find_package(udev REQUIRED)
+set(HAVE_UDEV ${UDEV_FOUND})
+message(STATUS "${UDEV_LIBS}")
+
+option(WITH_AIO "AIO is here ON" ON)
+if(${WITH_AIO})
+find_package(aio REQUIRED)
+set(HAVE_LIBAIO ${AIO_FOUND})
+message(STATUS "${AIO_LIBS}")
+endif(${WITH_AIO})
+
+option(WITH_FUSE "Fuse is here" ON)
+if(${WITH_FUSE})
+find_package(fuse REQUIRED)
+set(HAVE_LIBFUSE ${FUSE_FOUND})
+CHECK_FUNCTION_EXISTS(fuse_getgroups HAVE_FUSE_GETGROUPS)
+endif(${WITH_FUSE})
+
+find_package(blkid REQUIRED)
+set(HAVE_LIBBLKID ${BLKID_FOUND})
+
+# probably fuse etc need this -- how to check? XXX
+option(WITH_MDS "MDS is here" ON)
+
+# needs mds and?
XXX
+option(WITH_CLIENT "Client is here" ON)
+option(WITH_LIBCEPHFS "libcephfs client library" ON)
+
+# key-value store
+option(WITH_KVS "Key value store is here" ON)
+
+# remote block storage
+option(WITH_RBD "Remote block storage is here" ON)
+
+option(WITH_LEVELDB "LevelDB is here" ON)
+if(${WITH_LEVELDB})
+find_package(leveldb REQUIRED)
+set(HAVE_LIBLEVELDB ${LEVELDB_FOUND})
+CHECK_INCLUDE_FILES("leveldb/filter_policy.h" HAVE_LEVELDB_FILTER_POLICY)
+endif(${WITH_LEVELDB})
+
+option(WITH_EXPAT "EXPAT is here" ON)
+if(${WITH_EXPAT})
+find_package(expat REQUIRED)
+set(HAVE_EXPAT ${EXPAT_FOUND})
+endif(${WITH_EXPAT})
+
+option(WITH_FCGI "FCGI is here" ON)
+if(${WITH_FCGI})
+find_package(fcgi REQUIRED)
+set(HAVE_FCGI ${FCGI_FOUND})
+endif(${WITH_FCGI})
+
+option(WITH_ATOMIC_OPS "Atomic Ops is here" ON)
+if(${WITH_ATOMIC_OPS})
+find_package(atomic_ops REQUIRED)
+set(HAVE_ATOMIC_OPS ${ATOMIC_OPS_FOUND})
+endif(${WITH_ATOMIC_OPS})
+
+option(WITH_PROFILER "The Profiler is here" ON)
+if(${WITH_PROFILER})
+find_package(profiler REQUIRED)
+set(HAVE_PROFILER ${PROFILER_FOUND})
+endif(${WITH_PROFILER})
+
+option(WITH_SNAPPY "Snappy is here" ON)
+if(${WITH_SNAPPY})
+find_package(snappy REQUIRED)
+set(HAVE_LIBSNAPPY ${SNAPPY_FOUND})
+endif(${WITH_SNAPPY})
+
+option(WITH_TCMALLOC "Use TCMalloc as Allocator" ON)
+if(${WITH_TCMALLOC})
+find_package(tcmalloc REQUIRED)
+set(HAVE_LIBTCMALLOC ${Tcmalloc_FOUND})
+endif(${WITH_TCMALLOC})
+
+option(USE_CRYPTOPP "Cryptopp is ON" ON)
+find_package(cryptopp)
+if(CRYPTOPP_FOUND)
+  MESSAGE(STATUS "${CRYPTOPP_LIBRARIES}")
+  set(CRYPTO_LIBS ${CRYPTOPP_LIBRARIES})
+  set(USE_NSS 0)
+else()
+  MESSAGE(STATUS "Cryptopp not found, using NSS instead")
+  find_package(NSS)
+  if(NSS_FOUND)
+    set(USE_NSS 1)
+    set(USE_CRYPTOPP 0)
+    find_package(NSPR)
+    if(NSPR_FOUND)
+      set(CRYPTO_LIBS ${NSS_LIBRARIES} ${NSPR_LIBRARIES})
+      #MESSAGE(STATUS "${CRYPTO_LIBS}")
+      #MESSAGE(STATUS "${NSS_INCLUDE_DIR} ${NSPR_INCLUDE_DIR}")
+    endif(NSPR_FOUND)
+  endif(NSS_FOUND)
+endif(CRYPTOPP_FOUND)
+
+option(WITH_XIO "Enable XIO messaging" ON)
+if(WITH_XIO)
+find_package(xio)
+set(HAVE_XIO ${Xio_FOUND})
+endif(WITH_XIO)
+
+#option for RGW
+option(WITH_RADOSGW "Rados Gateway is enabled" ON)
+
+#option for CephFS
+option(WITH_CEPHFS "CephFS is enabled" ON)
+
+# Now create a usable config.h
+configure_file(
+  ${PROJECT_SOURCE_DIR}/src/include/config-h.in.cmake
+  ${PROJECT_BINARY_DIR}/include/acconfig.h
+)
+include_directories(${PROJECT_BINARY_DIR}/include)
+
+add_definitions(-D__linux__)
+
+if(${ENABLE_SHARED})
+  set(Boost_USE_STATIC_LIBS OFF)
+else(${ENABLE_SHARED})
+  set(Boost_USE_STATIC_LIBS ON)
+endif(${ENABLE_SHARED})
+
+set(Boost_USE_MULTITHREADED ON)
+find_package(Boost COMPONENTS thread system regex REQUIRED)
+include_directories(${Boost_INCLUDE_DIRS})
+
+find_package(Threads REQUIRED)
+
+# find out which platform we are building on
+if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
+  set(LINUX ON)
+  set(UNIX ON)
+  FIND_PACKAGE(Threads)
+endif(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
+
+# find out which platform we are building on
+if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
+set(OperatingSystem "Mac OS X")
+endif(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
+
+add_subdirectory(src)
+
+# man pages must be preprocessed, not supported yet
+#add_subdirectory(man)
diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst
index ed7e4eadc1175..3bc51fd4184e5 100644
--- a/CONTRIBUTING.rst
+++ b/CONTRIBUTING.rst
@@ -6,5 +6,13 @@ For documentation patches the following guide will help you get started
 `Documenting Ceph`_
+Performance enhancements must come with test data and detailed
+explanations.
+
+Code cleanup is appreciated along with a patch that fixes a bug or
+implements a feature. Except on rare occasions, code cleanups that only
+involve coding style or whitespace modifications are discouraged,
+primarily because they cause problems when rebasing and backporting.
+
 .. _Submitting Patches: SubmittingPatches
 .. _Documenting Ceph: doc/start/documenting-ceph.rst
diff --git a/COPYING b/COPYING
index 1c3035af664c0..1b88923155cc8 100644
--- a/COPYING
+++ b/COPYING
@@ -11,13 +11,16 @@ Files: doc/*
 Copyright: (c) 2010-2012 New Dream Network and contributors
 License: Creative Commons Attribution-ShareAlike (CC BY-SA)
 
+Files: bin/git-archive-all.sh
+License: GPL3
+
 Files: src/mount/canonicalize.c
 Copyright: Copyright (C) 1993 Rick Sladkey
 License: LGPL2 or later
 
 Files: src/os/btrfs_ioctl.h
 Copyright: Copyright (C) 2007 Oracle. All rights reserved.
-License: GPL2
+License: GPL2 (see COPYING-GPL2)
 
 Files: src/include/ceph_hash.cc
 Copyright: None
diff --git a/COPYING-GPL2 b/COPYING-GPL2
new file mode 100644
index 0000000000000..d159169d10508
--- /dev/null
+++ b/COPYING-GPL2
@@ -0,0 +1,339 @@
+                    GNU GENERAL PUBLIC LICENSE
+                       Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+                            Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users.  This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it.  (Some other Free Software Foundation software is covered by
+the GNU Lesser General Public License instead.)  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+  To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have.  You must make sure that they, too, receive or can get the
+source code.  And you must show them these terms so they know their
+rights.
+
+  We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+  Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software.
If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. 
(Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. 
Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. 
The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+    Gnomovision version 69, Copyright (C) year name of author
+    Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+  `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+  <signature of Ty Coon>, 1 April 1989
+  Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs.  If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.
diff --git a/Doxyfile b/Doxyfile
index 18204c681a33d..b3f413dd30506 100644
--- a/Doxyfile
+++ b/Doxyfile
@@ -7,9 +7,10 @@ SYMBOL_CACHE_SIZE = 2
 WARN_IF_UNDOCUMENTED = NO
 INPUT = src
 RECURSIVE = YES
-EXCLUDE = src/gtest \
+EXCLUDE = src/gmock \
           src/test/virtualenv \
-          src/out
+          src/out \
+          src/tracing
 VERBATIM_HEADERS = NO
 GENERATE_HTML = NO
 GENERATE_LATEX = NO
diff --git a/Makefile.am b/Makefile.am
index 621fd59b5c392..d6f7bbdf19edf 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -1,53 +1,73 @@
 AUTOMAKE_OPTIONS = gnu
 ACLOCAL_AMFLAGS = -I m4
 EXTRA_DIST = autogen.sh ceph.spec.in ceph.spec install-deps.sh
-# the "." here makes sure check-local builds gtest before it is used
-SUBDIRS = . src man
+# the "." here makes sure check-local builds gtest and gmock before they are used
+SUBDIRS = . src man doc systemd selinux
 
 EXTRA_DIST += \
 	src/test/run-cli-tests \
 	src/test/run-cli-tests-maybe-unset-ccache \
 	src/test/cli \
 	src/test/downloads \
+	systemd/ceph.tmpfiles.d \
 	udev/50-rbd.rules \
 	udev/60-ceph-partuuid-workaround.rules \
 	udev/95-ceph-osd.rules \
+	udev/95-ceph-osd.rules.systemd \
 	udev/95-ceph-osd-alt.rules \
 	share/known_hosts_drop.ceph.com \
 	share/id_dsa_drop.ceph.com \
 	share/id_dsa_drop.ceph.com.pub
 
 # why is it so hard to make autotools do this?
-install-data-local:
+install-data-local::
 	-mkdir -p $(DESTDIR)$(datadir)/ceph
-	-install -m 644 share/known_hosts_drop.ceph.com $(DESTDIR)$(datadir)/ceph/known_hosts_drop.ceph.com
-	-install -m 644 share/id_dsa_drop.ceph.com $(DESTDIR)$(datadir)/ceph/id_dsa_drop.ceph.com
-	-install -m 644 share/id_dsa_drop.ceph.com.pub $(DESTDIR)$(datadir)/ceph/id_dsa_drop.ceph.com.pub
+	-install -m 600 share/known_hosts_drop.ceph.com $(DESTDIR)$(datadir)/ceph/known_hosts_drop.ceph.com
+	-install -m 600 share/id_dsa_drop.ceph.com $(DESTDIR)$(datadir)/ceph/id_dsa_drop.ceph.com
+	-install -m 600 share/id_dsa_drop.ceph.com.pub $(DESTDIR)$(datadir)/ceph/id_dsa_drop.ceph.com.pub
 
-all-local:
+all-local::
 if WITH_DEBUG
 # We need gtest to build the rados-api tests. We only build those in
 # a debug build, though.
-	@cd src/gtest && $(MAKE) $(AM_MAKEFLAGS) lib/libgtest.a lib/libgtest_main.a
+	@cd src/gmock/gtest && $(MAKE) $(AM_MAKEFLAGS) lib/libgtest.la lib/libgtest_main.la
+	@cd src/gmock && $(MAKE) $(AM_MAKEFLAGS) lib/libgmock.la lib/libgmock_main.la
 endif
 
-check-local: all
+CHECK_ULIMIT := true
+
+check-local:: all
 # We build gtest this way, instead of using SUBDIRS, because with that,
 # gtest's own tests would be run and that would slow us down.
-	@cd src/gtest && $(MAKE) $(AM_MAKEFLAGS) lib/libgtest.a lib/libgtest_main.a
+	@cd src/gmock/gtest && $(MAKE) $(AM_MAKEFLAGS) lib/libgtest.la lib/libgtest_main.la
+	@cd src/gmock && $(MAKE) $(AM_MAKEFLAGS) lib/libgmock.la lib/libgmock_main.la
 # exercise cli tools
-	$(srcdir)/src/test/run-cli-tests '$(top_builddir)/src/test'
+	u=`ulimit -u` ; \
+	p=`expr $(shell nproc) / 2` ; \
+	n=`expr $$p \* 1024` ; \
+	if ${CHECK_ULIMIT} && echo ${MAKEFLAGS} | grep --quiet -e -j && test $$u -lt $$n ; then \
+	    echo "ulimit -u is $$u which is lower than $$n = $$p * 1024" ; \
+	    echo "If running make -j$$p check you will likely exceed this limit" ; \
+	    echo "and the tests will fail in mysterious ways." ; \
+	    echo "Update /etc/security/limits.conf to increase the limit" ; \
+	    echo "or run make CHECK_ULIMIT=false -j4 check to override this safeguard." ; \
+	    exit 1 ; \
+	fi
+
+check_SCRIPTS = \
+	src/test/run-cli-tests
 
 # "make distclean" both runs this and recurses into src/gtest, if
 # gtest is in DIST_SUBDIRS. Take extra care to not fail when
 # effectively cleaned twice.
 clean-local:
-	@if test -e src/gtest/Makefile; then \
-	  echo "Making clean in src/gtest"; \
-	  cd src/gtest && $(MAKE) $(AM_MAKEFLAGS) clean; \
+	@if test -e src/gmock/Makefile; then \
+	  echo "Making clean in src/gmock"; \
+	  cd src/gmock && $(MAKE) $(AM_MAKEFLAGS) clean; \
 	fi
 	@rm -rf src/test/virtualenv
+	@rm -rf install-deps-*
 
 # NOTE: This only works when enough dependencies are installed for
diff --git a/PendingReleaseNotes b/PendingReleaseNotes
index 67f1628a7e94c..4e6fe0da43779 100644
--- a/PendingReleaseNotes
+++ b/PendingReleaseNotes
@@ -1,30 +1,23 @@
-v0.90
------
+v9.0.4
+======
 
-* Previously, the formatted output of 'ceph pg stat -f ...' was a full
-  pg dump that included all metadata about all PGs in the system.  It
-  is now a concise summary of high-level PG stats, just like the
-  unformatted 'ceph pg stat' command.
+v9.0.3
+======
+* The return code for librbd's rbd_aio_read and Image::aio_read API methods no
+  longer returns the number of bytes read upon success. Instead, it returns 0
+  upon success and a negative value upon failure.
+* 'ceph scrub', 'ceph compact' and 'ceph sync force' are now DEPRECATED. Users
+  should instead use 'ceph mon scrub', 'ceph mon compact' and
+  'ceph mon sync force'.
-* All JSON dumps of floating point values were incorrecting surrounding the
-  value with quotes. These quotes have been removed. Any consumer of structured
-  JSON output that was consuming the floating point values was previously having
-  to interpret the quoted string and will most likely need to be fixed to take
-  the unquoted number.
+* 'ceph mon_metadata' should now be used as 'ceph mon metadata'. There is no
+  need to deprecate this command (same major release since it was first
+  introduced).
 
-* The 'category' field for objects has been removed. This was originally added
-  to track PG stat summations over different categories of objects for use by
-  radosgw. It is no longer has any known users and is prone to abuse because it
-  can lead to a pg_stat_t structure that is unbounded. The librados API calls
-  that accept this field now ignore it, and the OSD no longers tracks the
-  per-category summations.
+* The `--dump-json` option of "osdmaptool" is replaced by `--dump json`.
 
-* The output for 'rados df' has changed. The 'category' level has been
-  eliminated, so there is now a single stat object per pool. The structure of
-  the JSON output is different, and the plaintext output has one less column.
-
-* The 'rados create <objectname> [category]' optional category argument is no
-  longer supported or recognized.
+* The commands "pg ls-by-{pool,primary,osd}" and "pg ls" now take "recovering"
+  instead of "recovery", to include the recovering pgs in the listed pgs.
diff --git a/README.cmake b/README.cmake
new file mode 100644
index 0000000000000..ad1aa01a9ab28
--- /dev/null
+++ b/README.cmake
@@ -0,0 +1,62 @@
+Overview
+========
+This is a work-in-progress CMake build system. Currently it builds a limited set of targets,
+and only on Linux/posix. The goals include faster builds (see for yourself), cleaner
+builds (no libtool), and improved portability (e.g., Windows).
+
+Building Ceph
+=============
+To build out of source, make an empty directory called "build" and run:
+$ cmake [path to top level ceph-local directory]
+
+To build inside the source tree, make an empty directory called "build" in it and run:
+$ cmake ..
+
+Once configuration is done and the build files have been written to the current
+build directory, run:
+$ make
+
+To build only certain targets, run:
+$ make [target name]
+
+To install, once all the targets are built, run:
+$ make install
+
+Options
+=======
+There is an option to build the Rados Gateway; it defaults to ON.
+To build without the Rados Gateway:
+$ cmake [path to top level ceph-local directory] -DWITH_RADOSGW=OFF
+
+To build with debugging and alternate locations for (a couple of)
+external dependencies:
+$ cmake -DLEVELDB_PREFIX="/opt/hyperleveldb" -DOFED_PREFIX="/opt/ofed" \
+  -DCMAKE_INSTALL_PREFIX=/opt/accelio -DCMAKE_C_FLAGS="-O0 -g3 -gdwarf-4" \
+  ..
+
+With future development efforts, more options will be implemented.
+
+Targets Built
+=============
+ceph-mon
+ceph-osd
+ceph-mds
+cephfs
+ceph-syn
+rados
+radosgw (set ON as a default)
+librados-config
+ceph-conf
+monmaptool
+osdmaptool
+crushtool
+ceph-authtool
+init-ceph
+mkcephfs
+mon_store_converter
+ceph-fuse
+
+Future work will be done to build more targets, check for libraries and headers more
+thoroughly, and include tests to make this build more robust. CMake allows Ceph to build
+on many platforms, such as Windows, though the shell scripts need bash/unix to run.
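+
+Example
+=======
+A minimal end-to-end sketch of the out-of-source flow above. The clone
+URL is the public Ceph repository; the directory names, install prefix,
+option value, and job count are illustrative only, not requirements:
+
+$ git clone --recursive https://github.com/ceph/ceph.git
+$ mkdir build && cd build
+$ cmake ../ceph -DWITH_RADOSGW=OFF -DCMAKE_INSTALL_PREFIX=/opt/ceph
+$ make -j4
+$ make install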
+
diff --git a/README.md b/README.md
index 3661d0964fd4f..db891ea91b3d2 100644
--- a/README.md
+++ b/README.md
@@ -26,23 +26,12 @@ contributed under the terms of the applicable license.
 Build Prerequisites
 ===================
 
-debian-based
-------------
-
-The list of debian packages dependencies can be installed with:
+The list of Debian or RPM package dependencies can be installed with:
 
 	./install-deps.sh
 
 Note: libsnappy-dev and libleveldb-dev are not available upstream for
-natty, oneiric, and squeeze. Backports for Ceph can be found at
-ceph.com/debian-leveldb.
-
-rpm-based
----------
-
-The list of RPM packages dependencies can be installed with:
-
-	./install-deps.sh
+Debian Squeeze. Backports for Ceph can be found at ceph.com/debian-leveldb.
 
 Building Ceph
 =============
diff --git a/README.xio b/README.xio
new file mode 100644
index 0000000000000..857211a9ef6f5
--- /dev/null
+++ b/README.xio
@@ -0,0 +1,37 @@
+
+
+Building
+
+The Accelio XioMessenger is built only when --enable-xio is
+provided to configure.
+
+Accelio depends on OpenFabrics verbs development headers/libraries.
+Prior to installing the Accelio package, the following prerequisites are required:
+
+RedHat (RHEL 6.4 and above)
+
+	yum groupinstall "Infiniband Support"
+	yum install libtool autoconf automake
+	yum install infiniband-diags perftest libibverbs-utils librdmacm-utils
+	yum install librdmacm-devel libibverbs-devel numactl numactl-devel libaio-devel libevent-devel
+
+
+Ubuntu (Ubuntu 13.04 and above)
+
+	apt-get install libtool autoconf automake build-essential
+	apt-get install ibverbs-utils rdmacm-utils infiniband-diags perftest
+	apt-get install librdmacm-dev libibverbs-dev numactl libnuma-dev libaio-dev libevent-dev
+
+
+MLNX_OFED
+
+	The Mellanox OFED package is optional and recommended;
+	look for the latest OFED packages at:
+	http://www.mellanox.com/page/products_dyn?product_family=26
+
+
+If either the Accelio or OFED development environments are in non-standard locations,
+include and library paths must be added to both CFLAGS and CXXFLAGS, in the Autotools
+build as of 1/1/2015.
+
+-Matt
diff --git a/SubmittingPatches b/SubmittingPatches
index 5802c876156f1..4831215a87588 100644
--- a/SubmittingPatches
+++ b/SubmittingPatches
@@ -73,7 +73,7 @@ exactly the same in your tree and the submitters'.  If you stick strictly to
 rule (c), you should ask the submitter to rediff, but this is a totally
 counter-productive waste of time and energy.  Rule (b) allows you to adjust
 the code, but then it is very impolite to change one submitter's code and
-make him endorse your bugs. To solve this problem, it is recommended that
+make them endorse your bugs. To solve this problem, it is recommended that
 you add a line between the last Signed-off-by header and yours, indicating
 the nature of your changes. While there is nothing mandatory about this, it
 seems like prepending the description with your mail and/or name, all
@@ -233,7 +233,7 @@ allows you to submit pull requests directly from the command line:
 
     $ hub pull-request -b ceph:master -h you:mything
 
-Pull rqeuests appear in the review queue at
+Pull requests appear in the review queue at
 
     https://github.com/organizations/ceph/dashboard/pulls
 
@@ -242,7 +242,7 @@ email list to ensure your submission is noticed.
 
 When addressing review comments, you should either add additional patches to
 your branch or (better yet) squash those changes into the relevant commits so
-that the sequence of changes of "clean" and gets things right the first time.
+that the sequence of changes is "clean" and gets things right the first time.
The 'git rebase -i' command is very helpful in this process. Once you have
updated your local branch, you can simply force-push to the existing branch
in your public repository that is referenced by the pull request with
@@ -253,6 +253,34 @@ and your changes will be visible from the existing pull-request. You may want
to ping the reviewer again or comment on the pull request to ensure the
updates are noticed.

+Q: Which branch should I target in my pull request?
+
+A: The target branch depends on the nature of your change:
+
+   If you are adding a feature, target the "master" branch in your pull
+   request.
+
+   If you are fixing a bug, target the "next" branch in your pull request.
+   The Ceph core developers will periodically merge "next" into "master". When
+   this happens, the master branch will contain your fix as well.
+
+   If you are fixing a bug (see above) *and* the bug exists in older stable
+   branches (for example, the "dumpling" or "firefly" branches), then you
+   should file a Redmine ticket describing your issue and fill out the
+   "Backport: " form field. This will notify other developers that
+   your commit should be cherry-picked to these stable branches. For example,
+   you should set "Backport: firefly" in your Redmine ticket to indicate that
+   you are fixing a bug that exists on the "firefly" branch and that you
+   desire that your change be cherry-picked to that branch.
+
+Q: How do I include Reviewed-by: tag(s) in my pull request?
+
+A: You don't. If someone reviews your pull request, they should indicate they
+   have done so by commenting on it with "+1", "looks good to me", "LGTM",
+   and/or the entire "Reviewed-by: ..." line with their name and email address.
+
+   The developer merging the pull request should note positive reviews and
+   include the appropriate Reviewed-by: lines in the merge commit.

2) Patch submission via ceph-devel@vger.kernel.org
diff --git a/admin/build-doc b/admin/build-doc
index ae9050d7a7557..0b6e88eb81ed5 100755
--- a/admin/build-doc
+++ b/admin/build-doc
@@ -1,4 +1,5 @@
#!/bin/sh
+
cd "$(dirname "$0")"
cd ..
TOPDIR=`pwd`
@@ -17,6 +18,17 @@ if command -v dpkg >/dev/null; then
        echo "sudo apt-get install $missing"
        exit 1
    fi
+elif command -v yum >/dev/null; then
+    for package in python-devel python-pip python-virtualenv doxygen ditaa ant libxml-devel libxslt-devel; do
+        if ! rpm -q $package >/dev/null ; then
+            missing="${missing:+$missing }$package"
+        fi
+    done
+    if [ -n "$missing" ]; then
+        echo "$0: missing required packages, please install them:" 1>&2
+        echo "yum install $missing"
+        exit 1
+    fi
else
    for command in virtualenv doxygen ant ditaa; do
	command -v "$command" > /dev/null;
@@ -33,9 +45,9 @@ else
    fi
fi

-if [ !
-e build-doc/doxygen/xml ]; then - doxygen -fi +# Don't enable -e until after running all the potentially-erroring checks +# for availability of commands +set -e cat src/osd/PG.h src/osd/PG.cc | doc/scripts/gen_state_diagram.py > doc/dev/peering_graph.generated.dot diff --git a/admin/doc-requirements.txt b/admin/doc-requirements.txt index ce4654c660007..aba92c28bef9d 100644 --- a/admin/doc-requirements.txt +++ b/admin/doc-requirements.txt @@ -1,3 +1,3 @@ Sphinx == 1.1.3 -e git+https://github.com/ceph/sphinx-ditaa.git#egg=sphinx-ditaa --e git+https://github.com/ceph/asphyxiate.git#egg=asphyxiate +-e git+https://github.com/michaeljones/breathe#egg=breathe diff --git a/autogen.sh b/autogen.sh index f332cdd4e54f9..99d4f7b446610 100755 --- a/autogen.sh +++ b/autogen.sh @@ -29,7 +29,8 @@ else fi if test -d ".git" ; then - if ! git submodule sync || ! git submodule update --init; then + force=$(if git submodule usage 2>&1 | grep --quiet 'update.*--force'; then echo --force ; fi) + if ! git submodule sync || ! git submodule update $force --init --recursive; then echo "Error: could not initialize submodule projects" echo " Network connectivity might be required." exit 1 @@ -44,6 +45,6 @@ aclocal -I m4 --install autoconf autoheader automake -a --add-missing -Wall -( cd src/gtest && autoreconf -fvi; ) +( cd src/gmock && autoreconf -fvi; ) ( cd src/rocksdb && autoreconf -fvi; ) exit diff --git a/bin/git-archive-all.sh b/bin/git-archive-all.sh new file mode 100755 index 0000000000000..68c31eac49759 --- /dev/null +++ b/bin/git-archive-all.sh @@ -0,0 +1,285 @@ +#!/bin/bash - +# +# File: git-archive-all.sh +# +# Description: A utility script that builds an archive file(s) of all +# git repositories and submodules in the current path. +# Useful for creating a single tarfile of a git super- +# project that contains other submodules. +# +# Examples: Use git-archive-all.sh to create archive distributions +# from git repositories. To use, simply do: +# +# cd $GIT_DIR; git-archive-all.sh +# +# where $GIT_DIR is the root of your git superproject. +# +# License: GPL3 +# +############################################################################### +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +# +############################################################################### + +# DEBUGGING +set -e +set -C # noclobber + +# TRAP SIGNALS +trap 'cleanup' QUIT EXIT + +# For security reasons, explicitly set the internal field separator +# to newline, space, tab +OLD_IFS=$IFS +IFS=' + ' + +function cleanup () { + rm -f $TMPFILE + rm -f $TOARCHIVE + IFS="$OLD_IFS" +} + +function usage () { + echo "Usage is as follows:" + echo + echo "$PROGRAM <--version>" + echo " Prints the program version number on a line by itself and exits." + echo + echo "$PROGRAM <--usage|--help|-?>" + echo " Prints this usage output and exits." 
+ echo + echo "$PROGRAM [--format ] [--prefix ] [--verbose|-v] [--separate|-s]" + echo " [--tree-ish|-t ] [--ignore pattern] [output_file]" + echo " Creates an archive for the entire git superproject, and its submodules" + echo " using the passed parameters, described below." + echo + echo " If '--format' is specified, the archive is created with the named" + echo " git archiver backend. Obviously, this must be a backend that git archive" + echo " understands. The format defaults to 'tar' if not specified." + echo + echo " If '--prefix' is specified, the archive's superproject and all submodules" + echo " are created with the prefix named. The default is to not use one." + echo + echo " If '--separate' or '-s' is specified, individual archives will be created" + echo " for each of the superproject itself and its submodules. The default is to" + echo " concatenate individual archives into one larger archive." + echo + echo " If '--tree-ish' is specified, the archive will be created based on whatever" + echo " you define the tree-ish to be. Branch names, commit hash, etc. are acceptable." + echo " Defaults to HEAD if not specified. See git archive's documentation for more" + echo " information on what a tree-ish is." + echo + echo " If '--ignore' is specified, we will filter out any submodules that" + echo " match the specified pattern." + echo + echo " If 'output_file' is specified, the resulting archive is created as the" + echo " file named. This parameter is essentially a path that must be writeable." + echo " When combined with '--separate' ('-s') this path must refer to a directory." + echo " Without this parameter or when combined with '--separate' the resulting" + echo " archive(s) are named with a dot-separated path of the archived directory and" + echo " a file extension equal to their format (e.g., 'superdir.submodule1dir.tar')." + echo + echo " If '--verbose' or '-v' is specified, progress will be printed." +} + +function version () { + echo "$PROGRAM version $VERSION" +} + +# Internal variables and initializations. +readonly PROGRAM=`basename "$0"` +readonly VERSION=0.2 + +OLD_PWD="`pwd`" +TMPDIR=${TMPDIR:-/tmp} +TMPFILE=`mktemp "$TMPDIR/$PROGRAM.XXXXXX"` # Create a place to store our work's progress +TOARCHIVE=`mktemp "$TMPDIR/$PROGRAM.toarchive.XXXXXX"` +OUT_FILE=$OLD_PWD # assume "this directory" without a name change by default +SEPARATE=0 +VERBOSE=0 + +TARCMD=tar +[[ $(uname) == "Darwin" ]] && TARCMD=gnutar +FORMAT=tar +PREFIX= +TREEISH=HEAD +IGNORE= + +# RETURN VALUES/EXIT STATUS CODES +readonly E_BAD_OPTION=254 +readonly E_UNKNOWN=255 + +# Process command-line arguments. +while test $# -gt 0; do + case $1 in + --format ) + shift + FORMAT="$1" + shift + ;; + + --prefix ) + shift + PREFIX="$1" + shift + ;; + + --separate | -s ) + shift + SEPARATE=1 + ;; + + --tree-ish | -t ) + shift + TREEISH="$1" + shift + ;; + + --ignore ) + shift + IGNORE="$1" + shift + ;; + + --version ) + version + exit + ;; + + --verbose | -v ) + shift + VERBOSE=1 + ;; + + -? | --usage | --help ) + usage + exit + ;; + + -* ) + echo "Unrecognized option: $1" >&2 + usage + exit $E_BAD_OPTION + ;; + + * ) + break + ;; + esac +done + +if [ ! -z "$1" ]; then + OUT_FILE="$1" + shift +fi + +# Validate parameters; error early, error often. +if [ $SEPARATE -eq 1 -a ! -d $OUT_FILE ]; then + echo "When creating multiple archives, your destination must be a directory." + echo "If it's not, you risk being surprised when your files are overwritten." 
+ exit +elif [ `git config -l | grep -q '^core\.bare=false'; echo $?` -ne 0 ]; then + echo "$PROGRAM must be run from a git working copy (i.e., not a bare repository)." + exit +fi + +# Create the superproject's git-archive +if [ $VERBOSE -eq 1 ]; then + echo -n "creating superproject archive..." +fi +git archive --format=$FORMAT --prefix="$PREFIX" $TREEISH > $TMPDIR/$(basename "$(pwd)").$FORMAT +if [ $VERBOSE -eq 1 ]; then + echo "done" +fi +echo $TMPDIR/$(basename "$(pwd)").$FORMAT >| $TMPFILE # clobber on purpose +superfile=`head -n 1 $TMPFILE` + +if [ $VERBOSE -eq 1 ]; then + echo -n "looking for subprojects..." +fi +# find all '.git' dirs, these show us the remaining to-be-archived dirs +# we only want directories that are below the current directory +find . -mindepth 2 -name '.git' -type d -print | sed -e 's/^\.\///' -e 's/\.git$//' >> $TOARCHIVE +# as of version 1.7.8, git places the submodule .git directories under the superprojects .git dir +# the submodules get a .git file that points to their .git dir. we need to find all of these too +find . -mindepth 2 -name '.git' -type f -print | xargs grep -l "gitdir" | sed -e 's/^\.\///' -e 's/\.git$//' >> $TOARCHIVE + +if [ -n "$IGNORE" ]; then + cat $TOARCHIVE | grep -v $IGNORE > $TOARCHIVE.new + mv $TOARCHIVE.new $TOARCHIVE +fi + +if [ $VERBOSE -eq 1 ]; then + echo "done" + echo " found:" + cat $TOARCHIVE | while read arch + do + echo " $arch" + done +fi + +if [ $VERBOSE -eq 1 ]; then + echo -n "archiving submodules..." +fi +while read path; do + TREEISH=$(git submodule | grep "^ .*${path%/} " | cut -d ' ' -f 2) # git submodule does not list trailing slashes in $path + cd "$path" + git archive --format=$FORMAT --prefix="${PREFIX}$path" ${TREEISH:-HEAD} > "$TMPDIR"/"$(echo "$path" | sed -e 's/\//./g')"$FORMAT + if [ $FORMAT == 'zip' ]; then + # delete the empty directory entry; zipped submodules won't unzip if we don't do this + zip -d "$(tail -n 1 $TMPFILE)" "${PREFIX}${path%/}" >/dev/null # remove trailing '/' + fi + echo "$TMPDIR"/"$(echo "$path" | sed -e 's/\//./g')"$FORMAT >> $TMPFILE + cd "$OLD_PWD" +done < $TOARCHIVE +if [ $VERBOSE -eq 1 ]; then + echo "done" +fi + +if [ $VERBOSE -eq 1 ]; then + echo -n "concatenating archives into single archive..." +fi +# Concatenate archives into a super-archive. +if [ $SEPARATE -eq 0 ]; then + if [ $FORMAT == 'tar' ]; then + sed -e '1d' $TMPFILE | while read file; do + $TARCMD --concatenate -f "$superfile" "$file" && rm -f "$file" + done + elif [ $FORMAT == 'zip' ]; then + sed -e '1d' $TMPFILE | while read file; do + # zip incorrectly stores the full path, so cd and then grow + cd `dirname "$file"` + zip -g "$superfile" `basename "$file"` && rm -f "$file" + done + cd "$OLD_PWD" + fi + + echo "$superfile" >| $TMPFILE # clobber on purpose +fi +if [ $VERBOSE -eq 1 ]; then + echo "done" +fi + +if [ $VERBOSE -eq 1 ]; then + echo -n "moving archive to $OUT_FILE..." 
+fi
+while read file; do
+    mv "$file" "$OUT_FILE"
+done < $TMPFILE
+if [ $VERBOSE -eq 1 ]; then
+    echo "done"
+fi
diff --git a/ceph-erasure-code-corpus b/ceph-erasure-code-corpus
index f1f95a1de9dea..dc409e0b2095e 160000
--- a/ceph-erasure-code-corpus
+++ b/ceph-erasure-code-corpus
@@ -1 +1 @@
-Subproject commit f1f95a1de9dea91b0f8830a01e8b8673e3f241e1
+Subproject commit dc409e0b2095eeb960518ab9c8ee47a34264f4c1
diff --git a/ceph-object-corpus b/ceph-object-corpus
index bb3cee6b85b93..20351c6bae6dd 160000
--- a/ceph-object-corpus
+++ b/ceph-object-corpus
@@ -1 +1 @@
-Subproject commit bb3cee6b85b93210af5fb2c65a33f3000e341a11
+Subproject commit 20351c6bae6dd4802936a5a9fd76e41b8ce2bad0
diff --git a/ceph.spec.in b/ceph.spec.in
index c5a68c2af205f..09635f11dc313 100644
--- a/ceph.spec.in
+++ b/ceph.spec.in
@@ -1,13 +1,42 @@
+# vim: set noexpandtab ts=8 sw=8 :
%bcond_with ocf
+%bcond_without cephfs_java
+%bcond_with tests
+%bcond_without tcmalloc
+%bcond_without libs_compat
+%bcond_without selinux
-%if ! (0%{?fedora} > 12 || 0%{?rhel} > 5)
+
+%if (0%{?el5} || (0%{?rhel_version} >= 500 && 0%{?rhel_version} <= 600))
%{!?python_sitelib: %global python_sitelib %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())")}
%{!?python_sitearch: %global python_sitearch %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib(1))")}
%endif

-#################################################################################
-# common
-#################################################################################
+%if %{with selinux}
+# get selinux policy version
+%{!?_selinux_policy_version: %global _selinux_policy_version %(sed -e 's,.*selinux-policy-\\([^/]*\\)/.*,\\1,' /usr/share/selinux/devel/policyhelp 2>/dev/null || echo 0.0.0)}
+
+%define relabel_files() \
+restorecon -R /usr/bin/ceph-mon > /dev/null 2>&1; \
+restorecon -R /usr/bin/ceph-osd > /dev/null 2>&1; \
+restorecon -R /usr/bin/ceph-mds > /dev/null 2>&1; \
+restorecon -R /etc/rc\.d/init\.d/ceph > /dev/null 2>&1; \
+restorecon -R /var/run/ceph > /dev/null 2>&1; \
+restorecon -R /var/lib/ceph > /dev/null 2>&1; \
+restorecon -R /var/log/ceph > /dev/null 2>&1;
+%endif
+
+%{!?_udevrulesdir: %global _udevrulesdir /lib/udev/rules.d}
+
+# Use systemd files on RHEL 7 and above and in SUSE/openSUSE.
+# When _with_systemd is set, the systemd unit files and
+# /etc/tmpfiles.d/ceph.conf (which sets up the socket directory in
+# /var/run/ceph) are installed in place of the sysvinit scripts.
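To see which branches of the distro conditionals below will be taken on a given build host, the macros can be evaluated directly with rpm; this is just a quick check, not part of the build, and the output depends entirely on the host's macro set:

$ rpm --eval '%{?fedora} %{?rhel} %{?suse_version}'
$ rpm --eval '%{?_udevrulesdir}'

If a macro is undefined, %{?...} expands to nothing, which is why the spec supplies the _udevrulesdir fallback above.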
+%if 0%{?fedora} || 0%{?rhel} >= 7 || 0%{?suse_version} > 1300 +%global _with_systemd 1 +%endif + Name: ceph Version: @VERSION@ Release: @RPM_RELEASE@%{?dist} @@ -17,80 +46,113 @@ License: GPL-2.0 Group: System Environment/Base URL: http://ceph.com/ Source0: http://ceph.com/download/%{name}-%{version}.tar.bz2 -%if 0%{?fedora} || 0%{?centos} || 0%{?rhel} +%if 0%{?fedora} || 0%{?rhel} Patch0: init-ceph.in-fedora.patch %endif +################################################################################# +# dependencies that apply across all distro families +################################################################################# Requires: librbd1 = %{epoch}:%{version}-%{release} Requires: librados2 = %{epoch}:%{version}-%{release} Requires: libcephfs1 = %{epoch}:%{version}-%{release} Requires: ceph-common = %{epoch}:%{version}-%{release} -Requires: python-ceph = %{epoch}:%{version}-%{release} +%if 0%{with selinux} +Requires: ceph-selinux = %{epoch}:%{version}-%{release} +%endif +Requires: python-rados = %{epoch}:%{version}-%{release} +Requires: python-rbd = %{epoch}:%{version}-%{release} +Requires: python-cephfs = %{epoch}:%{version}-%{release} Requires: python -Requires: python-argparse Requires: python-requests -Requires: python-flask +Requires: grep Requires: xfsprogs -Requires: cryptsetup +Requires: logrotate Requires: parted Requires: util-linux Requires: hdparm +Requires: cryptsetup +Requires: findutils +Requires: which Requires(post): binutils -BuildRequires: make +# We require this to be present for %%{_tmpfilesdir} +%if 0%{?_with_systemd} +Requires: systemd +%endif +%if 0%{with cephfs_java} +BuildRequires: sharutils +%endif +%if 0%{with selinux} +BuildRequires: checkpolicy +BuildRequires: selinux-policy-devel +BuildRequires: /usr/share/selinux/devel/policyhelp +%endif BuildRequires: gcc-c++ -BuildRequires: libtool BuildRequires: boost-devel -BuildRequires: bzip2-devel -BuildRequires: libedit-devel -BuildRequires: perl +BuildRequires: cryptsetup BuildRequires: gdbm -BuildRequires: pkgconfig -BuildRequires: python -BuildRequires: python-nose -BuildRequires: python-argparse +BuildRequires: hdparm BuildRequires: libaio-devel BuildRequires: libcurl-devel +BuildRequires: libedit-devel BuildRequires: libxml2-devel BuildRequires: libuuid-devel BuildRequires: libblkid-devel >= 2.17 BuildRequires: libudev-devel +BuildRequires: libtool BuildRequires: leveldb-devel > 1.2 +BuildRequires: make +BuildRequires: perl +BuildRequires: parted +BuildRequires: pkgconfig +BuildRequires: python +BuildRequires: python-nose +BuildRequires: python-requests +%if ( 0%{?rhel} > 0 && 0%{?rhel} < 7 ) || ( 0%{?centos} > 0 && 0%{?centos} < 7 ) +BuildRequires: python-sphinx10 +%endif +%if 0%{?fedora} || 0%{defined suse_version} || 0%{?rhel} >= 7 || 0%{?centos} >= 7 +BuildRequires: python-sphinx +%endif + +BuildRequires: python-virtualenv +BuildRequires: snappy-devel +BuildRequires: util-linux +BuildRequires: xfsprogs BuildRequires: xfsprogs-devel +BuildRequires: xmlstarlet BuildRequires: yasm -%if 0%{?rhel} || 0%{?centos} || 0%{?fedora} -BuildRequires: snappy-devel -%endif ################################################################################# -# specific +# distro-conditional dependencies ################################################################################# -%if ! 
0%{?rhel} || 0%{?fedora} -BuildRequires: sharutils -%endif - %if 0%{defined suse_version} +Requires: python-Flask +BuildRequires: net-tools +BuildRequires: libbz2-devel %if 0%{?suse_version} > 1210 Requires: gptfdisk +%if 0%{with tcmalloc} BuildRequires: gperftools-devel +%endif %else Requires: scsirastools BuildRequires: google-perftools-devel %endif -Recommends: logrotate BuildRequires: %insserv_prereq BuildRequires: mozilla-nss-devel BuildRequires: keyutils-devel BuildRequires: libatomic-ops-devel -BuildRequires: fdupes %else -Requires: gdisk +BuildRequires: bzip2-devel BuildRequires: nss-devel BuildRequires: keyutils-libs-devel BuildRequires: libatomic_ops-devel Requires: gdisk Requires(post): chkconfig -Requires(preun):chkconfig -Requires(preun):initscripts +Requires(preun): chkconfig +Requires(preun): initscripts BuildRequires: gperftools-devel +Requires: python-flask %endif %description @@ -107,11 +169,23 @@ Summary: Ceph Common Group: System Environment/Base Requires: librbd1 = %{epoch}:%{version}-%{release} Requires: librados2 = %{epoch}:%{version}-%{release} -Requires: python-ceph = %{epoch}:%{version}-%{release} +Requires: python-rados = %{epoch}:%{version}-%{release} +Requires: python-rbd = %{epoch}:%{version}-%{release} +Requires: python-cephfs = %{epoch}:%{version}-%{release} Requires: python-requests +%if 0%{?rhel} || 0%{?fedora} Requires: redhat-lsb-core +%endif +%if 0%{defined suse_version} +Requires: lsb-release +%endif +# python-argparse is only needed in distros with Python 2.6 or lower +%if (0%{?rhel} && 0%{?rhel} <= 6) || (0%{?suse_version} && 0%{?suse_version} <= 1110) +Requires: python-argparse +BuildRequires: python-argparse +%endif %description -n ceph-common -common utilities to mount and interact with a ceph storage cluster +Common utilities to mount and interact with a ceph storage cluster. %package fuse Summary: Ceph fuse-based client @@ -131,20 +205,6 @@ BuildRequires: fuse-devel %description -n rbd-fuse FUSE based client to map Ceph rbd images to files -%package devel -Summary: Ceph headers -Group: Development/Libraries -License: LGPL-2.0 -Requires: %{name} = %{epoch}:%{version}-%{release} -Requires: librados2 = %{epoch}:%{version}-%{release} -Requires: libradosstriper1 = %{epoch}:%{version}-%{release} -Requires: librbd1 = %{epoch}:%{version}-%{release} -Requires: libcephfs1 = %{epoch}:%{version}-%{release} -Requires: libcephfs_jni1 = %{epoch}:%{version}-%{release} -%description devel -This package contains libraries and headers needed to develop programs -that use Ceph. - %package radosgw Summary: Rados REST gateway Group: Development/Libraries @@ -153,14 +213,15 @@ Requires: librados2 = %{epoch}:%{version}-%{release} %if 0%{defined suse_version} BuildRequires: libexpat-devel BuildRequires: FastCGI-devel -Requires: apache2-mod_fcgid -%else +%endif +%if 0%{?rhel} || 0%{?fedora} BuildRequires: expat-devel BuildRequires: fcgi-devel +Requires: mailcap %endif %description radosgw -radosgw is an S3 HTTP REST gateway for the RADOS object store. It is -implemented as a FastCGI module using libfcgi, and can be used in +This package is an S3 HTTP REST gateway for the RADOS object store. It +is implemented as a FastCGI module using libfcgi, and can be used in conjunction with any FastCGI capable web server. %if %{with ocf} @@ -180,7 +241,7 @@ managers such as Pacemaker. 
Summary: RADOS distributed object store client library
Group: System Environment/Libraries
License: LGPL-2.0
-%if 0%{?rhel} || 0%{?centos} || 0%{?fedora}
+%if 0%{?rhel} || 0%{?fedora}
Obsoletes: ceph-libs < %{epoch}:%{version}-%{release}
%endif
%description -n librados2
@@ -189,22 +250,53 @@ developed as part of the Ceph distributed storage system. This is a
shared library allowing applications to access the distributed object
store using a simple file-like interface.

+%package -n librados2-devel
+Summary: RADOS headers
+Group: Development/Libraries
+License: LGPL-2.0
+Requires: librados2 = %{epoch}:%{version}-%{release}
+Obsoletes: ceph-devel < %{epoch}:%{version}-%{release}
+%description -n librados2-devel
+This package contains libraries and headers needed to develop programs
+that use the RADOS object store.
+
+%package -n python-rados
+Summary: Python libraries for the RADOS object store
+Group: System Environment/Libraries
+License: LGPL-2.0
+Requires: librados2 = %{epoch}:%{version}-%{release}
+Obsoletes: python-ceph < %{epoch}:%{version}-%{release}
+%description -n python-rados
+This package contains Python libraries for interacting with Ceph's RADOS
+object store.
+
%package -n libradosstriper1
Summary: RADOS striping interface
Group: System Environment/Libraries
License: LGPL-2.0
-Requires: librados2 = %{epoch}:%{version}
+Requires: librados2 = %{epoch}:%{version}-%{release}
%description -n libradosstriper1
Striping interface built on top of the rados library, allowing
to stripe bigger objects onto several standard rados objects using
an interface very similar to the rados one.

+%package -n libradosstriper1-devel
+Summary: RADOS striping interface headers
+Group: Development/Libraries
+License: LGPL-2.0
+Requires: libradosstriper1 = %{epoch}:%{version}-%{release}
+Requires: librados2-devel = %{epoch}:%{version}-%{release}
+Obsoletes: ceph-devel < %{epoch}:%{version}-%{release}
+%description -n libradosstriper1-devel
+This package contains libraries and headers needed to develop programs
+that use the RADOS striping interface.
+
%package -n librbd1
Summary: RADOS block device client library
Group: System Environment/Libraries
License: LGPL-2.0
Requires: librados2 = %{epoch}:%{version}-%{release}
-%if 0%{?rhel} || 0%{?centos} || 0%{?fedora}
+%if 0%{?rhel} || 0%{?fedora}
Obsoletes: ceph-libs < %{epoch}:%{version}-%{release}
%endif
%description -n librbd1
@@ -213,11 +305,33 @@ RADOS, a reliable, autonomic distributed object storage cluster
developed as part of the Ceph distributed storage system. This is a
shared library allowing applications to manage these block devices.

+%package -n librbd1-devel
+Summary: RADOS block device headers
+Group: Development/Libraries
+License: LGPL-2.0
+Requires: librbd1 = %{epoch}:%{version}-%{release}
+Requires: librados2-devel = %{epoch}:%{version}-%{release}
+Obsoletes: ceph-devel < %{epoch}:%{version}-%{release}
+%description -n librbd1-devel
+This package contains libraries and headers needed to develop programs
+that use the RADOS block device.
+
+%package -n python-rbd
+Summary: Python libraries for the RADOS block device
+Group: System Environment/Libraries
+License: LGPL-2.0
+Requires: librbd1 = %{epoch}:%{version}-%{release}
+Requires: python-rados = %{epoch}:%{version}-%{release}
+Obsoletes: python-ceph < %{epoch}:%{version}-%{release}
+%description -n python-rbd
+This package contains Python libraries for interacting with Ceph's RADOS
+block device.
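A quick, hypothetical way to confirm that the split Python bindings declared above are installed and importable on a target host (this check is illustrative only and not part of the spec):

$ python -c 'import rados, rbd' && echo 'bindings OK'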
+
%package -n libcephfs1
Summary: Ceph distributed file system client library
Group: System Environment/Libraries
License: LGPL-2.0
-%if 0%{?rhel} || 0%{?centos} || 0%{?fedora}
+%if 0%{?rhel} || 0%{?fedora}
Obsoletes: ceph-libs < %{epoch}:%{version}-%{release}
Obsoletes: ceph-libcephfs
%endif
@@ -227,43 +341,45 @@ performance, reliability, and scalability. This is a shared
library allowing applications to access a Ceph distributed file system
via a POSIX-like interface.

-%package -n python-ceph
-Summary: Python libraries for the Ceph distributed filesystem
-Group: System Environment/Libraries
+%package -n libcephfs1-devel
+Summary: Ceph distributed file system headers
+Group: Development/Libraries
License: LGPL-2.0
-Requires: librados2 = %{epoch}:%{version}-%{release}
-Requires: librbd1 = %{epoch}:%{version}-%{release}
-%if 0%{defined suse_version}
-%py_requires
-%endif
-%description -n python-ceph
-This package contains Python libraries for interacting with Cephs RADOS
-object storage.
+Requires: libcephfs1 = %{epoch}:%{version}-%{release}
+Requires: librados2-devel = %{epoch}:%{version}-%{release}
+Obsoletes: ceph-devel < %{epoch}:%{version}-%{release}
+%description -n libcephfs1-devel
+This package contains libraries and headers needed to develop programs
+that use Ceph's distributed file system.

-%package -n rest-bench
-Summary: RESTful benchmark
+%package -n python-cephfs
+Summary: Python libraries for Ceph distributed file system
Group: System Environment/Libraries
License: LGPL-2.0
-Requires: ceph-common = %{epoch}:%{version}-%{release}
-%description -n rest-bench
-RESTful bencher that can be used to benchmark radosgw performance.
+Requires: libcephfs1 = %{epoch}:%{version}-%{release}
+Requires: python-rados = %{epoch}:%{version}-%{release}
+Obsoletes: python-ceph < %{epoch}:%{version}-%{release}
+%description -n python-cephfs
+This package contains Python libraries for interacting with Ceph's
+distributed file system.

%package -n ceph-test
Summary: Ceph benchmarks and test tools
Group: System Environment/Libraries
License: LGPL-2.0
-Requires: librados2 = %{epoch}:%{version}-%{release}
-Requires: librbd1 = %{epoch}:%{version}-%{release}
-Requires: libcephfs1 = %{epoch}:%{version}-%{release}
-%if (0%{?fedora} >= 20 || 0%{?rhel} == 6)
+Requires: ceph-common
+Requires: xmlstarlet
+%if (0%{?fedora} || 0%{?rhel} == 6)
BuildRequires: lttng-ust-devel
BuildRequires: libbabeltrace-devel
%endif
%description -n ceph-test
This package contains Ceph benchmarks and test tools.

+%if 0%{with cephfs_java}
+
%package -n libcephfs_jni1
-Summary: Java Native Interface library for CephFS Java bindings.
+Summary: Java Native Interface library for CephFS Java bindings
Group: System Environment/Libraries
License: LGPL-2.0
Requires: java
@@ -273,20 +389,56 @@ BuildRequires: java-devel
This package contains the Java Native Interface library for CephFS Java
bindings.

+%package -n libcephfs_jni1-devel
+Summary: Development files for CephFS Java Native Interface library
+Group: System Environment/Libraries
+License: LGPL-2.0
+Requires: java
+Requires: libcephfs_jni1 = %{epoch}:%{version}-%{release}
+Obsoletes: ceph-devel < %{epoch}:%{version}-%{release}
+%description -n libcephfs_jni1-devel
+This package contains the development files for CephFS Java Native Interface
+library.
+
%package -n cephfs-java
-Summary: Java libraries for the Ceph File System.
+Summary: Java libraries for the Ceph File System
Group: System Environment/Libraries
License: LGPL-2.0
Requires: java
Requires: libcephfs_jni1 = %{epoch}:%{version}-%{release}
BuildRequires: java-devel
+%if 0%{?el6}
Requires: junit4
BuildRequires: junit4
+%else
+Requires: junit
+BuildRequires: junit
+%endif
%description -n cephfs-java
This package contains the Java libraries for the Ceph File System.

+%endif
+
+%if 0%{with selinux}
+
+%package selinux
+Summary: SELinux support for Ceph MON, OSD and MDS
+Group: System Environment/Base
+Requires: %{name}
+Requires: policycoreutils, libselinux-utils
+Requires(post): selinux-policy >= %{_selinux_policy_version}, policycoreutils
+Requires(postun): policycoreutils
+%description selinux
+This package contains SELinux support for Ceph MON, OSD and MDS. The package
+also performs file-system relabelling, which can take a long time on heavily
+populated file-systems.
+
+%endif
+
+%if 0%{with libs_compat}
+
%package libs-compat
-Summary: Meta package to include ceph libraries.
+Summary: Meta package to include ceph libraries
Group: System Environment/Libraries
License: LGPL-2.0
Obsoletes: ceph-libs
@@ -302,6 +454,43 @@ former ceph-libs package, which is now split up into these three subpackages.
Packages still depending on ceph-libs should be fixed to depend on librados2,
librbd1 or libcephfs1 instead.
+%endif
+
+%package devel-compat
+Summary: Compatibility package for Ceph headers
+Group: Development/Libraries
+License: LGPL-2.0
+Obsoletes: ceph-devel
+Requires: %{name} = %{epoch}:%{version}-%{release}
+Requires: librados2-devel = %{epoch}:%{version}-%{release}
+Requires: libradosstriper1-devel = %{epoch}:%{version}-%{release}
+Requires: librbd1-devel = %{epoch}:%{version}-%{release}
+Requires: libcephfs1-devel = %{epoch}:%{version}-%{release}
+%if 0%{with cephfs_java}
+Requires: libcephfs_jni1-devel = %{epoch}:%{version}-%{release}
+%endif
+Provides: ceph-devel
+%description devel-compat
+This is a compatibility package to accommodate the split of ceph-devel into
+librados2-devel, librbd1-devel and libcephfs1-devel. Packages still depending
+on ceph-devel should be fixed to depend on librados2-devel, librbd1-devel,
+libcephfs1-devel or libradosstriper1-devel instead.
+
+%package -n python-ceph-compat
+Summary: Compatibility package for Ceph's Python libraries
+Group: System Environment/Libraries
+License: LGPL-2.0
+Obsoletes: python-ceph
+Requires: python-rados = %{epoch}:%{version}-%{release}
+Requires: python-rbd = %{epoch}:%{version}-%{release}
+Requires: python-cephfs = %{epoch}:%{version}-%{release}
+Provides: python-ceph
+%description -n python-ceph-compat
+This is a compatibility package to accommodate the split of python-ceph into
+python-rados, python-rbd and python-cephfs. Packages still depending on
+python-ceph should be fixed to depend on python-rados, python-rbd or
+python-cephfs instead.
+
%if 0%{?opensuse} || 0%{?suse_version}
%debug_package
%endif
@@ -311,15 +500,17 @@ librbd1 or libcephfs1 instead.
################################################################################# %prep %setup -q -%if 0%{?fedora} || 0%{?rhel} || 0%{?centos} +%if 0%{?fedora} || 0%{?rhel} %patch0 -p1 -b .init %endif %build +%if 0%{with cephfs_java} # Find jni.h for i in /usr/{lib64,lib}/jvm/java/include{,/linux}; do [ -d $i ] && java_inc="$java_inc -I$i" done +%endif ./autogen.sh MY_CONF_OPT="" @@ -332,15 +523,35 @@ export RPM_OPT_FLAGS=`echo $RPM_OPT_FLAGS | sed -e 's/i386/i486/'` --prefix=/usr \ --localstatedir=/var \ --sysconfdir=/etc \ +%if 0%{?_with_systemd} + --with-systemdsystemunitdir=%_unitdir \ +%endif --docdir=%{_docdir}/ceph \ + --with-man-pages \ + --mandir="%_mandir" \ --with-nss \ --without-cryptopp \ - --with-rest-bench \ --with-debug \ +%if 0%{with cephfs_java} --enable-cephfs-java \ +%endif +%if 0%{with selinux} + --with-selinux \ +%endif --with-librocksdb-static=check \ +%if 0%{?rhel} || 0%{?fedora} + --with-systemd-libexec-dir=/usr/libexec/ceph \ + --with-rgw-user=root \ + --with-rgw-group=root \ +%endif +%if 0%{?opensuse} || 0%{?suse_version} + --with-systemd-libexec-dir=/usr/lib/ceph/ \ + --with-rgw-user=wwwrun \ + --with-rgw-group=www \ +%endif $MY_CONF_OPT \ %{?_with_ocf} \ + %{?_with_tcmalloc} \ CFLAGS="$RPM_OPT_FLAGS" CXXFLAGS="$RPM_OPT_FLAGS" # fix bug in specific version of libedit-devel @@ -354,38 +565,68 @@ sed -i -e "s/-lcurses/-lncurses/g" src/java/Makefile make -j$(getconf _NPROCESSORS_ONLN) + +%if 0%{with tests} +%check +# run in-tree unittests +make -j$(getconf _NPROCESSORS_ONLN) check-local + +%endif + + + %install make DESTDIR=$RPM_BUILD_ROOT install find $RPM_BUILD_ROOT -type f -name "*.la" -exec rm -f {} ';' find $RPM_BUILD_ROOT -type f -name "*.a" -exec rm -f {} ';' -install -D src/init-ceph $RPM_BUILD_ROOT%{_initrddir}/ceph -install -D src/init-radosgw.sysv $RPM_BUILD_ROOT%{_initrddir}/ceph-radosgw -install -D src/init-rbdmap $RPM_BUILD_ROOT%{_initrddir}/rbdmap install -D src/rbdmap $RPM_BUILD_ROOT%{_sysconfdir}/ceph/rbdmap +install -D src/init-rbdmap $RPM_BUILD_ROOT%{_initrddir}/rbdmap +%if 0%{?_with_systemd} + install -m 0644 -D systemd/ceph.tmpfiles.d $RPM_BUILD_ROOT%{_tmpfilesdir}/%{name}.conf + install -m 0644 -D systemd/ceph-rgw.tmpfiles.d $RPM_BUILD_ROOT%{_tmpfilesdir}/%{name}-rgw.conf + install -m 0644 -D systemd/ceph-osd@.service $RPM_BUILD_ROOT%{_unitdir}/ceph-osd@.service + install -m 0644 -D systemd/ceph-mon@.service $RPM_BUILD_ROOT%{_unitdir}/ceph-mon@.service + install -m 0644 -D systemd/ceph-create-keys@.service $RPM_BUILD_ROOT%{_unitdir}/ceph-create-keys@.service + install -m 0644 -D systemd/ceph-mds@.service $RPM_BUILD_ROOT%{_unitdir}/ceph-mds@.service + install -m 0644 -D systemd/ceph-radosgw@.service $RPM_BUILD_ROOT%{_unitdir}/ceph-radosgw@.service + install -m 0644 -D systemd/ceph.target $RPM_BUILD_ROOT%{_unitdir}/ceph.target + install -m 0644 -D systemd/ceph-disk-activate-journal@.service $RPM_BUILD_ROOT%{_unitdir}/ceph-disk-activate-journal@.service + install -m 0644 -D systemd/ceph-disk-activate@.service $RPM_BUILD_ROOT%{_unitdir}/ceph-disk-activate@.service + install -m 0644 -D systemd/ceph-disk-dmcrypt-activate@.service $RPM_BUILD_ROOT%{_unitdir}/ceph-disk-dmcrypt-activate@.service + install -m 0755 -D systemd/ceph $RPM_BUILD_ROOT%{_sbindir}/rcceph +%else + install -D src/init-ceph $RPM_BUILD_ROOT%{_initrddir}/ceph + install -D src/init-radosgw $RPM_BUILD_ROOT%{_initrddir}/ceph-radosgw + ln -sf ../../etc/init.d/ceph %{buildroot}/%{_sbindir}/rcceph + ln -sf ../../etc/init.d/ceph-radosgw %{buildroot}/%{_sbindir}/rcceph-radosgw +%endif mkdir 
-p $RPM_BUILD_ROOT%{_sbindir} -ln -sf ../../etc/init.d/ceph %{buildroot}/%{_sbindir}/rcceph -ln -sf ../../etc/init.d/ceph-radosgw %{buildroot}/%{_sbindir}/rcceph-radosgw install -m 0644 -D src/logrotate.conf $RPM_BUILD_ROOT%{_sysconfdir}/logrotate.d/ceph install -m 0644 -D src/rgw/logrotate.conf $RPM_BUILD_ROOT%{_sysconfdir}/logrotate.d/radosgw chmod 0644 $RPM_BUILD_ROOT%{_docdir}/ceph/sample.ceph.conf chmod 0644 $RPM_BUILD_ROOT%{_docdir}/ceph/sample.fetch_config -# udev rules -%if 0%{?rhel} >= 7 || 0%{?fedora} -install -m 0644 -D udev/50-rbd.rules $RPM_BUILD_ROOT/usr/lib/udev/rules.d/50-rbd.rules -install -m 0644 -D udev/60-ceph-partuuid-workaround.rules $RPM_BUILD_ROOT/usr/lib/udev/rules.d/60-ceph-partuuid-workaround.rules -%else -install -m 0644 -D udev/50-rbd.rules $RPM_BUILD_ROOT/lib/udev/rules.d/50-rbd.rules -install -m 0644 -D udev/60-ceph-partuuid-workaround.rules $RPM_BUILD_ROOT/lib/udev/rules.d/60-ceph-partuuid-workaround.rules +# firewall templates +%if 0%{?suse_version} +install -m 0644 -D etc/sysconfig/SuSEfirewall2.d/services/ceph-mon %{buildroot}%{_sysconfdir}/sysconfig/SuSEfirewall2.d/services/ceph-mon +install -m 0644 -D etc/sysconfig/SuSEfirewall2.d/services/ceph-osd-mds %{buildroot}%{_sysconfdir}/sysconfig/SuSEfirewall2.d/services/ceph-osd-mds %endif +# udev rules +install -m 0644 -D udev/50-rbd.rules $RPM_BUILD_ROOT%{_udevrulesdir}/50-rbd.rules +install -m 0644 -D udev/60-ceph-partuuid-workaround.rules $RPM_BUILD_ROOT%{_udevrulesdir}/60-ceph-partuuid-workaround.rules + %if (0%{?rhel} && 0%{?rhel} < 7) install -m 0644 -D udev/95-ceph-osd-alt.rules $RPM_BUILD_ROOT/lib/udev/rules.d/95-ceph-osd.rules %else +%if 0%{?_with_systemd} +install -m 0644 -D udev/95-ceph-osd.rules.systemd $RPM_BUILD_ROOT/lib/udev/rules.d/95-ceph-osd.rules +%else install -m 0644 -D udev/95-ceph-osd.rules $RPM_BUILD_ROOT/lib/udev/rules.d/95-ceph-osd.rules %endif +%endif -%if 0%{?rhel} >= 7 || 0%{?fedora} +%if 0%{?rhel} >= 7 || 0%{?fedora} || 0%{?suse_version} mv $RPM_BUILD_ROOT/lib/udev/rules.d/95-ceph-osd.rules $RPM_BUILD_ROOT/usr/lib/udev/rules.d/95-ceph-osd.rules mv $RPM_BUILD_ROOT/sbin/mount.ceph $RPM_BUILD_ROOT/usr/sbin/mount.ceph mv $RPM_BUILD_ROOT/sbin/mount.fuse.ceph $RPM_BUILD_ROOT/usr/sbin/mount.fuse.ceph @@ -393,38 +634,73 @@ mv $RPM_BUILD_ROOT/sbin/mount.fuse.ceph $RPM_BUILD_ROOT/usr/sbin/mount.fuse.ceph #set up placeholder directories mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/ceph +%if (! 0%{?suse_version}) || ( 0%{?suse_version} && (! 
0%{?_with_systemd}) ) mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/run/ceph +%endif mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/log/ceph mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/tmp mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/mon mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/osd mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/mds +mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/radosgw mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/bootstrap-osd mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/bootstrap-mds +mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/bootstrap-rgw mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/log/radosgw -%if %{defined suse_version} -# Fedora seems to have some problems with this macro, use it only on SUSE -%fdupes -s $RPM_BUILD_ROOT/%{python_sitelib} -%fdupes %buildroot -%endif - %clean rm -rf $RPM_BUILD_ROOT +%pre +%if 0%{?_with_systemd} + %if 0%{?opensuse} || 0%{?suse_version} + # service_add_pre and friends don't work with parameterized systemd service + # instances, only with single services or targets, so we always pass + # ceph.target to these macros + %service_add_pre ceph.target + %endif +%endif + + %post /sbin/ldconfig -/sbin/chkconfig --add ceph +%if 0%{?_with_systemd} + %if 0%{?opensuse} || 0%{?suse_version} + %service_add_post ceph.target + %endif +%else + /sbin/chkconfig --add ceph +%endif mkdir -p %{_localstatedir}/run/ceph/ %preun -%if %{defined suse_version} -%stop_on_removal ceph +%if 0%{?_with_systemd} + %if 0%{?opensuse} || 0%{?suse_version} + %service_del_preun ceph.target + %endif + # Need a special case here when removing the RPM to disable specific + # service instance, or stale symlinks will be left lying around in + # /etc/systemd/system. May as well stop them too for completeness + # (although strictly service_del_preun would do that anyway by dint + # of stopping ceph.target) + SERVICE_LIST=$(systemctl | grep -E '^ceph-mon@|^ceph-create-keys@|^ceph-osd@|^ceph-mds@|^ceph-disk-' | cut -d' ' -f1) + if [ -n "$SERVICE_LIST" ]; then + for SERVICE in $SERVICE_LIST; do + /usr/bin/systemctl --no-reload disable $SERVICE > /dev/null 2>&1 || : + /usr/bin/systemctl stop $SERVICE > /dev/null 2>&1 || : + done + fi +%else + %if 0%{?opensuse} || 0%{?suse_version} + %stop_on_removal ceph + %endif + %if 0%{?rhel} || 0%{?fedora} + if [ $1 = 0 ] ; then + /sbin/service ceph stop >/dev/null 2>&1 + /sbin/chkconfig --del ceph + fi + %endif %endif -if [ $1 = 0 ] ; then - /sbin/service ceph stop >/dev/null 2>&1 - /sbin/chkconfig --del ceph -fi %postun /sbin/ldconfig @@ -432,7 +708,6 @@ fi %insserv_cleanup %endif - ################################################################################# # files ################################################################################# @@ -452,22 +727,36 @@ fi %{_bindir}/ceph-run %{_bindir}/ceph-mon %{_bindir}/ceph-mds +%{_bindir}/ceph-objectstore-tool %{_bindir}/ceph-osd -%{_bindir}/ceph-rbdnamer +%{_bindir}/ceph-detect-init %{_bindir}/librados-config %{_bindir}/ceph-client-debug %{_bindir}/cephfs-journal-tool +%{_bindir}/cephfs-table-tool +%{_bindir}/cephfs-data-scan %{_bindir}/ceph-debugpack %{_bindir}/ceph-coverage -%{_bindir}/ceph_mon_store_converter +%if 0%{?_with_systemd} +%{_tmpfilesdir}/%{name}.conf +%{_unitdir}/ceph-mds@.service +%{_unitdir}/ceph-mon@.service +%{_unitdir}/ceph-create-keys@.service +%{_unitdir}/ceph-osd@.service +%{_unitdir}/ceph-radosgw@.service +%{_unitdir}/ceph-disk-activate-journal@.service +%{_unitdir}/ceph-disk-activate@.service 
+%{_unitdir}/ceph-disk-dmcrypt-activate@.service +%{_unitdir}/ceph.target +%else %{_initrddir}/ceph +%{_sbindir}/rcceph +%endif %{_sbindir}/ceph-disk -%{_sbindir}/ceph-disk-activate -%{_sbindir}/ceph-disk-prepare %{_sbindir}/ceph-disk-udev %{_sbindir}/ceph-create-keys %{_sbindir}/rcceph -%if 0%{?rhel} >= 7 || 0%{?fedora} +%if 0%{?rhel} >= 7 || 0%{?fedora} || 0%{?suse_version} %{_sbindir}/mount.ceph %else /sbin/mount.ceph @@ -476,6 +765,7 @@ fi %{_libdir}/ceph/ceph_common.sh %{_libexecdir}/ceph/ceph-osd-prestart.sh %dir %{_libdir}/rados-classes +%{_libdir}/rados-classes/libcls_cephfs.so* %{_libdir}/rados-classes/libcls_rbd.so* %{_libdir}/rados-classes/libcls_hello.so* %{_libdir}/rados-classes/libcls_rgw.so* @@ -489,17 +779,19 @@ fi %{_libdir}/rados-classes/libcls_version.so* %dir %{_libdir}/ceph/erasure-code %{_libdir}/ceph/erasure-code/libec_*.so* -%if 0%{?rhel} >= 7 || 0%{?fedora} -/usr/lib/udev/rules.d/60-ceph-partuuid-workaround.rules -/usr/lib/udev/rules.d/95-ceph-osd.rules -%else -/lib/udev/rules.d/60-ceph-partuuid-workaround.rules -/lib/udev/rules.d/95-ceph-osd.rules -%endif +%{_udevrulesdir}/60-ceph-partuuid-workaround.rules +%{_udevrulesdir}/95-ceph-osd.rules %config %{_sysconfdir}/bash_completion.d/ceph %config(noreplace) %{_sysconfdir}/logrotate.d/ceph -%config(noreplace) %{_sysconfdir}/logrotate.d/radosgw +%if 0%{?suse_version} +%config %{_sysconfdir}/sysconfig/SuSEfirewall2.d/services/ceph-mon +%config %{_sysconfdir}/sysconfig/SuSEfirewall2.d/services/ceph-osd-mds +%endif +%{python_sitelib}/ceph_detect_init* +%{_mandir}/man8/ceph-deploy.8* +%{_mandir}/man8/ceph-detect-init.8* %{_mandir}/man8/ceph-disk.8* +%{_mandir}/man8/ceph-create-keys.8* %{_mandir}/man8/ceph-mon.8* %{_mandir}/man8/ceph-mds.8* %{_mandir}/man8/ceph-osd.8* @@ -510,10 +802,9 @@ fi %{_mandir}/man8/monmaptool.8* %{_mandir}/man8/cephfs.8* %{_mandir}/man8/mount.ceph.8* -%{_mandir}/man8/ceph-rbdnamer.8* %{_mandir}/man8/ceph-debugpack.8* -%{_mandir}/man8/ceph-clsinfo.8.gz -%{_mandir}/man8/librados-config.8.gz +%{_mandir}/man8/ceph-clsinfo.8* +%{_mandir}/man8/librados-config.8* #set up placeholder directories %dir %{_localstatedir}/lib/ceph/ %dir %{_localstatedir}/lib/ceph/tmp @@ -522,7 +813,10 @@ fi %dir %{_localstatedir}/lib/ceph/mds %dir %{_localstatedir}/lib/ceph/bootstrap-osd %dir %{_localstatedir}/lib/ceph/bootstrap-mds -%ghost %dir %{_localstatedir}/run/ceph/ +%dir %{_localstatedir}/lib/ceph/bootstrap-rgw +%if (! 0%{?suse_version}) || ( 0%{?suse_version} && (! 
0%{?_with_systemd}) )
+%dir %{_localstatedir}/run/ceph/
+%endif

#################################################################################
%files -n ceph-common
@@ -531,6 +825,7 @@ fi
%{_bindir}/ceph
%{_bindir}/ceph-authtool
%{_bindir}/ceph-conf
%{_bindir}/ceph-dencoder
+%{_bindir}/ceph-rbdnamer
%{_bindir}/ceph-syn
%{_bindir}/ceph-crush-location
%{_bindir}/rados
@@ -540,6 +835,7 @@ fi
%{_mandir}/man8/ceph-authtool.8*
%{_mandir}/man8/ceph-conf.8*
%{_mandir}/man8/ceph-dencoder.8*
+%{_mandir}/man8/ceph-rbdnamer.8*
%{_mandir}/man8/ceph-syn.8*
%{_mandir}/man8/ceph-post-file.8*
%{_mandir}/man8/ceph.8*
@@ -550,10 +846,15 @@ fi
%{_datadir}/ceph/id_dsa_drop.ceph.com.pub
%dir %{_sysconfdir}/ceph/
%dir %{_localstatedir}/log/ceph/
+%dir %{_datarootdir}/ceph/
+%dir %{_libexecdir}/ceph/
%config %{_sysconfdir}/bash_completion.d/rados
%config %{_sysconfdir}/bash_completion.d/rbd
%config(noreplace) %{_sysconfdir}/ceph/rbdmap
%{_initrddir}/rbdmap
+%{python_sitelib}/ceph_argparse.py*
+%{python_sitelib}/ceph_daemon.py*
+%{_udevrulesdir}/50-rbd.rules

%postun -n ceph-common
# Package removal cleanup
@@ -567,7 +868,7 @@ fi
%defattr(-,root,root,-)
%{_bindir}/ceph-fuse
%{_mandir}/man8/ceph-fuse.8*
-%if 0%{?rhel} >= 7 || 0%{?fedora}
+%if 0%{?rhel} >= 7 || 0%{?fedora} || 0%{?suse_version}
%{_sbindir}/mount.fuse.ceph
%else
/sbin/mount.fuse.ceph
@@ -579,67 +880,77 @@ fi
%{_bindir}/rbd-fuse
%{_mandir}/man8/rbd-fuse.8*

-#################################################################################
-%files devel
-%defattr(-,root,root,-)
-%dir %{_includedir}/cephfs
-%{_includedir}/cephfs/libcephfs.h
-%dir %{_includedir}/rados
-%{_includedir}/rados/librados.h
-%{_includedir}/rados/librados.hpp
-%{_includedir}/rados/buffer.h
-%{_includedir}/rados/page.h
-%{_includedir}/rados/crc32c.h
-%{_includedir}/rados/rados_types.h
-%{_includedir}/rados/rados_types.hpp
-%{_includedir}/rados/memory.h
-%dir %{_includedir}/radosstriper
-%{_includedir}/radosstriper/libradosstriper.h
-%{_includedir}/radosstriper/libradosstriper.hpp
-%dir %{_includedir}/rbd
-%{_includedir}/rbd/librbd.h
-%{_includedir}/rbd/librbd.hpp
-%{_includedir}/rbd/features.h
-%{_libdir}/libcephfs.so
-%{_libdir}/librbd.so
-%{_libdir}/librados.so
-%{_libdir}/libradosstriper.so
-%{_libdir}/libcephfs_jni.so
-
#################################################################################
%files radosgw
%defattr(-,root,root,-)
-%{_initrddir}/ceph-radosgw
%{_bindir}/radosgw
%{_bindir}/radosgw-admin
%{_mandir}/man8/radosgw.8*
%{_mandir}/man8/radosgw-admin.8*
-%{_sbindir}/rcceph-radosgw
+%config(noreplace) %{_sysconfdir}/logrotate.d/radosgw
%config %{_sysconfdir}/bash_completion.d/radosgw-admin
%dir %{_localstatedir}/log/radosgw/
+%dir %{_localstatedir}/lib/ceph/radosgw
+%if 0%{?_with_systemd}
+%{_tmpfilesdir}/%{name}-rgw.conf
+%else
+%{_initrddir}/ceph-radosgw
+%{_sbindir}/rcceph-radosgw
+%endif

%post radosgw
/sbin/ldconfig
-%if %{defined suse_version}
-%fillup_and_insserv -f -y ceph-radosgw
+%if 0%{?opensuse} || 0%{?suse_version}
+  # TODO: find out what exactly this systemd-tmpfiles invocation is for
+  systemd-tmpfiles --create /%{_tmpfilesdir}/ceph-rgw.conf
+  # Explicit systemctl daemon-reload: that is the only relevant bit of
+  # service_add_post; the rest is all sysvinit --> systemd migration, which
+  # isn't applicable in this context (see the comment above).
+ /usr/bin/systemctl daemon-reload >/dev/null 2>&1 || : +%else + %if 0%{?suse_version} || 0%{?opensuse} + %fillup_and_insserv -f -y ceph-radosgw + %endif %endif %preun radosgw -%if %{defined suse_version} -%stop_on_removal ceph-radosgw +%if 0%{?_with_systemd} + SERVICE_LIST=$(systemctl | grep -E '^ceph-radosgw@' | cut -d' ' -f1) + if [ -n "$SERVICE_LIST" ]; then + for SERVICE in $SERVICE_LIST; do + /usr/bin/systemctl --no-reload disable $SERVICE > /dev/null 2>&1 || : + /usr/bin/systemctl stop $SERVICE > /dev/null 2>&1 || : + done + fi +%else + %if 0%{?suse_version} || 0%{?opensuse} + %stop_on_removal ceph-radosgw + %endif %endif %postun radosgw /sbin/ldconfig -%if %{defined suse_version} -%restart_on_update ceph-radosgw -%insserv_cleanup +%if 0%{?_with_systemd} + SERVICE_LIST=$(systemctl | grep -E '^ceph-radosgw@' | cut -d' ' -f1) + if [ -n "$SERVICE_LIST" ]; then + for SERVICE in $SERVICE_LIST; do + /usr/bin/systemctl --no-reload disable $SERVICE > /dev/null 2>&1 || : + /usr/bin/systemctl try-restart $SERVICE > /dev/null 2>&1 || : + done + fi +%else + %if 0%{?suse_version} || 0%{?opensuse} + %restart_on_update ceph-radosgw + %endif %endif # Package removal cleanup if [ "$1" -eq "0" ] ; then rm -rf /var/log/radosgw fi +%if 0%{?suse_version} || 0%{?opensuse} + %insserv_cleanup +%endif ################################################################################# %if %{with ocf} @@ -662,6 +973,25 @@ fi %postun -n librados2 /sbin/ldconfig +################################################################################# +%files -n librados2-devel +%defattr(-,root,root,-) +%dir %{_includedir}/rados +%{_includedir}/rados/librados.h +%{_includedir}/rados/librados.hpp +%{_includedir}/rados/buffer.h +%{_includedir}/rados/page.h +%{_includedir}/rados/crc32c.h +%{_includedir}/rados/rados_types.h +%{_includedir}/rados/rados_types.hpp +%{_includedir}/rados/memory.h +%{_libdir}/librados.so + +################################################################################# +%files -n python-rados +%defattr(-,root,root,-) +%{python_sitelib}/rados.py* + ################################################################################# %files -n libradosstriper1 %defattr(-,root,root,-) @@ -673,15 +1003,18 @@ fi %postun -n libradosstriper1 /sbin/ldconfig +################################################################################# +%files -n libradosstriper1-devel +%defattr(-,root,root,-) +%dir %{_includedir}/radosstriper +%{_includedir}/radosstriper/libradosstriper.h +%{_includedir}/radosstriper/libradosstriper.hpp +%{_libdir}/libradosstriper.so + ################################################################################# %files -n librbd1 %defattr(-,root,root,-) %{_libdir}/librbd.so.* -%if 0%{?rhel} >= 7 || 0%{?fedora} -/usr/lib/udev/rules.d/50-rbd.rules -%else -/lib/udev/rules.d/50-rbd.rules -%endif %post -n librbd1 /sbin/ldconfig @@ -691,6 +1024,20 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1 %postun -n librbd1 /sbin/ldconfig +################################################################################# +%files -n librbd1-devel +%defattr(-,root,root,-) +%dir %{_includedir}/rbd +%{_includedir}/rbd/librbd.h +%{_includedir}/rbd/librbd.hpp +%{_includedir}/rbd/features.h +%{_libdir}/librbd.so + +################################################################################# +%files -n python-rbd +%defattr(-,root,root,-) +%{python_sitelib}/rbd.py* + ################################################################################# %files -n libcephfs1 %defattr(-,root,root,-) 
@@ -703,17 +1050,16 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1 /sbin/ldconfig ################################################################################# -%files -n python-ceph +%files -n libcephfs1-devel %defattr(-,root,root,-) -%{python_sitelib}/rados.py* -%{python_sitelib}/rbd.py* -%{python_sitelib}/cephfs.py* -%{python_sitelib}/ceph_argparse.py* +%dir %{_includedir}/cephfs +%{_includedir}/cephfs/libcephfs.h +%{_libdir}/libcephfs.so ################################################################################# -%files -n rest-bench +%files -n python-cephfs %defattr(-,root,root,-) -%{_bindir}/rest-bench +%{python_sitelib}/cephfs.py* ################################################################################# %files -n ceph-test @@ -724,6 +1070,10 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1 %{_bindir}/ceph_erasure_code %{_bindir}/ceph_erasure_code_benchmark %{_bindir}/ceph_omapbench +%{_bindir}/ceph_perf_objectstore +%{_bindir}/ceph_perf_local +%{_bindir}/ceph_perf_msgr_client +%{_bindir}/ceph_perf_msgr_server %{_bindir}/ceph_psim %{_bindir}/ceph_radosacl %{_bindir}/ceph_rgw_jsonparser @@ -734,7 +1084,6 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1 %{_bindir}/ceph_smalliobenchdumb %{_bindir}/ceph_smalliobenchfs %{_bindir}/ceph_smalliobenchrbd -%{_bindir}/ceph_objectstore_tool %{_bindir}/ceph_streamtest %{_bindir}/ceph_test_* %{_bindir}/ceph_tpbench @@ -747,21 +1096,83 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1 %{_mandir}/man8/rbd-replay-prep.8* %{_bindir}/rbd-replay %{_bindir}/rbd-replay-many -%if (0%{?fedora} >= 20 || 0%{?rhel} == 6) +%if (0%{?fedora} || 0%{?rhel} == 6) %{_bindir}/rbd-replay-prep %endif +%dir %{_libdir}/ceph +%{_libdir}/ceph/ceph-monstore-update-crush.sh +################################################################################# +%if 0%{with cephfs_java} %files -n libcephfs_jni1 %defattr(-,root,root,-) %{_libdir}/libcephfs_jni.so.* +%post -n libcephfs_jni1 +/sbin/ldconfig + +%postun -n libcephfs_jni1 +/sbin/ldconfig + +################################################################################# +%files -n libcephfs_jni1-devel +%defattr(-,root,root,-) +%{_libdir}/libcephfs_jni.so + +################################################################################# %files -n cephfs-java %defattr(-,root,root,-) %{_javadir}/libcephfs.jar %{_javadir}/libcephfs-test.jar +%endif + +################################################################################# +%if 0%{with selinux} +%files selinux +%defattr(-,root,root,-) +%attr(0600,root,root) %{_datadir}/selinux/packages/ceph.pp +%{_datadir}/selinux/devel/include/contrib/ceph.if +%{_mandir}/man8/ceph_selinux.8.* + +%post selinux +/sbin/service ceph stop >/dev/null 2>&1 +semodule -n -i %{_datadir}/selinux/packages/ceph.pp +if /usr/sbin/selinuxenabled ; then + /usr/sbin/load_policy + %relabel_files +fi +/sbin/service ceph start >/dev/null 2>&1 +exit 0 + +%postun selinux +if [ $1 -eq 0 ]; then + /sbin/service ceph stop >/dev/null 2>&1 + semodule -n -r ceph + if /usr/sbin/selinuxenabled ; then + /usr/sbin/load_policy + %relabel_files + fi; + /sbin/service ceph start >/dev/null 2>&1 +fi; +exit 0 + +%endif +################################################################################# +%if 0%{with libs_compat} %files libs-compat -# We need an empty %files list for ceph-libs-compat, to tell rpmbuild to actually +# We need an empty %%files list for ceph-libs-compat, to tell rpmbuild to actually # build this meta package. 
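Because the compat packages intentionally own no files, querying their contents returns nothing; what they actually deliver shows up through their dependencies instead. A hedged illustration, assuming the packages built from this spec are installed:

$ rpm -ql ceph-libs-compat        # expected to report '(contains no files)'
$ rpm -qR ceph-libs-compat        # expected to list librados2, librbd1, libcephfs1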
+################################################################################# +%files devel-compat +# We need an empty %%files list for ceph-devel-compat, to tell rpmbuild to +# actually build this meta package. +%endif + +################################################################################# +%files -n python-ceph-compat +# We need an empty %%files list for python-ceph-compat, to tell rpmbuild to +# actually build this meta package. + %changelog diff --git a/cmake/modules/FindNSPR.cmake b/cmake/modules/FindNSPR.cmake new file mode 100644 index 0000000000000..48f92951657e8 --- /dev/null +++ b/cmake/modules/FindNSPR.cmake @@ -0,0 +1,103 @@ +# - Try to find NSPR +# Once done this will define +# +# NSPR_FOUND - system has NSPR +# NSPR_INCLUDE_DIRS - the NSPR include directory +# NSPR_LIBRARIES - Link these to use NSPR +# NSPR_DEFINITIONS - Compiler switches required for using NSPR +# +# Copyright (c) 2010 Andreas Schneider +# +# Redistribution and use is allowed according to the terms of the New +# BSD license. +# For details see the accompanying COPYING-CMAKE-SCRIPTS file. +# + + +if (NSPR_LIBRARIES AND NSPR_INCLUDE_DIRS) + # in cache already + set(NSPR_FOUND TRUE) +else (NSPR_LIBRARIES AND NSPR_INCLUDE_DIRS) + find_package(PkgConfig) + if (PKG_CONFIG_FOUND) + pkg_check_modules(_NSPR nspr) + endif (PKG_CONFIG_FOUND) + + find_path(NSPR_INCLUDE_DIR + NAMES + nspr.h + PATHS + ${_NSPR_INCLUDEDIR} + /usr/include + /usr/local/include + /opt/local/include + /sw/include + PATH_SUFFIXES + nspr4 + ) + + find_library(PLDS4_LIBRARY + NAMES + plds4 + PATHS + ${_NSPR_LIBDIR} + /usr/lib + /usr/local/lib + /opt/local/lib + /sw/lib + ) + + find_library(PLC4_LIBRARY + NAMES + plc4 + PATHS + ${_NSPR_LIBDIR} + /usr/lib + /usr/local/lib + /opt/local/lib + /sw/lib + ) + + find_library(NSPR4_LIBRARY + NAMES + nspr4 + PATHS + ${_NSPR_LIBDIR} + /usr/lib + /usr/local/lib + /opt/local/lib + /sw/lib + ) + + set(NSPR_INCLUDE_DIRS + ${NSPR_INCLUDE_DIR} + ) + + if (PLDS4_LIBRARY) + set(NSPR_LIBRARIES + ${NSPR_LIBRARIES} + ${PLDS4_LIBRARY} + ) + endif (PLDS4_LIBRARY) + + if (PLC4_LIBRARY) + set(NSPR_LIBRARIES + ${NSPR_LIBRARIES} + ${PLC4_LIBRARY} + ) + endif (PLC4_LIBRARY) + + if (NSPR4_LIBRARY) + set(NSPR_LIBRARIES + ${NSPR_LIBRARIES} + ${NSPR4_LIBRARY} + ) + endif (NSPR4_LIBRARY) + + include(FindPackageHandleStandardArgs) + find_package_handle_standard_args(NSPR DEFAULT_MSG NSPR_LIBRARIES NSPR_INCLUDE_DIRS) + + # show the NSPR_INCLUDE_DIRS and NSPR_LIBRARIES variables only in the advanced view + mark_as_advanced(NSPR_INCLUDE_DIRS NSPR_LIBRARIES) + +endif (NSPR_LIBRARIES AND NSPR_INCLUDE_DIRS) diff --git a/cmake/modules/FindNSS.cmake b/cmake/modules/FindNSS.cmake new file mode 100644 index 0000000000000..e42907c1a0b94 --- /dev/null +++ b/cmake/modules/FindNSS.cmake @@ -0,0 +1,122 @@ +# - Try to find NSS +# Once done this will define +# +# NSS_FOUND - system has NSS +# NSS_INCLUDE_DIRS - the NSS include directory +# NSS_LIBRARIES - Link these to use NSS +# NSS_DEFINITIONS - Compiler switches required for using NSS +# +# Copyright (c) 2010 Andreas Schneider +# +# Redistribution and use is allowed according to the terms of the New +# BSD license. +# For details see the accompanying COPYING-CMAKE-SCRIPTS file. 
+# + + +if (NSS_LIBRARIES AND NSS_INCLUDE_DIRS) + # in cache already + set(NSS_FOUND TRUE) +else (NSS_LIBRARIES AND NSS_INCLUDE_DIRS) + find_package(PkgConfig) + if (PKG_CONFIG_FOUND) + pkg_check_modules(_NSS nss) + endif (PKG_CONFIG_FOUND) + + find_path(NSS_INCLUDE_DIR + NAMES + nss.h + PATHS + ${_NSS_INCLUDEDIR} + /usr/include + /usr/local/include + /opt/local/include + /sw/include + PATH_SUFFIXES + nss3 + ) + + find_library(SSL3_LIBRARY + NAMES + ssl3 + PATHS + ${_NSS_LIBDIR} + /usr/lib + /usr/local/lib + /opt/local/lib + /sw/lib + ) + + find_library(SMIME3_LIBRARY + NAMES + smime3 + PATHS + ${_NSS_LIBDIR} + /usr/lib + /usr/local/lib + /opt/local/lib + /sw/lib + ) + + find_library(NSS3_LIBRARY + NAMES + nss3 + PATHS + ${_NSS_LIBDIR} + /usr/lib + /usr/local/lib + /opt/local/lib + /sw/lib + /usr/lib/x86_64-linux-gnu + ) + + find_library(NSSUTIL3_LIBRARY + NAMES + nssutil3 + PATHS + ${_NSS_LIBDIR} + /usr/lib + /usr/local/lib + /opt/local/lib + /sw/lib + ) + + set(NSS_INCLUDE_DIRS + ${NSS_INCLUDE_DIR} + ) + + if (SSL3_LIBRARY) + set(NSS_LIBRARIES + ${NSS_LIBRARIES} + ${SSL3_LIBRARY} + ) + endif (SSL3_LIBRARY) + + if (SMIME3_LIBRARY) + set(NSS_LIBRARIES + ${NSS_LIBRARIES} + ${SMIME3_LIBRARY} + ) + endif (SMIME3_LIBRARY) + + if (NSS3_LIBRARY) + set(NSS_LIBRARIES + ${NSS_LIBRARIES} + ${NSS3_LIBRARY} + ) + endif (NSS3_LIBRARY) + + if (NSSUTIL3_LIBRARY) + set(NSS_LIBRARIES + ${NSS_LIBRARIES} + ${NSSUTIL3_LIBRARY} + ) + endif (NSSUTIL3_LIBRARY) + + include(FindPackageHandleStandardArgs) + find_package_handle_standard_args(NSS DEFAULT_MSG NSS_LIBRARIES NSS_INCLUDE_DIRS) + + # show the NSS_INCLUDE_DIRS and NSS_LIBRARIES variables only in the advanced view + mark_as_advanced(NSS_INCLUDE_DIRS NSS_LIBRARIES) + +endif (NSS_LIBRARIES AND NSS_INCLUDE_DIRS) diff --git a/cmake/modules/Findaio.cmake b/cmake/modules/Findaio.cmake new file mode 100644 index 0000000000000..ae2e36d3e164b --- /dev/null +++ b/cmake/modules/Findaio.cmake @@ -0,0 +1,45 @@ +# - Find AIO +# +# AIO_INCLUDE - Where to find AIO/aio.h +# AIO_LIBS - List of libraries when using AIO. +# AIO_FOUND - True if AIO found. + +get_filename_component(module_file_path ${CMAKE_CURRENT_LIST_FILE} PATH) + +# Look for the header file. +find_path(AIO_INCLUDE +NAMES aio.h +PATHS /usr/include $ENV{AIO_ROOT}/include /opt/local/include /usr/local/include +DOC "Path in which the file AIO/aio.h is located." ) + +mark_as_advanced(AIO_INCLUDE) + +# Look for the library. +# Does this work on UNIX systems? (LINUX) +find_library(AIO_LIBS +NAMES aio +PATHS /usr/lib /usr/lib/x86_64-linux-gnu $ENV{AIO_ROOT}/lib +DOC "Path to AIO library.") + +mark_as_advanced(AIO_LIBS) + +# Copy the results to the output variables. +if (AIO_INCLUDE AND AIO_LIBS) + message(STATUS "Found AIO in ${AIO_INCLUDE} ${AIO_LIBS}") + set(AIO_FOUND 1) + include(CheckCXXSourceCompiles) + set(CMAKE_REQUIRED_LIBRARY ${AIO_LIBS} pthread) + set(CMAKE_REQUIRED_INCLUDES ${AIO_INCLUDE}) + else () + set(AIO_FOUND 0) + endif () + + # Report the results. + if (NOT AIO_FOUND) + set(AIO_DIR_MESSAGE "AIO was not found. Make sure AIO_LIBS and AIO_INCLUDE are set.") + if (AIO_FIND_REQUIRED) + message(FATAL_ERROR "${AIO_DIR_MESSAGE}") + elseif (NOT AIO_FIND_QUIETLY) + message(STATUS "${AIO_DIR_MESSAGE}") + endif () + endif () diff --git a/cmake/modules/Findatomicops.cmake b/cmake/modules/Findatomicops.cmake new file mode 100644 index 0000000000000..18e8b03bc25f9 --- /dev/null +++ b/cmake/modules/Findatomicops.cmake @@ -0,0 +1,28 @@ +# - Find atomic_ops +# Find the native ATOMIC_OPS headers and libraries. 
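
Findaio.cmake just above reports through the nonstandard names AIO_INCLUDE and AIO_LIBS (not AIO_INCLUDE_DIRS/AIO_LIBRARIES), so a consumer has to spell them accordingly. A sketch, with a hypothetical target:

    find_package(aio REQUIRED)
    include_directories(${AIO_INCLUDE})
    add_executable(aio_probe aio_probe.c)              # hypothetical target
    target_link_libraries(aio_probe ${AIO_LIBS} pthread)
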
+#
+# ATOMIC_OPS_INCLUDE_DIRS - where to find atomic_ops.h, etc.
+# ATOMIC_OPS_LIBRARIES - List of libraries when using atomic_ops.
+# ATOMIC_OPS_FOUND - True if atomic_ops found.
+
+# Look for the header file.
+FIND_PATH(ATOMIC_OPS_INCLUDE_DIR NAMES atomic_ops.h)
+
+# Look for the library.
+FIND_LIBRARY(ATOMIC_OPS_LIBRARY NAMES atomic_ops)
+
+# handle the QUIETLY and REQUIRED arguments and set ATOMIC_OPS_FOUND to TRUE if
+# all listed variables are TRUE
+INCLUDE(FindPackageHandleStandardArgs)
+FIND_PACKAGE_HANDLE_STANDARD_ARGS(ATOMIC_OPS DEFAULT_MSG ATOMIC_OPS_LIBRARY ATOMIC_OPS_INCLUDE_DIR)
+
+# Copy the results to the output variables.
+IF(ATOMIC_OPS_FOUND)
+  SET(ATOMIC_OPS_LIBRARIES ${ATOMIC_OPS_LIBRARY})
+  SET(ATOMIC_OPS_INCLUDE_DIRS ${ATOMIC_OPS_INCLUDE_DIR})
+ELSE(ATOMIC_OPS_FOUND)
+  SET(ATOMIC_OPS_LIBRARIES)
+  SET(ATOMIC_OPS_INCLUDE_DIRS)
+ENDIF(ATOMIC_OPS_FOUND)
+
+MARK_AS_ADVANCED(ATOMIC_OPS_INCLUDE_DIR ATOMIC_OPS_LIBRARY)
diff --git a/cmake/modules/Findblkid.cmake b/cmake/modules/Findblkid.cmake
new file mode 100644
index 0000000000000..aa1518b5da369
--- /dev/null
+++ b/cmake/modules/Findblkid.cmake
@@ -0,0 +1,52 @@
+# Copyright (C) 2007-2012 Hypertable, Inc.
+#
+# This file is part of Hypertable.
+#
+# Hypertable is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 3
+# of the License, or any later version.
+#
+# Hypertable is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Hypertable. If not, see <http://www.gnu.org/licenses/>
+#
+
+# - Find libblkid
+# Find the blkid library and includes
+#
+# BLKID_INCLUDE_DIR - where to find blkid.h, etc.
+# BLKID_LIBRARIES - List of libraries when using blkid.
+# BLKID_FOUND - True if blkid found.
+
+find_path(BLKID_INCLUDE_DIR blkid/blkid.h)
+
+set(BLKID_NAMES ${BLKID_NAMES} blkid)
+find_library(BLKID_LIBRARY NAMES ${BLKID_NAMES})
+
+if (BLKID_INCLUDE_DIR AND BLKID_LIBRARY)
+  set(BLKID_FOUND TRUE)
+  set( BLKID_LIBRARIES ${BLKID_LIBRARY} )
+else ()
+  set(BLKID_FOUND FALSE)
+  set( BLKID_LIBRARIES )
+endif ()
+
+if (BLKID_FOUND)
+  message(STATUS "Found libblkid: ${BLKID_LIBRARY}")
+else ()
+  message(STATUS "Not Found libblkid: ${BLKID_LIBRARY}")
+  if (BLKID_FIND_REQUIRED)
+    message(STATUS "Looked for libblkid named ${BLKID_NAMES}.")
+    message(FATAL_ERROR "Could NOT find libblkid")
+  endif ()
+endif ()
+
+mark_as_advanced(
+  BLKID_LIBRARY
+  BLKID_INCLUDE_DIR
+)
diff --git a/cmake/modules/Findcds.cmake b/cmake/modules/Findcds.cmake
new file mode 100644
index 0000000000000..b22dc025b9de7
--- /dev/null
+++ b/cmake/modules/Findcds.cmake
@@ -0,0 +1,37 @@
+# - Find cds
+#
+# CDS_INC_DIR - where to find cds/init.h
+# CDS_FOUND - True if found.
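
Findblkid.cmake aborts the configure step when invoked with REQUIRED and libblkid is missing; otherwise BLKID_FOUND can be tested by the caller. A hedged consumer sketch (hypothetical target):

    find_package(blkid REQUIRED)                       # fatal if libblkid is absent
    include_directories(${BLKID_INCLUDE_DIR})
    add_executable(dev_probe dev_probe.c)              # hypothetical target
    target_link_libraries(dev_probe ${BLKID_LIBRARIES})
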
+
+find_path(CDS_INC_DIR cds/init.h NO_DEFAULT_PATH PATHS
+  /usr/include
+  /opt/local/include
+  /usr/local/include
+  /opt/cds
+)
+
+if (CDS_INC_DIR)
+  set(CDS_FOUND TRUE)
+else ()
+  set(CDS_FOUND FALSE)
+endif ()
+
+if (CDS_FOUND)
+  message(STATUS "Found cds: ${CDS_INC_DIR}")
+else ()
+  message(STATUS "Failed to find cds/init.h")
+  if (CDS_FIND_REQUIRED)
+    message(FATAL_ERROR "Missing required cds/init.h")
+  endif ()
+endif ()
+
+find_library(CDS_LIBS
+  NAMES cds
+  PATHS /usr/lib /usr/lib/x86_64-linux-gnu /opt/cds/bin/gcc-amd64-linux-64
+)
+
+mark_as_advanced(
+  CDS_INC_DIR
+  CDS_LIBS
+)
+
diff --git a/cmake/modules/Findcryptopp.cmake b/cmake/modules/Findcryptopp.cmake
new file mode 100644
index 0000000000000..74a01e83ac3da
--- /dev/null
+++ b/cmake/modules/Findcryptopp.cmake
@@ -0,0 +1,108 @@
+# Module for locating the Crypto++ encryption library.
+#
+# Customizable variables:
+# CRYPTOPP_ROOT_DIR
+# This variable points to the CryptoPP root directory. On Windows the
+# library location typically will have to be provided explicitly using the
+# -D command-line option. The directory should include the include/cryptopp,
+# lib and/or bin sub-directories.
+#
+# Read-only variables:
+# CRYPTOPP_FOUND
+# Indicates whether the library has been found.
+#
+# CRYPTOPP_INCLUDE_DIRS
+# Points to the CryptoPP include directory.
+#
+# CRYPTOPP_LIBRARIES
+# Points to the CryptoPP libraries that should be passed to
+# target_link_libraries.
+#
+#
+# Copyright (c) 2012 Sergiu Dotenco
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
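
The body of Findcryptopp.cmake, which follows, reassembles CRYPTOPP_VERSION from the packed integer in cryptopp/config.h (e.g. "#define CRYPTOPP_VERSION 562" becomes 5.6.2), so callers can gate on a minimum release. A sketch; the target name and version floor are illustrative:

    find_package(cryptopp REQUIRED)
    if (CRYPTOPP_VERSION VERSION_LESS 5.6.1)
      message(FATAL_ERROR "Crypto++ >= 5.6.1 required, found ${CRYPTOPP_VERSION}")
    endif ()
    include_directories(${CRYPTOPP_INCLUDE_DIRS})
    target_link_libraries(cryptopp_probe ${CRYPTOPP_LIBRARIES})  # hypothetical target
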
+ +INCLUDE (FindPackageHandleStandardArgs) + +FIND_PATH (CRYPTOPP_ROOT_DIR + NAMES cryptopp/cryptlib.h include/cryptopp/cryptlib.h + PATHS ENV CRYPTOPPROOT + DOC "CryptoPP root directory") + +# Re-use the previous path: +FIND_PATH (CRYPTOPP_INCLUDE_DIR + NAMES cryptopp/cryptlib.h + HINTS ${CRYPTOPP_ROOT_DIR} + PATH_SUFFIXES include + DOC "CryptoPP include directory") + +FIND_LIBRARY (CRYPTOPP_LIBRARY_DEBUG + NAMES cryptlibd cryptoppd + HINTS ${CRYPTOPP_ROOT_DIR} + PATH_SUFFIXES lib + DOC "CryptoPP debug library") + +FIND_LIBRARY (CRYPTOPP_LIBRARY_RELEASE + NAMES cryptlib cryptopp + HINTS ${CRYPTOPP_ROOT_DIR} + PATH_SUFFIXES lib + DOC "CryptoPP release library") + +IF (CRYPTOPP_LIBRARY_DEBUG AND CRYPTOPP_LIBRARY_RELEASE) + SET (CRYPTOPP_LIBRARY + optimized ${CRYPTOPP_LIBRARY_RELEASE} + debug ${CRYPTOPP_LIBRARY_DEBUG} CACHE DOC "CryptoPP library") +ELSEIF (CRYPTOPP_LIBRARY_RELEASE) + SET (CRYPTOPP_LIBRARY ${CRYPTOPP_LIBRARY_RELEASE} CACHE DOC + "CryptoPP library") +ENDIF (CRYPTOPP_LIBRARY_DEBUG AND CRYPTOPP_LIBRARY_RELEASE) + +IF (CRYPTOPP_INCLUDE_DIR) + SET (_CRYPTOPP_VERSION_HEADER ${CRYPTOPP_INCLUDE_DIR}/cryptopp/config.h) + + IF (EXISTS ${_CRYPTOPP_VERSION_HEADER}) + FILE (STRINGS ${_CRYPTOPP_VERSION_HEADER} _CRYPTOPP_VERSION_TMP REGEX + "^#define CRYPTOPP_VERSION[ \t]+[0-9]+$") + + STRING (REGEX REPLACE + "^#define CRYPTOPP_VERSION[ \t]+([0-9]+)" "\\1" _CRYPTOPP_VERSION_TMP + ${_CRYPTOPP_VERSION_TMP}) + + STRING (REGEX REPLACE "([0-9]+)[0-9][0-9]" "\\1" CRYPTOPP_VERSION_MAJOR + ${_CRYPTOPP_VERSION_TMP}) + STRING (REGEX REPLACE "[0-9]([0-9])[0-9]" "\\1" CRYPTOPP_VERSION_MINOR + ${_CRYPTOPP_VERSION_TMP}) + STRING (REGEX REPLACE "[0-9][0-9]([0-9])" "\\1" CRYPTOPP_VERSION_PATCH + ${_CRYPTOPP_VERSION_TMP}) + + SET (CRYPTOPP_VERSION_COUNT 3) + SET (CRYPTOPP_VERSION + ${CRYPTOPP_VERSION_MAJOR}.${CRYPTOPP_VERSION_MINOR}.${CRYPTOPP_VERSION_PATCH}) + ENDIF (EXISTS ${_CRYPTOPP_VERSION_HEADER}) +ENDIF (CRYPTOPP_INCLUDE_DIR) + +SET (CRYPTOPP_INCLUDE_DIRS ${CRYPTOPP_INCLUDE_DIR}) +SET (CRYPTOPP_LIBRARIES ${CRYPTOPP_LIBRARY}) + +MARK_AS_ADVANCED (CRYPTOPP_INCLUDE_DIR CRYPTOPP_LIBRARY CRYPTOPP_LIBRARY_DEBUG + CRYPTOPP_LIBRARY_RELEASE) + +FIND_PACKAGE_HANDLE_STANDARD_ARGS (CryptoPP REQUIRED_VARS CRYPTOPP_ROOT_DIR + CRYPTOPP_INCLUDE_DIR CRYPTOPP_LIBRARY VERSION_VAR CRYPTOPP_VERSION) diff --git a/cmake/modules/Findexpat.cmake b/cmake/modules/Findexpat.cmake new file mode 100644 index 0000000000000..951fb25978c70 --- /dev/null +++ b/cmake/modules/Findexpat.cmake @@ -0,0 +1,29 @@ +# - Find expat +# Find the native EXPAT headers and libraries. +# +# EXPAT_INCLUDE_DIRS - where to find expat.h, etc. +# EXPAT_LIBRARIES - List of libraries when using expat. +# EXPAT_FOUND - True if expat found. + +# Look for the header file. +FIND_PATH(EXPAT_INCLUDE_DIR NAMES expat.h) + +# Look for the library. +FIND_LIBRARY(EXPAT_LIBRARY NAMES expat) + +# handle the QUIETLY and REQUIRED arguments and set EXPAT_FOUND to TRUE if +# all listed variables are TRUE +INCLUDE(FindPackageHandleStandardArgs) +FIND_PACKAGE_HANDLE_STANDARD_ARGS(EXPAT DEFAULT_MSG EXPAT_LIBRARY EXPAT_INCLUDE_DIR) + +# Copy the results to the output variables. 
+IF(EXPAT_FOUND) + SET(EXPAT_LIBRARIES ${EXPAT_LIBRARY}) + SET(EXPAT_INCLUDE_DIRS ${EXPAT_INCLUDE_DIR}) +ELSE(EXPAT_FOUND) + SET(EXPAT_LIBRARIES) + SET(EXPAT_INCLUDE_DIRS) +ENDIF(EXPAT_FOUND) + +MARK_AS_ADVANCED(EXPAT_INCLUDE_DIR EXPAT_LIBRARY) + diff --git a/cmake/modules/Findfcgi.cmake b/cmake/modules/Findfcgi.cmake new file mode 100644 index 0000000000000..0d07dce9374f6 --- /dev/null +++ b/cmake/modules/Findfcgi.cmake @@ -0,0 +1,35 @@ +# CMake module to search for FastCGI headers +# +# If it's found it sets FCGI_FOUND to TRUE +# and following variables are set: +# FCGI_INCLUDE_DIR +# FCGI_LIBRARY +FIND_PATH(FCGI_INCLUDE_DIR + fcgio.h + PATHS + /usr/include + /usr/local/include + /usr/include/fastcgi + "$ENV{LIB_DIR}/include" + $ENV{INCLUDE} + ) +FIND_LIBRARY(FCGI_LIBRARY NAMES fcgi libfcgi PATHS + /usr/local/lib + /usr/lib + "$ENV{LIB_DIR}/lib" + "$ENV{LIB}" + ) + +IF (FCGI_INCLUDE_DIR AND FCGI_LIBRARY) + SET(FCGI_FOUND TRUE) +ENDIF (FCGI_INCLUDE_DIR AND FCGI_LIBRARY) + +IF (FCGI_FOUND) + IF (NOT FCGI_FIND_QUIETLY) + MESSAGE(STATUS "Found FCGI: ${FCGI_LIBRARY}") + ENDIF (NOT FCGI_FIND_QUIETLY) +ELSE (FCGI_FOUND) + IF (FCGI_FIND_REQUIRED) + MESSAGE(FATAL_ERROR "Could not find FCGI") + ENDIF (FCGI_FIND_REQUIRED) +ENDIF (FCGI_FOUND) diff --git a/cmake/modules/Findfio.cmake b/cmake/modules/Findfio.cmake new file mode 100644 index 0000000000000..7a40b77cd69c4 --- /dev/null +++ b/cmake/modules/Findfio.cmake @@ -0,0 +1,29 @@ +# - Find fio +# +# FIO_INCLUDE_DIR - where to find fio.h +# FIO_FOUND - True if found. + +find_path(FIO_INCLUDE_DIR fio.h NO_DEFAULT_PATH PATHS + /usr/include + /opt/local/include + /usr/local/include +) + +if (FIO_INCLUDE_DIR) + set(FIO_FOUND TRUE) +else () + set(FIO_FOUND FALSE) +endif () + +if (FIO_FOUND) + message(STATUS "Found fio: ${FIO_INCLUDE_DIR}") +else () + message(STATUS "Failed to find fio.h") + if (FIO_FIND_REQUIRED) + message(FATAL_ERROR "Missing required fio.h") + endif () +endif () + +mark_as_advanced( + FIO_INCLUDE_DIR +) diff --git a/cmake/modules/Findfuse.cmake b/cmake/modules/Findfuse.cmake new file mode 100644 index 0000000000000..7c1a8789b28e8 --- /dev/null +++ b/cmake/modules/Findfuse.cmake @@ -0,0 +1,167 @@ +# This module can find FUSE Library +# +# Requirements: +# - CMake >= 2.8.3 +# +# The following variables will be defined for your use: +# - FUSE_FOUND : was FUSE found? +# - FUSE_INCLUDE_DIRS : FUSE include directory +# - FUSE_LIBRARIES : FUSE library +# - FUSE_DEFINITIONS : FUSE cflags +# - FUSE_VERSION : complete version of FUSE (major.minor) +# - FUSE_MAJOR_VERSION : major version of FUSE +# - FUSE_MINOR_VERSION : minor version of FUSE +# +# Example Usage: +# +# 1. Copy this file in the root of your project source directory +# 2. Then, tell CMake to search this non-standard module in your project directory by adding to your CMakeLists.txt: +# set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}) +# 3. Finally call find_package() once, here are some examples to pick from +# +# Require FUSE 2.6 or later +# find_package(FUSE 2.6 REQUIRED) +# +# if(FUSE_FOUND) +# add_definitions(${FUSE_DEFINITIONS}) +# include_directories(${FUSE_INCLUDE_DIRS}) +# add_executable(myapp myapp.c) +# target_link_libraries(myapp ${FUSE_LIBRARIES}) +# endif() + +#============================================================================= +# Copyright (c) 2012, julp +# +# Distributed under the OSI-approved BSD License +# +# This software is distributed WITHOUT ANY WARRANTY; without even the +# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
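
Findfcgi.cmake, a few files back, reports the singular FCGI_INCLUDE_DIR/FCGI_LIBRARY pair consumed by the radosgw FastCGI frontend. A usage sketch (hypothetical target):

    find_package(fcgi REQUIRED)
    include_directories(${FCGI_INCLUDE_DIR})
    add_executable(rgw_fcgi_probe rgw_fcgi_probe.cc)   # hypothetical target
    target_link_libraries(rgw_fcgi_probe ${FCGI_LIBRARY})
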
+#=============================================================================
+
+cmake_minimum_required(VERSION 2.8.3)
+
+########## Private ##########
+function(fusedebug _varname)
+    if(FUSE_DEBUG)
+        message("${_varname} = ${${_varname}}")
+    endif(FUSE_DEBUG)
+endfunction(fusedebug)
+
+########## Public ##########
+set(FUSE_FOUND TRUE)
+set(FUSE_LIBRARIES )
+set(FUSE_DEFINITIONS )
+set(FUSE_INCLUDE_DIRS )
+
+find_package(PkgConfig)
+
+set(PC_FUSE_INCLUDE_DIRS )
+set(PC_FUSE_LIBRARY_DIRS )
+if(PKG_CONFIG_FOUND)
+    pkg_check_modules(PC_FUSE "fuse" QUIET)
+    if(PC_FUSE_FOUND)
+# fusedebug(PC_FUSE_LIBRARIES)
+# fusedebug(PC_FUSE_LIBRARY_DIRS)
+# fusedebug(PC_FUSE_LDFLAGS)
+# fusedebug(PC_FUSE_LDFLAGS_OTHER)
+# fusedebug(PC_FUSE_INCLUDE_DIRS)
+# fusedebug(PC_FUSE_CFLAGS)
+# fusedebug(PC_FUSE_CFLAGS_OTHER)
+        set(FUSE_DEFINITIONS "${PC_FUSE_CFLAGS_OTHER}")
+    endif(PC_FUSE_FOUND)
+endif(PKG_CONFIG_FOUND)
+
+find_path(
+    FUSE_INCLUDE_DIRS
+    NAMES fuse.h
+    PATHS "${PC_FUSE_INCLUDE_DIRS}"
+    DOC "Include directories for FUSE"
+)
+
+if(NOT FUSE_INCLUDE_DIRS)
+    set(FUSE_FOUND FALSE)
+endif(NOT FUSE_INCLUDE_DIRS)
+
+find_library(
+    FUSE_LIBRARIES
+    NAMES "fuse"
+    PATHS "${PC_FUSE_LIBRARY_DIRS}"
+    DOC "Libraries for FUSE"
+)
+
+if(NOT FUSE_LIBRARIES)
+    set(FUSE_FOUND FALSE)
+endif(NOT FUSE_LIBRARIES)
+
+if(FUSE_FOUND)
+    if(EXISTS "${FUSE_INCLUDE_DIRS}/fuse/fuse_common.h")
+        file(READ "${FUSE_INCLUDE_DIRS}/fuse/fuse_common.h" _contents)
+        string(REGEX REPLACE ".*# *define *FUSE_MAJOR_VERSION *([0-9]+).*" "\\1" FUSE_MAJOR_VERSION "${_contents}")
+        string(REGEX REPLACE ".*# *define *FUSE_MINOR_VERSION *([0-9]+).*" "\\1" FUSE_MINOR_VERSION "${_contents}")
+        set(FUSE_VERSION "${FUSE_MAJOR_VERSION}.${FUSE_MINOR_VERSION}")
+    endif()
+
+    include(CheckCSourceCompiles)
+    # Backup CMAKE_REQUIRED_*
+    set(OLD_CMAKE_REQUIRED_INCLUDES "${CMAKE_REQUIRED_INCLUDES}")
+    set(OLD_CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES}")
+    set(OLD_CMAKE_REQUIRED_DEFINITIONS "${CMAKE_REQUIRED_DEFINITIONS}")
+    # Add FUSE compilation flags
+    set(CMAKE_REQUIRED_INCLUDES "${CMAKE_REQUIRED_INCLUDES}" "${FUSE_INCLUDE_DIRS}")
+    set(CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES}" "${FUSE_LIBRARIES}")
+    set(CMAKE_REQUIRED_DEFINITIONS "${CMAKE_REQUIRED_DEFINITIONS}" "${FUSE_DEFINITIONS}")
+    check_c_source_compiles("#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stddef.h>
+#include <unistd.h>
+#include <fuse.h>
+
+int main(void) {
+return 0;
+}" FUSE_CFLAGS_CHECK)
+    if(NOT FUSE_CFLAGS_CHECK)
+        set(FUSE_DEFINITIONS "-D_FILE_OFFSET_BITS=64")
+        # Should we run again previous test to assume the failure was due to missing definition -D_FILE_OFFSET_BITS=64?
+ endif(NOT FUSE_CFLAGS_CHECK) + # Restore CMAKE_REQUIRED_* + set(CMAKE_REQUIRED_INCLUDES "${OLD_CMAKE_REQUIRED_INCLUDES}") + set(CMAKE_REQUIRED_LIBRARIES "${OLD_CMAKE_REQUIRED_LIBRARIES}") + set(CMAKE_REQUIRED_DEFINITIONS "${OLD_CMAKE_REQUIRED_DEFINITIONS}") +endif(FUSE_FOUND) + +if(FUSE_INCLUDE_DIRS) + include(FindPackageHandleStandardArgs) + if(FUSE_FIND_REQUIRED AND NOT FUSE_FIND_QUIETLY) + find_package_handle_standard_args(FUSE REQUIRED_VARS FUSE_LIBRARIES FUSE_INCLUDE_DIRS VERSION_VAR FUSE_VERSION) + else() + find_package_handle_standard_args(FUSE "FUSE not found" FUSE_LIBRARIES FUSE_INCLUDE_DIRS) + endif() +else(FUSE_INCLUDE_DIRS) + if(FUSE_FIND_REQUIRED AND NOT FUSE_FIND_QUIETLY) + message(FATAL_ERROR "Could not find FUSE include directory") + endif() +endif(FUSE_INCLUDE_DIRS) + +mark_as_advanced( + FUSE_INCLUDE_DIRS + FUSE_LIBRARIES +) + +# IN (args) +fusedebug("FUSE_FIND_COMPONENTS") +fusedebug("FUSE_FIND_REQUIRED") +fusedebug("FUSE_FIND_QUIETLY") +fusedebug("FUSE_FIND_VERSION") +# OUT +# Found +fusedebug("FUSE_FOUND") +# Definitions +fusedebug("FUSE_DEFINITIONS") +# Linking +fusedebug("FUSE_INCLUDE_DIRS") +fusedebug("FUSE_LIBRARIES") +# Version +fusedebug("FUSE_MAJOR_VERSION") +fusedebug("FUSE_MINOR_VERSION") +fusedebug("FUSE_VERSION") diff --git a/cmake/modules/Findleveldb.cmake b/cmake/modules/Findleveldb.cmake new file mode 100644 index 0000000000000..8a3130241efe9 --- /dev/null +++ b/cmake/modules/Findleveldb.cmake @@ -0,0 +1,37 @@ +# - Find LevelDB +# +# LEVELDB_INCLUDE - Where to find leveldb/db.h +# LEVELDB_LIBS - List of libraries when using LevelDB. +# LEVELDB_FOUND - True if LevelDB found. + +get_filename_component(module_file_path ${CMAKE_CURRENT_LIST_FILE} PATH) + +# Look for the header file. +find_path(LEVELDB_INCLUDE NAMES leveldb/db.h PATHS $ENV{LEVELDB_ROOT}/include /opt/local/include /usr/local/include /usr/include DOC "Path in which the file leveldb/db.h is located." ) +mark_as_advanced(LEVELDB_INCLUDE) + +# Look for the library. +# Does this work on UNIX systems? (LINUX) +find_library(LEVELDB_LIBS NAMES leveldb PATHS /usr/lib $ENV{LEVELDB_ROOT}/lib DOC "Path to leveldb library." ) +mark_as_advanced(LEVELDB_LIBS) + +# Copy the results to the output variables. +if (LEVELDB_INCLUDE AND LEVELDB_LIBS) + message(STATUS "Found leveldb in ${LEVELDB_INCLUDE} ${LEVELDB_LIBS}") + set(LEVELDB_FOUND 1) + include(CheckCXXSourceCompiles) + set(CMAKE_REQUIRED_LIBRARY ${LEVELDB_LIBS} pthread) + set(CMAKE_REQUIRED_INCLUDES ${LEVELDB_INCLUDE}) + else () + set(LEVELDB_FOUND 0) + endif () + + # Report the results. + if (NOT LEVELDB_FOUND) + set(LEVELDB_DIR_MESSAGE "LEVELDB was not found. 
Make sure LEVELDB_LIBS and LEVELDB_INCLUDE are set.") + if (LEVELDB_FIND_REQUIRED) + message(FATAL_ERROR "${LEVELDB_DIR_MESSAGE}") + elseif (NOT LEVELDB_FIND_QUIETLY) + message(STATUS "${LEVELDB_DIR_MESSAGE}") + endif () + endif () diff --git a/cmake/modules/Findprofiler.cmake b/cmake/modules/Findprofiler.cmake new file mode 100644 index 0000000000000..ca54227f56bce --- /dev/null +++ b/cmake/modules/Findprofiler.cmake @@ -0,0 +1,34 @@ +# Try to find Profiler +# Once done, this will define +# +# PROFILER_FOUND - system has Profiler +# PROFILER_INCLUDE_DIR - the Profiler include directories +# PROFILER_LIBRARIES - link these to use Profiler + +if(PROFILER_INCLUDE_DIR AND PROFILER_LIBRARIES) +set(PROFILER_FIND_QUIETLY TRUE) +endif(PROFILER_INCLUDE_DIR AND PROFILER_LIBRARIES) + +INCLUDE(CheckCXXSymbolExists) + +# include dir + +find_path(PROFILER_INCLUDE_DIR profiler.h NO_DEFAULT_PATH PATHS + ${HT_DEPENDENCY_INCLUDE_DIR} + /usr/include + /usr/include/google + /opt/local/include + /usr/local/include +) + + +# finally the library itself +find_library(LIBPROFILER NAMES profiler) +set(PROFILER_LIBRARIES ${LIBPROFILER}) + +# handle the QUIETLY and REQUIRED arguments and set PROFILER_FOUND to TRUE if +# all listed variables are TRUE +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(PROFILER DEFAULT_MSG PROFILER_LIBRARIES PROFILER_INCLUDE_DIR) + +mark_as_advanced(PROFILER_LIBRARIES PROFILER_INCLUDE_DIR) diff --git a/cmake/modules/Findsnappy.cmake b/cmake/modules/Findsnappy.cmake new file mode 100644 index 0000000000000..0ce92a8172dab --- /dev/null +++ b/cmake/modules/Findsnappy.cmake @@ -0,0 +1,62 @@ +# Copyright (C) 2007-2012 Hypertable, Inc. +# +# This file is part of Hypertable. +# +# Hypertable is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 3 +# of the License, or any later version. +# +# Hypertable is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Hypertable. If not, see +# + +# - Find Snappy +# Find the snappy compression library and includes +# +# SNAPPY_INCLUDE_DIR - where to find snappy.h, etc. +# SNAPPY_LIBRARIES - List of libraries when using snappy. +# SNAPPY_FOUND - True if snappy found. 
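
Findleveldb.cmake above reports LEVELDB_INCLUDE and LEVELDB_LIBS rather than the conventional *_INCLUDE_DIRS/*_LIBRARIES spellings, and leveldb is normally linked together with snappy and pthread, matching the configure.ac checks later in this patch. A sketch (hypothetical target):

    find_package(leveldb REQUIRED)
    find_package(snappy REQUIRED)
    include_directories(${LEVELDB_INCLUDE} ${SNAPPY_INCLUDE_DIR})
    add_executable(kv_probe kv_probe.cc)               # hypothetical target
    target_link_libraries(kv_probe ${LEVELDB_LIBS} ${SNAPPY_LIBRARIES} pthread)
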
+
+find_path(SNAPPY_INCLUDE_DIR snappy.h NO_DEFAULT_PATH PATHS
+  ${HT_DEPENDENCY_INCLUDE_DIR}
+  /usr/include
+  /opt/local/include
+  /usr/local/include
+)
+
+set(SNAPPY_NAMES ${SNAPPY_NAMES} snappy)
+find_library(SNAPPY_LIBRARY NAMES ${SNAPPY_NAMES} NO_DEFAULT_PATH PATHS
+  ${HT_DEPENDENCY_LIB_DIR}
+  /usr/local/lib
+  /opt/local/lib
+  /usr/lib
+  )
+
+if (SNAPPY_INCLUDE_DIR AND SNAPPY_LIBRARY)
+  set(SNAPPY_FOUND TRUE)
+  set( SNAPPY_LIBRARIES ${SNAPPY_LIBRARY} )
+else ()
+  set(SNAPPY_FOUND FALSE)
+  set( SNAPPY_LIBRARIES )
+endif ()
+
+if (SNAPPY_FOUND)
+  message(STATUS "Found Snappy: ${SNAPPY_LIBRARY}")
+else ()
+  message(STATUS "Not Found Snappy: ${SNAPPY_LIBRARY}")
+  if (SNAPPY_FIND_REQUIRED)
+    message(STATUS "Looked for Snappy libraries named ${SNAPPY_NAMES}.")
+    message(FATAL_ERROR "Could NOT find Snappy library")
+  endif ()
+endif ()
+
+mark_as_advanced(
+  SNAPPY_LIBRARY
+  SNAPPY_INCLUDE_DIR
+)
diff --git a/cmake/modules/Findtcmalloc.cmake b/cmake/modules/Findtcmalloc.cmake
new file mode 100644
index 0000000000000..d9b3a456f5c31
--- /dev/null
+++ b/cmake/modules/Findtcmalloc.cmake
@@ -0,0 +1,47 @@
+# - Find Tcmalloc
+# Find the native Tcmalloc includes and library
+#
+# Tcmalloc_INCLUDE_DIR - where to find Tcmalloc.h, etc.
+# Tcmalloc_LIBRARIES - List of libraries when using Tcmalloc.
+# Tcmalloc_FOUND - True if Tcmalloc found.
+
+find_path(Tcmalloc_INCLUDE_DIR google/tcmalloc.h NO_DEFAULT_PATH PATHS
+  ${HT_DEPENDENCY_INCLUDE_DIR}
+  /usr/include
+  /opt/local/include
+  /usr/local/include
+)
+
+if (USE_TCMALLOC)
+  set(Tcmalloc_NAMES tcmalloc)
+else ()
+  set(Tcmalloc_NAMES tcmalloc_minimal tcmalloc)
+endif ()
+
+find_library(Tcmalloc_LIBRARY NO_DEFAULT_PATH
+  NAMES ${Tcmalloc_NAMES}
+  PATHS ${HT_DEPENDENCY_LIB_DIR} /lib /usr/lib /usr/local/lib /opt/local/lib
+)
+
+if (Tcmalloc_INCLUDE_DIR AND Tcmalloc_LIBRARY)
+  set(Tcmalloc_FOUND TRUE)
+  set( Tcmalloc_LIBRARIES ${Tcmalloc_LIBRARY} )
+else ()
+  set(Tcmalloc_FOUND FALSE)
+  set( Tcmalloc_LIBRARIES )
+endif ()
+
+if (Tcmalloc_FOUND)
+  message(STATUS "Found Tcmalloc: ${Tcmalloc_LIBRARY}")
+else ()
+  message(STATUS "Not Found Tcmalloc: ${Tcmalloc_LIBRARY}")
+  if (Tcmalloc_FIND_REQUIRED)
+    message(STATUS "Looked for Tcmalloc libraries named ${Tcmalloc_NAMES}.")
+    message(FATAL_ERROR "Could NOT find Tcmalloc library")
+  endif ()
+endif ()
+
+mark_as_advanced(
+  Tcmalloc_LIBRARY
+  Tcmalloc_INCLUDE_DIR
+  )
diff --git a/cmake/modules/Findudev.cmake b/cmake/modules/Findudev.cmake
new file mode 100644
index 0000000000000..0e95f15804eb9
--- /dev/null
+++ b/cmake/modules/Findudev.cmake
@@ -0,0 +1,52 @@
+# Copyright (C) 2007-2012 Hypertable, Inc.
+#
+# This file is part of Hypertable.
+#
+# Hypertable is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 3
+# of the License, or any later version.
+#
+# Hypertable is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Hypertable. If not, see <http://www.gnu.org/licenses/>
+#
+
+# - Find libudev
+# Find the udev library and includes
+#
+# UDEV_INCLUDE_DIR - where to find libudev.h, etc.
+# UDEV_LIBRARIES - List of libraries when using udev.
+# UDEV_FOUND - True if udev found.
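
Findtcmalloc.cmake keys its candidate names off USE_TCMALLOC: when set, only the full tcmalloc is accepted; when unset, tcmalloc_minimal is preferred with tcmalloc as the fallback. Driving that knob looks roughly like this (hypothetical target):

    set(USE_TCMALLOC ON)                               # insist on the full allocator
    find_package(tcmalloc REQUIRED)
    target_link_libraries(osd_probe ${Tcmalloc_LIBRARIES})  # hypothetical target
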
+
+find_path(UDEV_INCLUDE_DIR libudev.h)
+
+set(UDEV_NAMES ${UDEV_NAMES} udev)
+find_library(UDEV_LIBRARY NAMES ${UDEV_NAMES})
+
+if(UDEV_INCLUDE_DIR AND UDEV_LIBRARY)
+  set(UDEV_FOUND TRUE)
+  set(UDEV_LIBRARIES ${UDEV_LIBRARY})
+else()
+  set(UDEV_FOUND FALSE)
+  set(UDEV_LIBRARIES)
+endif()
+
+if(UDEV_FOUND)
+  message(STATUS "Found libudev: ${UDEV_LIBRARY}")
+else()
+  message(STATUS "Not Found libudev: ${UDEV_LIBRARY}")
+  if(UDEV_FIND_REQUIRED)
+    message(STATUS "Looked for libudev named ${UDEV_NAMES}.")
+    message(FATAL_ERROR "Could NOT find libudev")
+  endif()
+endif()
+
+mark_as_advanced(
+  UDEV_LIBRARY
+  UDEV_INCLUDE_DIR
+)
diff --git a/cmake/modules/Findxio.cmake b/cmake/modules/Findxio.cmake
new file mode 100644
index 0000000000000..11fd892d8b3d4
--- /dev/null
+++ b/cmake/modules/Findxio.cmake
@@ -0,0 +1,43 @@
+# - Find libxio
+# Find libxio transport library
+#
+# Xio_INCLUDE_DIR - libxio include dir
+# Xio_LIBRARIES - List of libraries
+# Xio_FOUND - True if libxio found.
+
+set(_xio_include_path ${HT_DEPENDENCY_INCLUDE_DIR})
+set(_xio_lib_path ${HT_DEPENDENCY_LIB_DIR})
+if (EXISTS ${WITH_XIO})
+  list(APPEND _xio_include_path "${WITH_XIO}/include")
+  list(APPEND _xio_lib_path "${WITH_XIO}/lib")
+else()
+  list(APPEND _xio_include_path /usr/include /usr/local/include /opt/accelio/include)
+  list(APPEND _xio_lib_path /lib /usr/lib /usr/local/lib /opt/accelio/lib)
+endif()
+
+find_path(Xio_INCLUDE_DIR libxio.h NO_DEFAULT_PATH PATHS ${_xio_include_path})
+
+find_library(Xio_LIBRARY NO_DEFAULT_PATH NAMES xio PATHS ${_xio_lib_path})
+
+if (Xio_INCLUDE_DIR AND Xio_LIBRARY)
+  set(Xio_FOUND TRUE)
+  set(Xio_LIBRARIES ${Xio_LIBRARY} )
+else ()
+  set(Xio_FOUND FALSE)
+  set(Xio_LIBRARIES )
+endif ()
+
+if (Xio_FOUND)
+  message(STATUS "Found Xio: ${Xio_INCLUDE_DIR} ${Xio_LIBRARY}")
+else ()
+  message(STATUS "Not Found Xio: ${Xio_INCLUDE_DIR} ${Xio_LIBRARY}")
+  if (Xio_FIND_REQUIRED)
+    message(STATUS "Looked for Xio libraries named ${Xio_NAMES}.")
+    message(FATAL_ERROR "Could NOT find Xio library")
+  endif ()
+endif ()
+
+mark_as_advanced(
+  Xio_LIBRARY
+  Xio_INCLUDE_DIR
+  )
diff --git a/cmake/modules/GetGitRevisionDescription.cmake b/cmake/modules/GetGitRevisionDescription.cmake
new file mode 100644
index 0000000000000..c8d27f2e8ce49
--- /dev/null
+++ b/cmake/modules/GetGitRevisionDescription.cmake
@@ -0,0 +1,130 @@
+# - Returns a version string from Git
+#
+# These functions force a re-configure on each git commit so that you can
+# trust the values of the variables in your build system.
+#
+# get_git_head_revision(<refspecvar> <hashvar> [<additional arguments to git describe> ...])
+#
+# Returns the refspec and sha hash of the current head revision
+#
+# git_describe(<var> [<additional arguments to git describe> ...])
+#
+# Returns the results of git describe on the source tree, and adjusting
+# the output so that it tests false if an error occurs.
+#
+# git_get_exact_tag(<var> [<additional arguments to git describe> ...])
+#
+# Returns the results of git describe --exact-match on the source tree,
+# and adjusting the output so that it tests false if there was no exact
+# matching tag.
+#
+# Requires CMake 2.6 or newer (uses the 'function' command)
+#
+# Original Author:
+# 2009-2010 Ryan Pavlik <rpavlik@iastate.edu> <abiryan@ryand.net>
+# http://academic.cleardefinition.com
+# Iowa State University HCI Graduate Program/VRAC
+#
+# Copyright Iowa State University 2009-2010.
+# Distributed under the Boost Software License, Version 1.0.
+# (See accompanying file LICENSE_1_0.txt or copy at +# http://www.boost.org/LICENSE_1_0.txt) + +if(__get_git_revision_description) + return() +endif() +set(__get_git_revision_description YES) + +# We must run the following at "include" time, not at function call time, +# to find the path to this module rather than the path to a calling list file +get_filename_component(_gitdescmoddir ${CMAKE_CURRENT_LIST_FILE} PATH) + +function(get_git_head_revision _refspecvar _hashvar) + set(GIT_PARENT_DIR "${CMAKE_CURRENT_SOURCE_DIR}") + set(GIT_DIR "${GIT_PARENT_DIR}/.git") + while(NOT EXISTS "${GIT_DIR}") # .git dir not found, search parent directories + set(GIT_PREVIOUS_PARENT "${GIT_PARENT_DIR}") + get_filename_component(GIT_PARENT_DIR ${GIT_PARENT_DIR} PATH) + if(GIT_PARENT_DIR STREQUAL GIT_PREVIOUS_PARENT) + # We have reached the root directory, we are not in git + set(${_refspecvar} "GITDIR-NOTFOUND" PARENT_SCOPE) + set(${_hashvar} "GITDIR-NOTFOUND" PARENT_SCOPE) + return() + endif() + set(GIT_DIR "${GIT_PARENT_DIR}/.git") + endwhile() + # check if this is a submodule + if(NOT IS_DIRECTORY ${GIT_DIR}) + file(READ ${GIT_DIR} submodule) + string(REGEX REPLACE "gitdir: (.*)\n$" "\\1" GIT_DIR_RELATIVE ${submodule}) + get_filename_component(SUBMODULE_DIR ${GIT_DIR} PATH) + get_filename_component(GIT_DIR ${SUBMODULE_DIR}/${GIT_DIR_RELATIVE} ABSOLUTE) + endif() + set(GIT_DATA "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/git-data") + if(NOT EXISTS "${GIT_DATA}") + file(MAKE_DIRECTORY "${GIT_DATA}") + endif() + + if(NOT EXISTS "${GIT_DIR}/HEAD") + return() + endif() + set(HEAD_FILE "${GIT_DATA}/HEAD") + configure_file("${GIT_DIR}/HEAD" "${HEAD_FILE}" COPYONLY) + + configure_file("${_gitdescmoddir}/GetGitRevisionDescription.cmake.in" + "${GIT_DATA}/grabRef.cmake" + @ONLY) + include("${GIT_DATA}/grabRef.cmake") + + set(${_refspecvar} "${HEAD_REF}" PARENT_SCOPE) + set(${_hashvar} "${HEAD_HASH}" PARENT_SCOPE) +endfunction() + +function(git_describe _var) + if(NOT GIT_FOUND) + find_package(Git QUIET) + endif() + get_git_head_revision(refspec hash) + if(NOT GIT_FOUND) + set(${_var} "GIT-NOTFOUND" PARENT_SCOPE) + return() + endif() + if(NOT hash) + set(${_var} "HEAD-HASH-NOTFOUND" PARENT_SCOPE) + return() + endif() + + # TODO sanitize + #if((${ARGN}" MATCHES "&&") OR + # (ARGN MATCHES "||") OR + # (ARGN MATCHES "\\;")) + # message("Please report the following error to the project!") + # message(FATAL_ERROR "Looks like someone's doing something nefarious with git_describe! 
Passed arguments ${ARGN}") + #endif() + + #message(STATUS "Arguments to execute_process: ${ARGN}") + + execute_process(COMMAND + "${GIT_EXECUTABLE}" + describe + ${hash} + ${ARGN} + WORKING_DIRECTORY + "${CMAKE_SOURCE_DIR}" + RESULT_VARIABLE + res + OUTPUT_VARIABLE + out + ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE) + if(NOT res EQUAL 0) + set(out "${out}-${res}-NOTFOUND") + endif() + + set(${_var} "${out}" PARENT_SCOPE) +endfunction() + +function(git_get_exact_tag _var) + git_describe(out --exact-match ${ARGN}) + set(${_var} "${out}" PARENT_SCOPE) +endfunction() diff --git a/cmake/modules/GetGitRevisionDescription.cmake.in b/cmake/modules/GetGitRevisionDescription.cmake.in new file mode 100644 index 0000000000000..6faa374a8d9e6 --- /dev/null +++ b/cmake/modules/GetGitRevisionDescription.cmake.in @@ -0,0 +1,38 @@ +# +# Internal file for GetGitRevisionDescription.cmake +# +# Requires CMake 2.6 or newer (uses the 'function' command) +# +# Original Author: +# 2009-2010 Ryan Pavlik +# http://academic.cleardefinition.com +# Iowa State University HCI Graduate Program/VRAC +# +# Copyright Iowa State University 2009-2010. +# Distributed under the Boost Software License, Version 1.0. +# (See accompanying file LICENSE_1_0.txt or copy at +# http://www.boost.org/LICENSE_1_0.txt) + +set(HEAD_HASH) + +file(READ "@HEAD_FILE@" HEAD_CONTENTS LIMIT 1024) + +string(STRIP "${HEAD_CONTENTS}" HEAD_CONTENTS) +if(HEAD_CONTENTS MATCHES "ref") + # named branch + string(REPLACE "ref: " "" HEAD_REF "${HEAD_CONTENTS}") + if(EXISTS "@GIT_DIR@/${HEAD_REF}") + configure_file("@GIT_DIR@/${HEAD_REF}" "@GIT_DATA@/head-ref" COPYONLY) + elseif(EXISTS "@GIT_DIR@/logs/${HEAD_REF}") + configure_file("@GIT_DIR@/logs/${HEAD_REF}" "@GIT_DATA@/head-ref" COPYONLY) + set(HEAD_HASH "${HEAD_REF}") + endif() +else() + # detached HEAD + configure_file("@GIT_DIR@/HEAD" "@GIT_DATA@/head-ref" COPYONLY) +endif() + +if(NOT HEAD_HASH) + file(READ "@GIT_DATA@/head-ref" HEAD_HASH LIMIT 1024) + string(STRIP "${HEAD_HASH}" HEAD_HASH) +endif() diff --git a/configure.ac b/configure.ac index 7d48d2203427f..ece3047f32e8b 100644 --- a/configure.ac +++ b/configure.ac @@ -8,7 +8,9 @@ AC_PREREQ(2.59) # VERSION define is not used by the code. It gets a version string # from 'git describe'; see src/ceph_ver.[ch] -AC_INIT([ceph], [0.89], [ceph-devel@vger.kernel.org]) +AC_INIT([ceph], [9.0.2], [ceph-devel@vger.kernel.org]) + +AX_CXX_COMPILE_STDCXX_11(, mandatory) # Create release string. Used with VERSION for RPMs. RPM_RELEASE=0 @@ -21,9 +23,20 @@ if test -d ".git" ; then fi AC_MSG_NOTICE([RPM_RELEASE='$RPM_RELEASE']) +AC_ARG_WITH([man-pages], + [AS_HELP_STRING([--with-man-pages], [build man pages])], + [], + [with_man_pages=check]) +AS_IF([test "x$with_man_pages" != "xno"], + [AC_CHECK_PROGS(SPHINX_BUILD, sphinx-1.0-build sphinx-build) + AS_IF([test -z "$SPHINX_BUILD" && \ + test "x$with_man_pages" = "xyes"], + [AC_MSG_ERROR([sphinx-build not found (python-sphinx)])])]) +AM_CONDITIONAL(WITH_MAN_PAGES, test -n "$SPHINX_BUILD") + AC_CONFIG_MACRO_DIR([m4]) -AC_CONFIG_SUBDIRS([src/gtest]) +AC_CONFIG_SUBDIRS([src/gmock]) # Environment AC_CANONICAL_HOST @@ -39,8 +52,8 @@ AM_PROG_LIBTOOL AM_PROG_AS +AM_INIT_AUTOMAKE([foreign parallel-tests]) # enable make V=0 (if automake >1.11) -AM_INIT_AUTOMAKE([foreign]) m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])]) # Platform @@ -86,36 +99,142 @@ AC_PROG_LIBTOOL AC_SUBST(AM_CXXFLAGS) AM_CXXFLAGS="${AM_CXXFLAGS}" +# Find out what to build (default is most of these) + +# rados? 
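
GetGitRevisionDescription.cmake, completed above, is the CMake-side counterpart of the 'git describe' version string that configure.ac refers to. Following its documented contract, a consumer looks roughly like this; the preprocessor macro name is hypothetical:

    include(GetGitRevisionDescription)
    get_git_head_revision(GIT_REFSPEC GIT_SHA1)
    git_describe(GIT_DESCRIPTION --tags)               # tests false if git describe fails
    if (GIT_SHA1)
      add_definitions(-DGIT_SHA1="${GIT_SHA1}")        # hypothetical macro name
    endif ()
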
+AC_ARG_WITH([rados], + [AS_HELP_STRING([--with-rados], [build with librados support])], + [], + [with_rados=yes]) +AM_CONDITIONAL(WITH_RADOS, test "$with_rados" = "yes") +#AS_IF([test "$with_rados" = "yes"], [AC_DEFINE([WITH_RADOS])]) + +# rbd? +# rbd requires rados +AC_ARG_WITH([rbd], + [AS_HELP_STRING([--with-rbd], [build rbd files])], + [], + [with_rbd=yes]) +AM_CONDITIONAL(WITH_RBD, test "$with_rbd" = "yes") +#AS_IF([test "$with_rbd" = "yes"], [AC_DEFINE([WITH_RADOS, WITH_RBD])]) + +# cephfs? +# cephfs requires rados +AC_ARG_WITH([cephfs], + [AS_HELP_STRING([--with-cephfs], [build cephfs files])], + [], + [with_cephfs=yes]) +AM_CONDITIONAL(WITH_CEPHFS, test "$with_cephfs" = "yes") +#AS_IF([test "$with_cephfs" = "yes"], [AC_DEFINE([WITH_RADOS, WITH_CEPHFS])]) + +# radosgw? +# radosgw requires rados +AC_ARG_WITH([radosgw], + [AS_HELP_STRING([--with-radosgw], [build RADOS gateway])], + [], + [with_radosgw=check]) +# AM_CONDITIONAL is defined later -- we need to check whether we can enable radosgw if no option is present +#AS_IF([test "$with_radosgw" = "yes"], [AC_DEFINE([WITH_RADOS, WITH_RADOSGW])]) + +AC_ARG_WITH([selinux], + [AS_HELP_STRING([--with-selinux], [build SELinux policy])], + [], + [with_selinux=no]) +AM_CONDITIONAL(WITH_SELINUX, test "$with_selinux" = "yes") +if test "x$with_selinux" = x"yes"; then + AC_CHECK_FILE([/usr/share/selinux/devel/policyhelp], [true], [AC_MSG_FAILURE([No SELinux found])]) + AC_CHECK_FILE([/usr/share/selinux/devel/include/Makefile], [true], [AC_MSG_FAILURE([No SELinux Makefile found])]) +fi + +# radosstriper? +AC_ARG_WITH([radosstriper], + [AS_HELP_STRING([--with-radosstriper], [build radosstriper files])], + [], + [with_radosstriper=yes]) +AM_CONDITIONAL(WITH_RADOSSTRIPER, test "$with_radosstriper" = "yes") +#AS_IF([test "$with_radostriper" = "yes"], [AC_DEFINE([WITH_RADOS, WITH_RADOSSTRIPER])]) + +# mon? +AC_ARG_WITH([mon], + [AS_HELP_STRING([--with-mon], [build Ceph monitor software files])], + [], + [with_mon=yes]) +AM_CONDITIONAL(WITH_MON, test "$with_mon" = "yes") +#AS_IF([test "$with_mon" = "yes"], [AC_DEFINE([WITH_MON])]) + +# osd? +AC_ARG_WITH([osd], + [AS_HELP_STRING([--with-osd], [build object store daemon files])], + [], + [with_osd=yes]) +AM_CONDITIONAL(WITH_OSD, test "$with_osd" = "yes") +#AS_IF([test "$with_osd" = "yes"], [AC_DEFINE([WITH_OSD])]) + +# mds? +AC_ARG_WITH([mds], + [AS_HELP_STRING([--with-mds], [build mds files])], + [], + [with_mds=yes]) +AM_CONDITIONAL(WITH_MDS, test "$with_mds" = "yes") +#AS_IF([test "$with_mds" = "yes"], [AC_DEFINE([WITH_MDS])]) + +# client? +AC_ARG_ENABLE([client], + [AS_HELP_STRING([--enable-client], [enable client-side build])], + [], + [enable_client=yes]) +AM_CONDITIONAL([ENABLE_CLIENT], test "$enable_client" = "yes") +#AS_IF([test "$enable_client" = "yes"], [AC_DEFINE([WITH_RADOS, WITH_RBD, ENABLE_CLIENT])]) + +# server? 
+AC_ARG_ENABLE([server], + [AS_HELP_STRING([--enable-server], [enable server-side build ])], + [], + [enable_server=yes]) +AM_CONDITIONAL(ENABLE_SERVER, test "$enable_server" = "yes") +#AS_IF([test "$enable_server" = "yes"], [AC_DEFINE([WITH_MON, WITH_OSD, WITH_MDS, ENABLE_SERVER])]) + +# cond-check if snappy-devel is installed, needed by leveldb that is need by server parts of the project +AS_IF([test "$enable_server" = "yes" -a \( "$with_osd" = "yes" -o "$with_mon" = "yes" \)], + [AC_CHECK_LIB([snappy], [snappy_compress], [true], [AC_MSG_FAILURE([libsnappy not found])])]) + +# cond-check leveldb, necessary if server, osd or mon enabled +AS_IF([test "$enable_server" = "yes" -a \( "$with_osd" = "yes" -o "$with_mon" = "yes" \)], + [AC_CHECK_LIB([leveldb], [leveldb_open], [true], [AC_MSG_FAILURE([libleveldb not found])], [-lsnappy -lpthread])]) + # Check for yasm -if yasm -f elf64 src/common/crc32c_intel_fast_asm.S -o /dev/null; then - echo 'we have a modern and working yasm' - if test `arch` = "x86_64" ; then - echo 'we are x86_64' - arch_x32=0 - AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [[ - #if defined(__x86_64__) && defined(__ILP32__) - #error x32 - #endif]])], [], [arch_x32=1]) - if test $arch_x32 -eq 0 ; then - echo 'we are not x32' - AC_DEFINE([HAVE_GOOD_YASM_ELF64], [1], [we have a recent yasm and are x86_64]) - with_good_yasm=yes - - if yasm -f elf64 -i src/ceph/src/ceph/src/erasure-code/isa/isa-l/include/ src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_avx2.asm.s -o /dev/null 2> /dev/null ; then - echo 'yasm can also build the isa-l stuff' - AC_DEFINE([HAVE_BETTER_YASM_ELF64], [1], [yasm can also build the isa-l]) - with_better_yasm=yes - else - echo "yasm doesn't build the isa-l stuff" - fi - else - echo 'we are x32; no yasm for you' - fi - else - echo 'we are not x86_64 && !x32' - fi -else - echo 'we do not have a modern/working yasm' +AC_CHECK_PROG(YASM_CHECK, yasm, yes) +if test x"$YASM_CHECK" = x"yes"; then + if yasm -f elf64 src/common/crc32c_intel_fast_asm.S -o /dev/null; then + echo 'we have a modern and working yasm' + if test `arch` = "x86_64" ; then + echo 'we are x86_64' + arch_x32=0 + AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [[ + #if defined(__x86_64__) && defined(__ILP32__) + #error x32 + #endif]])], [], [arch_x32=1]) + if test $arch_x32 -eq 0 ; then + echo 'we are not x32' + AC_DEFINE([HAVE_GOOD_YASM_ELF64], [1], [we have a recent yasm and are x86_64]) + with_good_yasm=yes + + if yasm -f elf64 -i src/ceph/src/ceph/src/erasure-code/isa/isa-l/include/ src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_avx2.asm.s -o /dev/null 2> /dev/null ; then + echo 'yasm can also build the isa-l stuff' + AC_DEFINE([HAVE_BETTER_YASM_ELF64], [1], [yasm can also build the isa-l]) + with_better_yasm=yes + else + echo "yasm doesn't build the isa-l stuff" + fi + else + echo 'we are x32; no yasm for you' + fi + else + echo 'we are not x86_64 && !x32' + fi + else + echo 'we do not have a modern/working yasm' + fi fi AM_CONDITIONAL(WITH_GOOD_YASM_ELF64, test "$with_good_yasm" = "yes") AM_CONDITIONAL(WITH_BETTER_YASM_ELF64, test "$with_better_yasm" = "yes") @@ -142,6 +261,7 @@ AC_DEFUN([AC_CHECK_CC_FLAG], AC_CHECK_CC_FLAG([-Wtype-limits], [WARN_TYPE_LIMITS]) AC_CHECK_CC_FLAG([-Wignored-qualifiers], [WARN_IGNORED_QUALIFIERS]) +AC_CHECK_CC_FLAG([-Werror=format-security], [WARN_ERROR_FORMAT_SECURITY]) # Check for compiler VTA support AX_CHECK_COMPILE_FLAG([-fvar-tracking-assignments], [HAS_VTA_SUPPORT=1], [HAS_VTA_SUPPORT=0]) @@ -156,19 +276,36 @@ ACX_PTHREAD AC_CHECK_LIB([uuid], 
[uuid_parse], [true], AC_MSG_FAILURE([libuuid not found])) -# rbd {map,unmap,showmapped} dependencies, Linux only +#Linux only dependencies if test x"$linux" = x"yes"; then # libblkid AC_CHECK_HEADER([blkid/blkid.h], [], AC_MSG_ERROR([blkid/blkid.h not found (libblkid-dev, libblkid-devel)])) - AC_CHECK_LIB([blkid], [blkid_devno_to_wholedisk], [true], + AC_CHECK_LIB([blkid], [blkid_get_cache], [true], + AC_MSG_FAILURE([libblkid not found])) + AC_CHECK_LIB([blkid], [blkid_find_dev_with_tag], [true], + AC_MSG_FAILURE([libblkid not found])) + AC_CHECK_LIB([blkid], [blkid_dev_devname], [true], AC_MSG_FAILURE([libblkid not found])) - # libudev - AC_CHECK_HEADER([libudev.h], [], - AC_MSG_ERROR([libudev.h not found (libudev-dev, libudev-devel)])) - AC_CHECK_LIB([udev], [udev_monitor_receive_device], [true], - AC_MSG_FAILURE([libudev not found])) + # rbd {map,unmap,showmapped} dependencies, Linux only + if test x"$with_rbd" = x"yes"; then + # libblkid + AC_CHECK_LIB([blkid], [blkid_devno_to_wholedisk], [true], + AC_MSG_FAILURE([libblkid not found])) + + # libudev + AC_CHECK_HEADER([libudev.h], [], + AC_MSG_ERROR([libudev.h not found (libudev-dev, libudev-devel)])) + AC_CHECK_LIB([udev], [udev_monitor_receive_device], [true], + AC_MSG_FAILURE([libudev not found])) + + # libexpat + AC_CHECK_HEADER([expat.h], [], + AC_MSG_ERROR([expat.h not found (libexpat-devel)])) + AC_CHECK_LIB([expat], [XML_Parse], [true], + AC_MSG_FAILURE([libexpat not found])) + fi fi # @@ -208,7 +345,7 @@ dnl check for libkeyutils on linux KEYUTILS_LIB="" AS_IF([test x"$linux" = x"yes"], [ AC_CHECK_LIB([keyutils], [add_key], [KEYUTILS_LIB="-lkeyutils"], [ - AC_MSG_FAILURE([libkeyutils not found])])]) + AC_MSG_FAILURE([libkeyutils not found (libkeyutils-dev, keyutils-libs-devel)])])]) AC_SUBST(KEYUTILS_LIB) AC_CHECK_LIB([m], [pow], [true], AC_MSG_FAILURE([libm not found])) @@ -276,6 +413,18 @@ else AC_MSG_FAILURE([no suitable crypto library found]) fi +AC_ARG_ENABLE(gitversion, + [AC_HELP_STRING([--enable-gitversion], [build Ceph with git version string])], + [], [enable_gitversion=yes]) + +AM_CONDITIONAL(NO_GIT_VERSION, [test "x$enable_gitversion" = "xno"]) + +AC_ARG_ENABLE([root-make-check], + [AS_HELP_STRING([--enable-root-make-check], [enable make check tests that require root privileges])], + [], + [enable_root_make_check=no]) +AM_CONDITIONAL(ENABLE_ROOT_MAKE_CHECK, test "x$enable_root_make_check" != xno) + # profiler? AC_ARG_WITH([profiler], [AS_HELP_STRING([--with-profiler], [build extra profiler binaries])], @@ -287,11 +436,11 @@ AC_ARG_WITH([profiler], [with_profiler=no]) AS_IF([test "x$with_profiler" = xyes], [AC_CHECK_LIB([profiler], [ProfilerFlush], [], - [AC_MSG_FAILURE([--with-profiler was given but libprofiler (libgoogle-perftools-dev on debian) not found])]), - AC_LANG_PUSH([C++]), + [AC_MSG_FAILURE([--with-profiler was given but libprofiler (libgoogle-perftools-dev on debian) not found])]) + AC_LANG_PUSH([C++]) AC_CHECK_HEADERS([gperftools/heap-profiler.h \ gperftools/malloc_extension.h \ - gperftools/profiler.h]), + gperftools/profiler.h]) AC_LANG_POP([C++]) ], []) @@ -302,7 +451,7 @@ AS_IF([test "$with_profiler" = "yes"], # debug crap? 
AC_ARG_WITH([debug], - [AS_HELP_STRING([--with-debug], [build extra debug binaries])], + [AS_HELP_STRING([--with-debug], [build extra debug binaries and tests])], [case "${withval}" in yes) with_debug=yes ;; no) with_debug=no ;; @@ -313,12 +462,6 @@ AM_CONDITIONAL(WITH_DEBUG, test "$with_debug" = "yes") AC_DEFINE([DEBUG_GATHER], [1], [Define if you want C_Gather debugging]) -AC_ARG_ENABLE([docker], - [AS_HELP_STRING([--enable-docker], [enable docker based functional tests])], - [], - [enable_docker=no]) -AM_CONDITIONAL(ENABLE_DOCKER, test "x$enable_docker" != xno) - # code coverage? AC_ARG_ENABLE([coverage], [AS_HELP_STRING([--enable-coverage], [enable code coverage tracking])], @@ -330,11 +473,7 @@ if test "x$enable_coverage" != xno; then fi AC_SUBST(GCOV_PREFIX_STRIP, `echo $(pwd)/src | tr -dc / | wc -c`) -# radosgw? -AC_ARG_WITH([radosgw], - [AS_HELP_STRING([--with-radosgw], [build RADOS gateway])], - [], - [with_radosgw=check]) +# is radosgw available? RADOSGW=0 AS_IF([test "x$with_radosgw" != xno], [AC_CHECK_LIB([fcgi], [FCGX_Init], @@ -386,11 +525,52 @@ AS_IF([test "x$with_fuse" != xno], [no FUSE found (use --without-fuse to disable)])])]) AM_CONDITIONAL(WITH_FUSE, [test "$HAVE_LIBFUSE" = "1"]) +# jemalloc? +AC_ARG_WITH([jemalloc], + [AS_HELP_STRING([--with-jemalloc], [enable jemalloc for memory allocations])], + [], + [with_jemalloc=no]) +JEMALLOC= +AS_IF([test "x$with_jemalloc" = xyes], + [AC_CHECK_LIB([jemalloc], [malloc], + [AC_SUBST([LIBJEMALLOC], ["-ljemalloc"]) + AC_DEFINE([HAVE_LIBJEMALLOC], [1], + [Define if you have jemalloc]) + HAVE_LIBJEMALLOC=1 + ], + [AC_MSG_FAILURE( + [no jemalloc found (do not use --with-jemalloc)])])]) +AM_CONDITIONAL(WITH_JEMALLOC, [test "$HAVE_LIBJEMALLOC" = "1"]) + +# tcmalloc-minimal? +AC_ARG_WITH([tcmalloc-minimal], + [AS_HELP_STRING([--with-tcmalloc-minimal], [enable minimal tcmalloc support for memory allocations])], + [], + [with_tcmalloc_minimal=no]) + +AS_IF([test "x$with_jemalloc" = "xyes"],[with_tcmalloc_minimal=no],[]) + +TCMALLOC_MINIMAL= +AS_IF([test "x$with_tcmalloc_minimal" != xno], + [AC_CHECK_LIB([tcmalloc_minimal], [malloc], + [AC_SUBST([LIBTCMALLOC], ["-ltcmalloc_minimal"]) + AC_DEFINE([HAVE_LIBTCMALLOC_MINIMAL], [1], + [Define if you have tcmalloc]) + HAVE_LIBTCMALLOC_MINIMAL=1 + ], + [AC_MSG_FAILURE( + [no tcmalloc found (do not use --with-tcmalloc-minimal)])])]) +AM_CONDITIONAL(WITH_TCMALLOC_MINIMAL, [test "$HAVE_LIBTCMALLOC_MINIMAL" = "1"]) + # tcmalloc? AC_ARG_WITH([tcmalloc], [AS_HELP_STRING([--without-tcmalloc], [disable tcmalloc for memory allocations])], [], [with_tcmalloc=yes]) + +AS_IF([test "x$with_jemalloc" = "xyes"],[with_tcmalloc=no],[]) +AS_IF([test "x$with_tcmalloc_minimal" = "xyes"],[with_tcmalloc=no],[]) + TCMALLOC= AS_IF([test "x$with_tcmalloc" != xno], [AC_CHECK_LIB([tcmalloc], [malloc], @@ -403,11 +583,20 @@ AS_IF([test "x$with_tcmalloc" != xno], [no tcmalloc found (use --without-tcmalloc to disable)])])]) AM_CONDITIONAL(WITH_TCMALLOC, [test "$HAVE_LIBTCMALLOC" = "1"]) +# error out if --with-jemalloc and ! --without-tcmalloc +if test "x$with_jemalloc" = "xyes"; then + if test "x$with_tcmalloc" != "xno"; then + AC_MSG_FAILURE([--with-jemalloc called without --without-tcmalloc]) + fi +fi + #set pg ref debugging? 
AC_ARG_ENABLE([pgrefdebugging], [AS_HELP_STRING([--enable-pgrefdebugging], [enable pg ref debugging])], - [AC_DEFINE([PG_DEBUG_REFS], [1], [Defined if you want pg ref debugging])], - []) + [], [enable_pgrefdebugging=no]) +AS_IF([test "x$enable_pgrefdebugging" = "xyes"], + [AC_DEFINE([PG_DEBUG_REFS], [1], [Defined if you want pg ref debugging])], + []) # # Java is painful @@ -459,9 +648,9 @@ if test "x$enable_cephfs_java" = "xyes"; then # the search path. AS_IF([test "x$with_debug" = "xyes"], [ dir='/usr/share/java' - junit4_jar=`find $dir -name junit4.jar | head -n 1` + junit4_jar=`( find $dir -name junit4.jar;find $dir -name junit.jar ) | head -n 1` AS_IF([test -r "$junit4_jar"], [ - EXTRA_CLASSPATH_JAR=`dirname $junit4_jar`/junit4.jar + EXTRA_CLASSPATH_JAR="$junit4_jar" AC_SUBST(EXTRA_CLASSPATH_JAR) [have_junit4=1]], [ AC_MSG_NOTICE([Cannot find junit4.jar (apt-get install junit4)]) @@ -494,10 +683,35 @@ if test "x$enable_cephfs_java" = "xyes"; then fi AM_CONDITIONAL(HAVE_JUNIT4, [test "$have_junit4" = "1"]) +# +# Accelio and OFED +# +AC_ARG_ENABLE(xio, + [AC_HELP_STRING([--enable-xio], [build Ceph Accelio transport])], + [], [enable_xio=no]) + +AM_CONDITIONAL(ENABLE_XIO, [test "x$enable_xio" = "xyes"]) + +if test "x$enable_xio" = x"yes"; then + AC_CHECK_HEADER([libxio.h], [], AC_MSG_ERROR([Cannot find header 'libxio.h'.])) + AC_CHECK_LIB([xio], [xio_init], [], AC_MSG_FAILURE([Accelio libxio not found])) + AC_CHECK_LIB([ibverbs], [ibv_query_device], [], AC_MSG_FAILURE([libibverbs not found])) + AC_CHECK_LIB([rdmacm], [rdma_connect], [], AC_MSG_FAILURE([librdmacm not found])) + + # Also require boost-regex, used in address_helper + AC_CHECK_LIB(boost_regex, main, [], + AC_MSG_FAILURE(["Boost regex library not found."])) + + AC_DEFINE([HAVE_XIO], [1], [Accelio conditional compilation]) + + XIO_LIBS="-lxio -libverbs -lrdmacm" + AC_SUBST(XIO_LIBS) +fi + # # FreeBSD has it in base. 
# -if test x"$freebsd" != x"yes"; then +if test x"$freebsd" != x"yes" -a x"$with_radosgw" = x"yes"; then PKG_CHECK_MODULES([LIBEDIT], [libedit >= 2.11], [], AC_MSG_FAILURE([No usable version of libedit found.])) else @@ -539,18 +753,21 @@ AC_ARG_WITH([ocf], [with_ocf=no]) AM_CONDITIONAL(WITH_OCF, [ test "$with_ocf" = "yes" ]) -# check is snappy-devel is installed, needed by leveldb -AC_CHECK_LIB([snappy], [snappy_compress], [true], [AC_MSG_FAILURE([libsnappy not found])]) -# use system leveldb -AC_CHECK_LIB([leveldb], [leveldb_open], [true], [AC_MSG_FAILURE([libleveldb not found])], [-lsnappy -lpthread]) -# see if we can use bloom filters with leveldb -AC_LANG_PUSH([C++]) -AC_CHECK_HEADER([leveldb/filter_policy.h], [AC_DEFINE([HAVE_LEVELDB_FILTER_POLICY], [1], [Defined if LevelDB supports bloom filters ])]) -AC_LANG_POP([C++]) +# cond-check snappy-devel and leveldb, necessary if server + osd or mon enabled +if test "$enable_server" = "yes" -a \( "$with_osd" = "yes" -o "$with_mon" = "yes" \); then + AC_CHECK_LIB([snappy], [snappy_compress], [true], [AC_MSG_FAILURE([libsnappy not found])]) + AC_CHECK_LIB([leveldb], [leveldb_open], [true], [AC_MSG_FAILURE([libleveldb not found])], [-lsnappy -lpthread]) + + # see if we can use bloom filters with leveldb + AC_LANG_PUSH([C++]) + AC_CHECK_HEADER([leveldb/filter_policy.h], [AC_DEFINE([HAVE_LEVELDB_FILTER_POLICY], [1], [Defined if LevelDB supports bloom filters ])]) + AC_LANG_POP([C++]) +fi # Find supported SIMD / NEON / SSE extensions supported by the compiler AX_ARM_FEATURES() AM_CONDITIONAL(HAVE_NEON, [ test "x$ax_cv_support_neon_ext" = "xyes"]) +AM_CONDITIONAL(HAVE_ARMV8_CRC, [ test "x$ax_cv_support_crc_ext" = "xyes"]) AX_INTEL_FEATURES() AM_CONDITIONAL(HAVE_SSSE3, [ test "x$ax_cv_support_ssse3_ext" = "xyes"]) AM_CONDITIONAL(HAVE_SSE4_PCLMUL, [ test "x$ax_cv_support_pclmuldq_ext" = "xyes"]) @@ -596,23 +813,19 @@ AS_IF([test "x$with_librocksdb_static" = "xyes"], AM_CONDITIONAL(WITH_SLIBROCKSDB, [ test "x$with_librocksdb_static" = "xyes" ]) AM_CONDITIONAL(WITH_LIBROCKSDB, [ test "x$with_librocksdb_static" = "xyes" -o "x$with_librocksdb" = "xyes" ]) -# use system libs3? -AC_ARG_WITH([system-libs3], - [AS_HELP_STRING([--with-system-libs3], [use system libs3])], - , - [with_system_libs3=no]) -AS_IF([test "x$with_system_libs3" = xyes], - [AC_CHECK_LIB([s3], [S3_initialize], [true], [AC_MSG_FAILURE([libs3 not found])], [-lpthread])]) -AS_IF([test "x$with_system_libs3" = xcheck], - [AC_SEARCH_LIBS([S3_initialize], [s3], [with_system_libs3=yes], [true], [-lpthread])]) -AM_CONDITIONAL(WITH_SYSTEM_LIBS3, [ test "$with_system_libs3" = "yes" ]) - -# rest-bench? -AC_ARG_WITH([rest-bench], - [AS_HELP_STRING([--with-rest-bench], [enables rest-bench])], - [], - [with_rest_bench=no]) -AM_CONDITIONAL(WITH_REST_BENCH, [ test "$with_rest_bench" = "yes" ]) +# error out if --with-jemalloc and --with-librocksdb_static as rocksdb uses tcmalloc +if test "x$with_jemalloc" = "xyes"; then + if test "x$with_librocksdb_static" != "xno"; then + AC_MSG_FAILURE([--with-jemalloc called with --with-librocksdb_static, turn off + --with-librocksdb-static or --with-jemalloc]) + fi +fi + +# needs libcurl and libxml2 +if test "x$with_rest_bench" = xyes && test "x$with_system_libs3" = xno; then + AC_CHECK_LIB([curl], [curl_easy_init], [], AC_MSG_ERROR([libcurl not found])) + AC_CHECK_LIB([xml2], [xmlParseChunk], [], AC_MSG_ERROR([libxml2 not found])) +fi # use libaio? 
AC_ARG_WITH([libaio],
@@ -807,6 +1020,27 @@ AC_DEFINE([HAVE_FDATASYNC], 1, [Define to 1 if you have fdatasync.])
 AC_MSG_RESULT([no])
 ])
+AC_MSG_CHECKING([for sched.h])
+AC_LANG_PUSH([C++])
+AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
+#define _GNU_SOURCE
+#include <sched.h>
+]], [[
+cpu_set_t cpuset;
+CPU_ZERO(&cpuset);
+CPU_SET(sched_getcpu(), &cpuset);
+sched_setaffinity(0, sizeof(cpuset), &cpuset);
+sched_yield();
+return 0;
+]])], [
+AC_MSG_RESULT([yes])
+AC_DEFINE([HAVE_SCHED], 1, [Define to 1 if you have sched.h.])
+], [
+AC_MSG_RESULT([no])
+])
+AC_LANG_POP([C++])
+
+
 #
 # Check for pthread spinlock (depends on ACX_PTHREAD)
 #
@@ -916,6 +1150,129 @@ AM_COND_IF([WITH_BABELTRACE], [
     AC_MSG_ERROR([babeltrace/ctf/events.h not found (libbabeltrace-ctf-dev, libbabeltrace-devel)]))
 ])
+dnl check for valgrind
+AC_ARG_ENABLE([valgrind],
+  [AS_HELP_STRING([--enable-valgrind], [enable valgrind unit tests])],
+  [enable_valgrind=$enableval], [enable_valgrind=])
+AC_CHECK_PROG(HAVE_VALGRIND, valgrind, yes)
+AS_IF(
+  [test "x$HAVE_VALGRIND" = "x"], AS_IF([test "x$enable_valgrind" = "xyes"], [AC_MSG_ERROR([valgrind not found])]),
+  [test "x$enable_valgrind" = "x"], [enable_valgrind=yes])
+
+AM_CONDITIONAL([VALGRIND_ENABLED], [test "x$enable_valgrind" = "xyes"])
+if test "x$enable_valgrind" = "xyes"; then
+  AC_CHECK_HEADERS([valgrind/helgrind.h])
+fi
+
+dnl systemd-libexec-dir
+AC_SUBST(systemd_libexec_dir)
+AC_ARG_WITH(
+  systemd-libexec-dir,
+  AS_HELP_STRING(
+    [--with-systemd-libexec-dir=DIR],
+    [systemd libexec directory @<:@SYSTEMD_LIBEXEC_DIR@:>@
+    defaults to --libexecdir=DIR]
+  ),
+  [
+    systemd_libexec_dir="$withval"
+  ],
+  [
+    if test "x$SYSTEMD_LIBEXEC_DIR" = "x"; then
+      dnl store old values
+
+      prefix_save=$prefix
+      exec_prefix_save=$exec_prefix
+
+      dnl if no prefix given, then use /usr/local, the default prefix
+      if test "x$prefix" = "xNONE"; then
+        prefix="$ac_default_prefix"
+      fi
+      dnl if no exec_prefix given, then use prefix
+      if test "x$exec_prefix" = "xNONE"; then
+        exec_prefix=$prefix
+      fi
+
+      dnl now get the expanded default
+      systemd_libexec_dir="`eval exec_prefix=$exec_prefix prefix=$prefix echo $libexecdir`"
+
+      dnl now cleanup prefix and exec_prefix
+
+      prefix=$prefix_save
+      exec_prefix=$exec_prefix_save
+    else
+      systemd_libexec_dir="$SYSTEMD_LIBEXEC_DIR"
+    fi
+  ]
+)
+
+
+dnl rgw-user
+AC_SUBST(user_rgw)
+AC_ARG_WITH(
+  rgw-user,
+  AS_HELP_STRING(
+    [--with-rgw-user=USER],
+    [user to run radosgw as @<:@USER_RGW@:>@
+    Defaults to "www-data"]
+  ),
+  [
+    user_rgw="$withval"
+  ],
+  [
+    if test "x$USER_RGW" = "x"; then
+      user_rgw=www-data
+    else
+      user_rgw="$USER_RGW"
+    fi
+  ]
+)
+
+dnl rgw-group
+AC_SUBST(group_rgw)
+AC_ARG_WITH(
+  rgw-group,
+  AS_HELP_STRING(
+    [--with-rgw-group=GROUP],
+    [group to run radosgw as @<:@GROUP_RGW@:>@
+    Defaults to "www-data"]
+  ),
+  [
+    group_rgw="$withval"
+  ],
+  [
+    if test "x$GROUP_RGW" = "x"; then
+      group_rgw=www-data
+    else
+      group_rgw="$GROUP_RGW"
+    fi
+  ]
+)
+
+
+AC_SUBST(systemd_unit_dir)
+AC_ARG_WITH(
+  systemd-unit-dir,
+  AS_HELP_STRING(
+    [--with-systemd-unit-dir=DIR],
+    [systemd unit directory @<:@SYSTEMD_UNIT_DIR@:>@
+    Defaults to the correct value for debian /etc/systemd/system/]
+  ),
+  [
+    systemd_unit_dir="$withval"
+  ],
+  [
+    # default to the systemd admin unit directory
+    which pkg-config
+    pkg_config_exists=$?
+ if test x"$pkg_config_exists" = x"0"; then + systemd_unit_dir=`pkg-config systemd --variable=systemdsystemunitdir` + else + systemd_unit_dir="/etc/systemd/system/" + fi + ] +) + + # Checks for typedefs, structures, and compiler characteristics. @@ -978,6 +1335,11 @@ AC_CONFIG_FILES([Makefile src/ocf/rbd src/java/Makefile src/tracing/Makefile + systemd/Makefile man/Makefile + doc/Makefile + selinux/Makefile + systemd/ceph-osd@.service + systemd/ceph-rgw.tmpfiles.d ceph.spec]) AC_OUTPUT diff --git a/debian/.gitignore b/debian/.gitignore index 21b52f4e9d569..fdf4f729e71a9 100644 --- a/debian/.gitignore +++ b/debian/.gitignore @@ -27,9 +27,10 @@ /librbd1 /radosgw-dbg /radosgw -/rest-bench-dbg -/rest-bench /python-ceph +/python-rados +/python-rbd +/python-cephfs /libcephfs-java /libcephfs-jni /tmp diff --git a/debian/ceph-common.install b/debian/ceph-common.install index d49b4d790b935..4e21adff9c474 100644 --- a/debian/ceph-common.install +++ b/debian/ceph-common.install @@ -4,6 +4,7 @@ usr/bin/ceph usr/bin/ceph-authtool usr/bin/ceph-conf usr/bin/ceph-dencoder +usr/bin/ceph-rbdnamer usr/bin/ceph-syn usr/bin/ceph-crush-location usr/bin/rados @@ -13,6 +14,7 @@ usr/bin/ceph-brag usr/share/man/man8/ceph-authtool.8 usr/share/man/man8/ceph-conf.8 usr/share/man/man8/ceph-dencoder.8 +usr/share/man/man8/ceph-rbdnamer.8 usr/share/man/man8/ceph-syn.8 usr/share/man/man8/ceph-post-file.8 usr/share/man/man8/ceph.8 @@ -22,4 +24,6 @@ usr/share/ceph/known_hosts_drop.ceph.com usr/share/ceph/id_dsa_drop.ceph.com usr/share/ceph/id_dsa_drop.ceph.com.pub etc/ceph/rbdmap -etc/init.d/rbdmap \ No newline at end of file +etc/init.d/rbdmap +lib/udev/rules.d/50-rbd.rules +usr/lib/python*/dist-packages/ceph_argparse.py* diff --git a/debian/ceph-mds.dirs b/debian/ceph-mds.dirs new file mode 100644 index 0000000000000..9845268080de8 --- /dev/null +++ b/debian/ceph-mds.dirs @@ -0,0 +1 @@ +var/lib/ceph/mds diff --git a/debian/ceph-mds.install b/debian/ceph-mds.install index df3cbe4e637f3..85bdc3bdf0ce6 100644 --- a/debian/ceph-mds.install +++ b/debian/ceph-mds.install @@ -1,3 +1,5 @@ usr/bin/ceph-mds usr/bin/cephfs-journal-tool +usr/bin/cephfs-table-tool +usr/bin/cephfs-data-scan usr/share/man/man8/ceph-mds.8 diff --git a/debian/ceph-test.install b/debian/ceph-test.install index 3dfeeb15ecd42..367cf4777ac37 100644 --- a/debian/ceph-test.install +++ b/debian/ceph-test.install @@ -1,11 +1,14 @@ usr/bin/ceph-coverage usr/bin/ceph_bench_log -usr/bin/ceph_objectstore_tool usr/bin/ceph_kvstorebench usr/bin/ceph_multi_stress_watch usr/bin/ceph_erasure_code usr/bin/ceph_erasure_code_benchmark usr/bin/ceph_omapbench +usr/bin/ceph_perf_objectstore +usr/bin/ceph_perf_local +usr/bin/ceph_perf_msgr_client +usr/bin/ceph_perf_msgr_server usr/bin/ceph_psim usr/bin/ceph_radosacl usr/bin/ceph_rgw_jsonparser @@ -26,3 +29,4 @@ usr/bin/ceph-kvstore-tool usr/share/java/libcephfs-test.jar usr/bin/rbd-replay* usr/share/man/man8/rbd-replay*.8 +usr/lib/ceph/ceph-monstore-update-crush.sh diff --git a/debian/ceph.dirs b/debian/ceph.dirs index 21b6c23ca0364..faff244918ae7 100644 --- a/debian/ceph.dirs +++ b/debian/ceph.dirs @@ -1,6 +1,6 @@ var/lib/ceph/tmp var/lib/ceph/mon var/lib/ceph/osd -var/lib/ceph/mds var/lib/ceph/bootstrap-osd var/lib/ceph/bootstrap-mds +var/lib/ceph/bootstrap-rgw diff --git a/debian/ceph.install b/debian/ceph.install index 1055993d807e1..353f200fe3adc 100644 --- a/debian/ceph.install +++ b/debian/ceph.install @@ -3,16 +3,16 @@ lib/udev/rules.d/95-ceph-osd.rules lib/udev/rules.d/60-ceph-partuuid-workaround.rules 
usr/sbin/ceph-create-keys usr/sbin/ceph-disk -usr/sbin/ceph-disk-activate -usr/sbin/ceph-disk-prepare +usr/bin/ceph-detect-init usr/bin/ceph-clsinfo usr/bin/ceph-debugpack usr/bin/ceph-mon +usr/bin/ceph-objectstore-tool usr/bin/ceph-osd usr/bin/ceph-run usr/bin/ceph-rest-api usr/lib/python*/dist-packages/ceph_rest_api.py -usr/bin/ceph_mon_store_converter +usr/lib/python*/dist-packages/ceph_detect_init* usr/bin/crushtool usr/bin/monmaptool usr/bin/osdmaptool @@ -23,7 +23,10 @@ usr/libexec/ceph/ceph-osd-prestart.sh usr/share/doc/ceph/sample.ceph.conf usr/share/doc/ceph/sample.fetch_config usr/share/man/man8/ceph-clsinfo.8 +usr/share/man/man8/ceph-create-keys.8 usr/share/man/man8/ceph-debugpack.8 +usr/share/man/man8/ceph-deploy.8 +usr/share/man/man8/ceph-detect-init.8 usr/share/man/man8/ceph-disk.8 usr/share/man/man8/ceph-mon.8 usr/share/man/man8/ceph-osd.8 @@ -32,3 +35,4 @@ usr/share/man/man8/ceph-rest-api.8 usr/share/man/man8/crushtool.8 usr/share/man/man8/monmaptool.8 usr/share/man/man8/osdmaptool.8 +usr/lib/python*/dist-packages/ceph_daemon.py* diff --git a/debian/changelog b/debian/changelog index 2d0430cd609f3..318a59c05e009 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,51 @@ +ceph (9.0.2-1) stable; urgency=low + + * New upstream release + + -- Alfredo Deza Tue, 14 Jul 2015 13:10:31 -0700 + +ceph (9.0.1-1) stable; urgency=low + + * New upstream release + + -- Alfredo Deza Fri, 05 Jun 2015 10:59:02 -0700 + +ceph (9.0.0-1) stable; urgency=low + + * New upstream release + + -- Alfredo Deza Mon, 04 May 2015 12:32:58 -0700 + +ceph (0.94-1) stable; urgency=low + + * New upstream release + + -- Alfredo Deza Tue, 07 Apr 2015 10:05:40 -0700 + +ceph (0.93-1) stable; urgency=low + + * New upstream release + + -- Alfredo Deza Fri, 27 Feb 2015 09:52:53 -0800 + +ceph (0.92-1) stable; urgency=low + + * New upstream release + + -- Alfredo Deza Mon, 02 Feb 2015 10:35:27 -0800 + +ceph (0.91-1) stable; urgency=low + + * New upstream release + + -- Alfredo Deza Tue, 13 Jan 2015 12:10:22 -0800 + +ceph (0.90-1) stable; urgency=low + + * New upstream release + + -- Alfredo Deza Fri, 19 Dec 2014 06:56:22 -0800 + ceph (0.89-1) stable; urgency=low * New upstream release diff --git a/debian/control b/debian/control index f0db89d4475e4..0e4cd1e17518a 100644 --- a/debian/control +++ b/debian/control @@ -10,9 +10,11 @@ Build-Depends: autoconf, automake, autotools-dev, libbz2-dev, + cryptsetup-bin | cryptsetup, debhelper (>= 6.0.7~), default-jdk, git, + gdisk, javahelper, junit4, libaio-dev, @@ -29,7 +31,7 @@ Build-Depends: autoconf, libexpat1-dev, libfcgi-dev, libfuse-dev, - libgoogle-perftools-dev [i386 amd64], + libgoogle-perftools-dev [i386 amd64 arm64], libkeyutils-dev, libleveldb-dev, libnss3-dev, @@ -39,22 +41,33 @@ Build-Depends: autoconf, libudev-dev, libxml2-dev, lsb-release, + parted, pkg-config, python (>= 2.6.6-3~), python-argparse, python-nose, + python-sphinx, + python-virtualenv, + sdparm | hdparm, uuid-dev, uuid-runtime, xfslibs-dev, - yasm [amd64] + xfsprogs, + xmlstarlet, + yasm [amd64], + zlib1g-dev Standards-Version: 3.9.3 Package: ceph Architecture: linux-any Depends: binutils, - ceph-common (>= 0.78-500), + ceph-common (>= 9.0.0-943), cryptsetup-bin | cryptsetup, + debianutils, + findutils, gdisk, + grep, + logrotate, parted, python, python-argparse, @@ -65,8 +78,9 @@ Depends: binutils, ${misc:Depends}, ${shlibs:Depends} Recommends: btrfs-tools, ceph-mds, librados2, libradosstriper1, librbd1 -Replaces: ceph-common (<< 0.78-500), python-ceph (<< 0.85-979) -Breaks: python-ceph 
(<< 0.85-979) +Replaces: ceph-common (<< 0.78-500), python-ceph (<< 0.92-1223), + ceph-test (<< 0.94-1322) +Breaks: python-ceph (<< 0.92-1223), ceph-test (<< 0.94-1322) X-Python-Version: >= 2.6 Description: distributed storage and file system Ceph is a massively scalable, open-source, distributed @@ -83,6 +97,8 @@ Architecture: linux-any Section: debug Priority: extra Depends: ceph (= ${binary:Version}), ${misc:Depends} +Replaces: ceph-test-dbg (<< 0.94-1322) +Breaks: ceph-test-dbg (<< 0.94-1322) Description: debugging symbols for ceph Ceph is a distributed storage system designed to provide excellent performance, reliability, and scalability. @@ -93,8 +109,8 @@ Package: ceph-mds Architecture: linux-any Depends: ceph, ${misc:Depends}, ${shlibs:Depends} Recommends: ceph-fs-common, ceph-fuse, libcephfs1 -Replaces: ceph (<< 0.58-1) -Breaks: ceph (<< 0.58-1) +Replaces: ceph (<< 0.93-417) +Breaks: ceph (<< 0.93-417) Description: metadata server for the ceph distributed file system Ceph is a massively scalable, open-source, distributed storage system that runs on commodity hardware and delivers object, @@ -174,10 +190,18 @@ Description: debugging symbols for rbd-fuse Package: ceph-common Architecture: linux-any Depends: librbd1 (= ${binary:Version}), ${misc:Depends}, ${shlibs:Depends}, - python-ceph (= ${binary:Version}), python-requests + python-rados (= ${binary:Version}), + python-cephfs (= ${binary:Version}), + python-rbd (= ${binary:Version}), + python-requests Conflicts: ceph-client-tools -Replaces: ceph-client-tools, ceph (<< 0.78-500) -Breaks: ceph (<< 0.78-500) +Replaces: ceph-client-tools, + ceph (<< 9.0.0-943), + python-ceph (<< 0.92-1223), + librbd1 (<< 0.92-1238) +Breaks: ceph (<< 9.0.0-943), + python-ceph (<< 0.92-1223), + librbd1 (<< 0.92-1238) Suggests: ceph, ceph-mds Description: common utilities to mount and interact with a ceph storage cluster Ceph is a massively scalable, open-source, distributed @@ -402,7 +426,8 @@ Description: Ceph distributed file system client library (development files) Package: radosgw Architecture: linux-any -Depends: ceph-common (= ${binary:Version}), ${misc:Depends}, ${shlibs:Depends} +Depends: ceph-common (= ${binary:Version}), mime-support, + ${misc:Depends}, ${shlibs:Depends} Description: REST gateway for RADOS distributed object store RADOS is a distributed object store used by the Ceph distributed storage system. This package provides a REST gateway to the @@ -424,25 +449,9 @@ Description: debugging symbols for radosgw . This package contains debugging symbols for radosgw. -Package: rest-bench -Architecture: linux-any -Depends: ceph-common, curl, xml2, ${misc:Depends}, ${shlibs:Depends} -Description: RESTful bencher that can be used to benchmark - radosgw performance. - -Package: rest-bench-dbg -Architecture: linux-any -Section: debug -Priority: extra -Depends: ceph-common, curl, xml2, ${misc:Depends}, ${shlibs:Depends} -Description: debugging symbols for rest-bench - radosgw performance. - . - This package contains the debugging symbols for rest-bench. - Package: ceph-test Architecture: linux-any -Depends: ceph-common, curl, xml2, ${misc:Depends}, ${shlibs:Depends} +Depends: ceph-common, curl, xml2, xmlstarlet, ${misc:Depends}, ${shlibs:Depends} Description: Ceph test and benchmarking tools This package contains tools for testing and benchmarking Ceph. 
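The ``(= ${binary:Version})`` pinning used in the hunk below (and in the stanzas above) is a Debian convention: dpkg substitutes the exact version of the binary package at build time, so a debug package can never be installed against a mismatched binary. A quick way to verify the result on a built package (a sketch only; the filename and printed field are illustrative):

    # a -dbg package should depend on the exact version of its binary package
    dpkg-deb -f ceph-test-dbg_9.0.2-1_amd64.deb Depends
    # e.g.: ceph-test (= 9.0.2-1), ceph-common, curl, ...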
@@ -450,7 +459,8 @@ Package: ceph-test-dbg Architecture: linux-any Section: debug Priority: extra -Depends: ceph-common, curl, xml2, ${misc:Depends}, ${shlibs:Depends} +Depends: ceph-test (= ${binary:Version}), ceph-common, curl, xml2, + ${misc:Depends}, ${shlibs:Depends} Description: Ceph test and benchmarking tools . This package contains the debugging symbols for ceph-test. @@ -458,16 +468,59 @@ Description: Ceph test and benchmarking tools Package: python-ceph Architecture: linux-any Section: python -Depends: librados2, librbd1, libcephfs1, ${misc:Depends}, ${python:Depends} -Replaces: ceph (<< 0.85-979) +Depends: python-rados, python-rbd, python-cephfs +X-Python-Version: >= 2.6 +Description: Meta-package for the Ceph python libraries + Ceph is a massively scalable, open-source, distributed + storage system that runs on commodity hardware and delivers object, + block and file system storage. + . + This package is a metapackage for all python bindings. + +Package: python-rados +Architecture: linux-any +Section: python +Depends: librados2, ${misc:Depends}, ${python:Depends} +Replaces: python-ceph (<< 0.92-1223) +Breaks: python-ceph (<< 0.92-1223) +X-Python-Version: >= 2.6 +Description: Python libraries for the Ceph librados library + Ceph is a massively scalable, open-source, distributed + storage system that runs on commodity hardware and delivers object, + block and file system storage. + . + This package contains Python libraries for interacting with Ceph's + RADOS object storage. + +Package: python-rbd +Architecture: linux-any +Section: python +Depends: librbd1, ${misc:Depends}, ${python:Depends} +Replaces: python-ceph (<< 0.92-1223) +Breaks: python-ceph (<< 0.92-1223) +X-Python-Version: >= 2.6 +Description: Python libraries for the Ceph librbd library + Ceph is a massively scalable, open-source, distributed + storage system that runs on commodity hardware and delivers object, + block and file system storage. + . + This package contains Python libraries for interacting with Ceph's + RBD block device library. + +Package: python-cephfs +Architecture: linux-any +Section: python +Depends: libcephfs1, ${misc:Depends}, ${python:Depends} +Replaces: python-ceph (<< 0.92-1223) +Breaks: python-ceph (<< 0.92-1223) X-Python-Version: >= 2.6 -Description: Python libraries for the Ceph distributed filesystem +Description: Python libraries for the Ceph libcephfs library Ceph is a massively scalable, open-source, distributed storage system that runs on commodity hardware and delivers object, block and file system storage. . This package contains Python libraries for interacting with Ceph's - RADOS object storage, and RBD (RADOS block device). + CephFS file system client library. Package: libcephfs-java Section: java
diff --git a/debian/copyright b/debian/copyright index d3906c44d3510..db2fafaaa6335 100644 --- a/debian/copyright +++ b/debian/copyright @@ -13,11 +13,11 @@ License: Creative Commons Attribution-ShareAlike (CC BY-SA) Files: src/mount/canonicalize.c Copyright: Copyright (C) 1993 Rick Sladkey -License: LGPL2 or later +License: LGPL2 or later (see COPYING-GPL2) Files: src/os/btrfs_ioctl.h Copyright: Copyright (C) 2007 Oracle. All rights reserved. -License: GPL2 +License: GPL2 (see COPYING-GPL2) Files: src/include/ceph_hash.cc Copyright: None @@ -138,3 +138,10 @@ Packaging: Copyright (C) 2004-2009 by Sage Weil Copyright (C) 2010 Canonical, Ltd.
Licensed under LGPL-2.1 + +Files: src/test/perf_local.cc +Copyright: + (c) 2011-2014 Stanford University + (c) 2011 Facebook +License: + The MIT License diff --git a/debian/librbd1.install b/debian/librbd1.install index 0aaf73338a013..b3cb648c6362b 100644 --- a/debian/librbd1.install +++ b/debian/librbd1.install @@ -1,4 +1 @@ -lib/udev/rules.d/50-rbd.rules -usr/bin/ceph-rbdnamer usr/lib/librbd.so.* -usr/share/man/man8/ceph-rbdnamer.8 diff --git a/debian/python-ceph.install b/debian/python-ceph.install deleted file mode 100644 index 427f7f14a0bbe..0000000000000 --- a/debian/python-ceph.install +++ /dev/null @@ -1,4 +0,0 @@ -usr/lib/python*/dist-packages/rados.py* -usr/lib/python*/dist-packages/rbd.py* -usr/lib/python*/dist-packages/cephfs.py* -usr/lib/python*/dist-packages/ceph_argparse.py* diff --git a/debian/python-cephfs.install b/debian/python-cephfs.install new file mode 100644 index 0000000000000..458102cd555a4 --- /dev/null +++ b/debian/python-cephfs.install @@ -0,0 +1 @@ +usr/lib/python*/dist-packages/cephfs.py* diff --git a/debian/python-rados.install b/debian/python-rados.install new file mode 100644 index 0000000000000..7012f60b4afa9 --- /dev/null +++ b/debian/python-rados.install @@ -0,0 +1 @@ +usr/lib/python*/dist-packages/rados.py* diff --git a/debian/python-rbd.install b/debian/python-rbd.install new file mode 100644 index 0000000000000..a4ec715d9842a --- /dev/null +++ b/debian/python-rbd.install @@ -0,0 +1 @@ +usr/lib/python*/dist-packages/rbd.py* diff --git a/debian/rest-bench.install b/debian/rest-bench.install deleted file mode 100644 index 8535f20d5a4d3..0000000000000 --- a/debian/rest-bench.install +++ /dev/null @@ -1 +0,0 @@ -usr/bin/rest-bench diff --git a/debian/rules b/debian/rules index 34f6939e007b9..bb0aeaf3da18e 100755 --- a/debian/rules +++ b/debian/rules @@ -20,7 +20,7 @@ endif export DEB_HOST_ARCH ?= $(shell dpkg-architecture -qDEB_HOST_ARCH) -extraopts += --with-ocf --with-rest-bench --with-nss +extraopts += --with-ocf --with-nss extraopts += --with-debug extraopts += --enable-cephfs-java @@ -33,12 +33,6 @@ ifeq ($(DEB_HOST_ARCH), armel) extraopts += --without-libatomic-ops endif -ifeq ($(shell lsb_release -sc | egrep -q '(precise|quantal|raring|saucy|wheezy|squeeze)' && echo yes),yes) - extraopts += --without-lttng --without-babeltrace -else - extraopts += --with-lttng --with-babeltrace -endif - configure: configure-stamp configure-stamp: dh_testdir @@ -149,7 +143,6 @@ binary-arch: build install dh_strip -plibrbd1 --dbg-package=librbd1-dbg dh_strip -plibcephfs1 --dbg-package=libcephfs1-dbg dh_strip -pradosgw --dbg-package=radosgw-dbg - dh_strip -prest-bench --dbg-package=rest-bench-dbg dh_strip -pceph-test --dbg-package=ceph-test-dbg dh_compress -a diff --git a/do_autogen.sh b/do_autogen.sh index 51fb4405ba9ca..bfbe528bdcef9 100755 --- a/do_autogen.sh +++ b/do_autogen.sh @@ -16,8 +16,11 @@ do_autogen.sh: make a ceph build by running autogen, etc. 
-p google profiler -O optimize -c use cryptopp +-C add parameters to configure -j with java -r with rocksdb +-J --with-jemalloc +-L --without-lttng EOF } @@ -30,8 +33,8 @@ die() { debug_level=0 verbose=0 profile=0 -CONFIGURE_FLAGS="--disable-static" -while getopts "d:e:hHrTPjpcvO:" flag +CONFIGURE_FLAGS="--disable-static --with-lttng" +while getopts "d:e:hHrTPJLjpcvO:C:" flag do case $flag in d) debug_level=$OPTARG;; @@ -40,6 +43,8 @@ do c) CONFIGURE_FLAGS="$CONFIGURE_FLAGS --with-cryptopp --without-nss";; + C) CONFIGURE_FLAGS="$CONFIGURE_FLAGS $OPTARG";; + P) profile=1;; p) with_profiler="--with-profiler" ;; @@ -56,6 +61,10 @@ do e) encode_dump=$OPTARG;; + J) CONFIGURE_FLAGS="$CONFIGURE_FLAGS --with-jemalloc";; + + L) CONFIGURE_FLAGS="$CONFIGURE_FLAGS --without-lttng";; + *) echo usage @@ -126,6 +135,6 @@ export CXXFLAGS ./configure \ --prefix=/usr --sbindir=/sbin --localstatedir=/var --sysconfdir=/etc \ ---with-debug $with_profiler --with-nss --with-radosgw \ +--with-debug $with_profiler --with-nss --without-cryptopp --with-radosgw \ $CONFIGURE_FLAGS \ || die "configure failed" diff --git a/doc/.gitignore b/doc/.gitignore index 0c7c74746ae94..aca7518a3ee0a 100644 --- a/doc/.gitignore +++ b/doc/.gitignore @@ -1,2 +1,3 @@ /overview.png /object_store.png +/Makefile diff --git a/doc/Makefile.am b/doc/Makefile.am new file mode 100644 index 0000000000000..344bd8948431f --- /dev/null +++ b/doc/Makefile.am @@ -0,0 +1,33 @@ +EXTRA_DIST = \ + man/8/ceph-authtool.rst \ + man/8/ceph-clsinfo.rst \ + man/8/ceph-conf.rst \ + man/8/ceph-create-keys.rst \ + man/8/ceph-debugpack.rst \ + man/8/ceph-dencoder.rst \ + man/8/ceph-deploy.rst \ + man/8/ceph-disk.rst \ + man/8/cephfs.rst \ + man/8/ceph-fuse.rst \ + man/8/ceph-mds.rst \ + man/8/ceph-mon.rst \ + man/8/ceph-osd.rst \ + man/8/ceph-post-file.rst \ + man/8/ceph-rbdnamer.rst \ + man/8/ceph-rest-api.rst \ + man/8/ceph.rst \ + man/8/ceph-run.rst \ + man/8/ceph-syn.rst \ + man/8/crushtool.rst \ + man/8/librados-config.rst \ + man/8/monmaptool.rst \ + man/8/mount.ceph.rst \ + man/8/osdmaptool.rst \ + man/8/radosgw-admin.rst \ + man/8/radosgw.rst \ + man/8/rados.rst \ + man/8/rbd-fuse.rst \ + man/8/rbd-replay-many.rst \ + man/8/rbd-replay-prep.rst \ + man/8/rbd-replay.rst \ + man/8/rbd.rst diff --git a/doc/_templates/smarttoc.html b/doc/_templates/smarttoc.html index 8967e7f886881..8ec60cdf454da 100644 --- a/doc/_templates/smarttoc.html +++ b/doc/_templates/smarttoc.html @@ -1,11 +1,11 @@ {# Sphinx sidebar template: smart table of contents. - Show a sidebar ToC that gives you a more global view of the - documentation, and not the confusing cur/prev/next that is the + Shows a sidebar ToC that gives you a more global view of the + documentation, and not the confusing cur/prev/next which is the default sidebar. - The ToC will open & collapse automatically to show the part of the + The ToC will open and collapse automatically to show the part of the hierarchy you are in. Top-level items will always be visible. 
#} 
diff --git a/doc/_themes/ceph/static/nature.css_t b/doc/_themes/ceph/static/nature.css_t index 1673a54539270..71fb56f4d918f 100644 --- a/doc/_themes/ceph/static/nature.css_t +++ b/doc/_themes/ceph/static/nature.css_t @@ -146,7 +146,7 @@ a { color: #F05C56; text-decoration: none; } - + a:hover { color: #F05C56; text-decoration: underline; @@ -196,7 +196,7 @@ div.admonition p.admonition-title + p { display: inline; } -div.highlight{ +div.highlight { background-color: white; } @@ -278,6 +278,10 @@ div.admonition.important { color: #fff; } +div.admonition.important a { + color: #E6E8E8; +} + div.tip tt.literal { background-color: #55aeba; color: #fff; @@ -298,3 +302,9 @@ dl.glossary dt { padding-top:20px; } + +p.breathe-sectiondef-title { + font-size: 1.3em; + font-weight: bold; + border-bottom: thin solid #5E6A71; +}
diff --git a/doc/architecture.rst b/doc/architecture.rst index 060676cbbb0c9..d9e65d1fadd44 100644 --- a/doc/architecture.rst +++ b/doc/architecture.rst @@ -1185,9 +1185,9 @@ Object Watch/Notify ------------------- A client can register a persistent interest with an object and keep a session to -the primary OSD open. The client can send a notification message and payload to +the primary OSD open. The client can send a notification message and a payload to all watchers and receive notification when the watchers receive the -notification. This enables a client to use any object a +notification. This enables a client to use any object as a synchronization/communication channel. @@ -1597,4 +1597,4 @@ instance for high availability. .. _Set Pool Values: ../rados/operations/pools#set-pool-values .. _Kerberos: http://en.wikipedia.org/wiki/Kerberos_(protocol) .. _Cephx Config Guide: ../rados/configuration/auth-config-ref -.. _User Management: ../rados/operations/user-management \ No newline at end of file +.. _User Management: ../rados/operations/user-management
diff --git a/doc/cephfs/disaster-recovery.rst b/doc/cephfs/disaster-recovery.rst new file mode 100644 index 0000000000000..8d6748d57428c --- /dev/null +++ b/doc/cephfs/disaster-recovery.rst @@ -0,0 +1,162 @@ + +Disaster recovery +================= + +.. danger:: + + The notes in this section are aimed at experts, making a best effort + to recover what they can from damaged filesystems. These steps + have the potential to make things worse as well as better. If you + are unsure, do not proceed. + + +Journal export +-------------- + +Before attempting dangerous operations, make a copy of the journal like so: + +:: + + cephfs-journal-tool journal export backup.bin + +Note that this command may not always work if the journal is badly corrupted, +in which case a RADOS-level copy should be made (http://tracker.ceph.com/issues/9902). + + +Dentry recovery from journal +---------------------------- + +If a journal is damaged or for any reason an MDS is incapable of replaying it, +attempt to recover what file metadata we can like so: + +:: + + cephfs-journal-tool event recover_dentries summary + +This command by default acts on MDS rank 0; pass ``--rank=<n>`` to operate on other ranks. + +This command will write any inodes/dentries recoverable from the journal +into the backing store, if these inodes/dentries are higher-versioned +than the previous contents of the backing store. If any regions of the journal +are missing/damaged, they will be skipped. + +Note that in addition to writing out dentries and inodes, this command will update +the InoTables of each 'in' MDS rank, to indicate that any written inodes' numbers +are now in use.
In simple cases, this will result in an entirely valid backing +store state. + +.. warning:: + + The resulting state of the backing store is not guaranteed to be self-consistent, + and an online MDS scrub will be required afterwards. The journal contents + will not be modified by this command; you should truncate the journal + separately after recovering what you can. + +Journal truncation +------------------ + +If the journal is corrupt or MDSs cannot replay it for any reason, you can +truncate it like so: + +:: + + cephfs-journal-tool journal reset + +.. warning:: + + Resetting the journal *will* lose metadata unless you have extracted + it by other means such as ``recover_dentries``. It is likely to leave + some orphaned objects in the data pool. It may result in re-allocation + of already-written inodes, such that permissions rules could be violated. + +MDS table wipes +--------------- + +After the journal has been reset, it may no longer be consistent with respect +to the contents of the MDS tables (InoTable, SessionMap, SnapServer). + +To reset the SessionMap (erase all sessions), use: + +:: + + cephfs-table-tool all reset session + +This command acts on the tables of all 'in' MDS ranks. Replace 'all' with an MDS +rank to operate on that rank only. + +The session table is the table most likely to need resetting, but if you know you +also need to reset the other tables then replace 'session' with 'snap' or 'inode'. + +MDS map reset +------------- + +Once the in-RADOS state of the filesystem (i.e. contents of the metadata pool) +is somewhat recovered, it may be necessary to update the MDS map to reflect +the contents of the metadata pool. Use the following command to reset the MDS +map to a single MDS: + +:: + + ceph fs reset <fs name> --yes-i-really-mean-it + +Once this is run, any in-RADOS state for MDS ranks other than 0 will be ignored; +as a result, data loss is possible. + +One might wonder what the difference is between 'fs reset' and 'fs remove; fs new'. The +key distinction is that doing a remove/new will leave rank 0 in 'creating' state, such +that it would overwrite any existing root inode on disk and orphan any existing files. In +contrast, the 'reset' command will leave rank 0 in 'active' state such that the next MDS +daemon to claim the rank will go ahead and use the existing in-RADOS metadata. + +Recovery from missing metadata objects +-------------------------------------- + +Depending on what objects are missing or corrupt, you may need to +run various commands to regenerate default versions of the +objects. + +:: + + # Session table + cephfs-table-tool 0 reset session + # SnapServer + cephfs-table-tool 0 reset snap + # InoTable + cephfs-table-tool 0 reset inode + # Journal + cephfs-journal-tool --rank=0 journal reset + # Root inodes ("/" and MDS directory) + cephfs-data-scan init + +Finally, you can regenerate metadata objects for missing files +and directories based on the contents of a data pool. This is +a two-phase process: first, *all* objects are scanned to calculate +size and mtime metadata for the inodes; second, the first +object of every file is scanned to collect this metadata and inject +it into the metadata pool. + +:: + + cephfs-data-scan scan_extents + cephfs-data-scan scan_inodes + +These commands may take a very long time if there are many +files or very large files in the data pool. To accelerate +the process, run multiple instances of the tool.
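One way to drive several instances is a small supervising wrapper; the following is only a sketch (the worker count ``N`` is illustrative, and the argument convention is inferred from the two-worker example that follows):

::

    # launch N scan_extents workers in parallel; the second argument mirrors
    # the two-worker example below (the highest worker number, N - 1)
    N=4
    M=$((N - 1))
    for i in $(seq 0 "$M"); do
        cephfs-data-scan scan_extents "$i" "$M" &
    done
    wait  # phase barrier: every scan_extents worker must finish before scan_inodes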
Decide on +a number of workers, and pass each worker a number within +the range 0-(N_workers - 1), like so: + +:: + + # Worker 0 + cephfs-data-scan scan_extents 0 1 + # Worker 1 + cephfs-data-scan scan_extents 1 1 + + # Worker 0 + cephfs-data-scan scan_inodes 0 1 + # Worker 1 + cephfs-data-scan scan_inodes 1 1 + +It is important to ensure that all workers have completed the +scan_extents phase before any workers enter the scan_inodes phase.
diff --git a/doc/cephfs/early-adopters.rst b/doc/cephfs/early-adopters.rst new file mode 100644 index 0000000000000..1047479f2581d --- /dev/null +++ b/doc/cephfs/early-adopters.rst @@ -0,0 +1,61 @@ + +CephFS for early adopters +========================= + +This page provides guidance for early adoption of CephFS by users +with an appetite for adventure. While work is ongoing to build the +scrubbing and disaster recovery tools needed to run CephFS in demanding +production environments, it is already useful for community members to +try CephFS and provide bug reports and feedback. + +Setup instructions +================== + +Please see the instructions at :doc:`/cephfs/index`. + +Most stable configuration +========================= + +For the best chance of a happy, healthy filesystem, use a **single active MDS** +and **do not use snapshots**. Both of these are the default: + +* Snapshots are disabled by default, unless they are enabled explicitly by + an administrator using the ``allow_new_snaps`` setting. +* Ceph will use a single active MDS unless an administrator explicitly sets + ``max_mds`` to a value greater than 1. Note that creating additional + MDS daemons (e.g. with ``ceph-deploy mds create``) is okay, as these will + by default simply become standbys. It is also fairly safe to enable + standby-replay mode. + +Which client? +============= + +The fuse client is the easiest way to get up-to-date code, while +the kernel client will often give better performance. + +The clients do not always provide equivalent functionality; for example, +the fuse client supports client-enforced quotas while the kernel client +does not. + +When encountering bugs or performance issues, it is often instructive to +try using the other client, in order to find out whether the bug was +client-specific or not (and then to let the developers know). + +Reporting issues +================ + +If you have identified a specific issue, please report it with as much +information as possible. Especially important information: + +* Ceph versions installed on client and server +* Whether you are using the kernel or fuse client +* If you are using the kernel client, what kernel version? +* How many clients are in play, doing what kind of workload? +* If a system is 'stuck', is that affecting all clients or just one? +* Any ceph health messages +* Any backtraces in the ceph logs from crashes + +If you are satisfied that you have found a bug, please file it on +http://tracker.ceph.com. For more general queries, please write +to the ceph-users mailing list. + 
diff --git a/doc/cephfs/eviction.rst b/doc/cephfs/eviction.rst new file mode 100644 index 0000000000000..bd201cebbf76b --- /dev/null +++ b/doc/cephfs/eviction.rst @@ -0,0 +1,119 @@ + +Ceph filesystem client eviction +=============================== + +When a filesystem client is unresponsive or otherwise misbehaving, it +may be necessary to forcibly terminate its access to the filesystem. This +process is called *eviction*.
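In outline, the whole procedure is only three commands; the sketch below uses placeholder values (the client address, OSD epoch, and session id come from the worked example in the sections that follow):

::

    # 1. blacklist the misbehaving client at the RADOS level
    ceph osd blacklist add 172.16.79.251:0/3271
    # 2. read the current OSD epoch and ask the MDS to barrier on it
    ceph osd dump | head -1        # prints e.g. 'epoch 12'
    ceph daemon mds.a osdmap barrier 12
    # 3. evict the client's MDS session
    ceph daemon mds.a session evict 4117

Each step is explained in detail below.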
+ +This process is somewhat thorough in order to protect against data inconsistency +resulting from misbehaving clients. + +OSD blacklisting +---------------- + +First, prevent the client from performing any more data operations by *blacklisting* +it at the RADOS level. You may be familiar with this concept as *fencing* in other +storage systems. + +Identify the client to evict from the MDS session list: + +:: + + # ceph daemon mds.a session ls + [ + { "id": 4117, + "num_leases": 0, + "num_caps": 1, + "state": "open", + "replay_requests": 0, + "reconnecting": false, + "inst": "client.4117 172.16.79.251:0\/3271", + "client_metadata": { "entity_id": "admin", + "hostname": "fedoravm.localdomain", + "mount_point": "\/home\/user\/mnt"}}] + +In this case the 'fedoravm' client has address ``172.16.79.251:0/3271``, so we blacklist +it as follows: + +:: + + # ceph osd blacklist add 172.16.79.251:0/3271 + blacklisting 172.16.79.251:0/3271 until 2014-12-09 13:09:56.569368 (3600 sec) + +OSD epoch barrier +----------------- + +While the evicted client is now marked as blacklisted in the central (mon) copy of the OSD +map, it is then necessary to ensure that this OSD map update has propagated to all daemons +involved in subsequent filesystem I/O. To do this, use the ``osdmap barrier`` MDS admin +socket command. + +First read the latest OSD epoch: + +:: + + # ceph osd dump + epoch 12 + fsid fd61ca96-53ff-4311-826c-f36b176d69ea + created 2014-12-09 12:03:38.595844 + modified 2014-12-09 12:09:56.619957 + ... + +In this case it is 12. Now request the MDS to barrier on this epoch: + +:: + + # ceph daemon mds.a osdmap barrier 12 + +MDS session eviction +-------------------- + +Finally, it is safe to evict the client's MDS session, such that any capabilities it held +may be issued to other clients. The ID here is the ``id`` attribute from the ``session ls`` +output: + +:: + + # ceph daemon mds.a session evict 4117 + +That's it! The client has now been evicted, and any resources it had locked will +now be available for other clients. + +Background: OSD epoch barrier +----------------------------- + +The purpose of the barrier is to ensure that when we hand out any +capabilities which might allow touching the same RADOS objects, the +clients receiving those capabilities have a sufficiently recent +OSD map, so that they do not race with cancelled operations (from ENOSPC) or +blacklisted clients (from evictions). + +More specifically, the cases where we set an epoch barrier are: + + * Client eviction (where the client is blacklisted and other clients + must wait for a post-blacklist epoch to touch the same objects) + * OSD map full flag handling in the client (where the client may + cancel some OSD ops from a pre-full epoch, so other clients must + wait until the full epoch or later before touching the same objects). + * MDS startup, because we don't persist the barrier epoch, so we must + assume that the latest OSD map is always required after a restart. + +Note that this is a global value for simplicity: we could maintain this on +a per-inode basis. We don't, because: + + * It would be more complicated + * It would use an extra 4 bytes of memory for every inode + * It would not be much more efficient, as almost always everyone has the latest + OSD map anyway; in most cases everyone will breeze through this barrier + rather than waiting. + * We only do this barrier in very rare cases, so any benefit from per-inode + granularity would only very rarely be seen.
+ +The epoch barrier is transmitted along with all capability messages, and +instructs the receiver of the message to avoid sending any more RADOS +operations to OSDs until it has seen this OSD epoch. This mainly applies +to clients (doing their data writes directly to files), but also applies +to the MDS because things like file size probing and file deletion are +done directly from the MDS. + 
diff --git a/doc/cephfs/file-layouts.rst b/doc/cephfs/file-layouts.rst index 82d2958f85a20..95be8ae3e5772 100644 --- a/doc/cephfs/file-layouts.rst +++ b/doc/cephfs/file-layouts.rst @@ -144,4 +144,18 @@ directories do not have layouts set: $ getfattr -n ceph.file.layout dir/childdir/grandchild # file: dir/childdir/grandchild ceph.file.layout="stripe_unit=4194304 stripe_count=4 object_size=4194304 pool=cephfs_data" + +Adding a data pool to the MDS +----------------------------- + +Before you can use a pool with CephFS, you have to add it to the Metadata Servers. + +.. code-block:: bash + + $ ceph mds add_data_pool cephfs_data_ssd + # Pool should now show up + $ ceph fs ls + .... data pools: [cephfs_data cephfs_data_ssd ] + +Make sure that your cephx keys allow the client to access this new pool.
diff --git a/doc/cephfs/full.rst b/doc/cephfs/full.rst new file mode 100644 index 0000000000000..a58b94c77bbf1 --- /dev/null +++ b/doc/cephfs/full.rst @@ -0,0 +1,60 @@ + +Handling a full Ceph filesystem +=============================== + +When a RADOS cluster reaches its ``mon_osd_full_ratio`` (default +95%) capacity, it is marked with the OSD full flag. This flag causes +most normal RADOS clients to pause all operations until it is resolved +(for example by adding more capacity to the cluster). + +The filesystem has some special handling of the full flag, explained below. + +Hammer and later +---------------- + +Since the hammer release, a full filesystem will lead to ENOSPC +results from: + + * Data writes on the client + * Metadata operations other than deletes and truncates + +Because the full condition may not be encountered until +data is flushed to disk (sometime after a ``write`` call has already +returned without an error), the ENOSPC error may not be seen until the application +calls ``fsync`` or ``fclose`` (or equivalent) on the file handle. + +Calling ``fsync`` is guaranteed to reliably indicate whether the data +made it to disk, and will return an error if it did not. ``fclose`` will +only return an error if buffered data happened to be flushed since +the last write -- a successful ``fclose`` does not guarantee that the +data made it to disk, and in a full-space situation, buffered data +may be discarded after an ``fclose`` if no space is available to persist it. + +.. warning:: + If an application appears to be misbehaving on a full filesystem, + check that it is performing ``fsync()`` calls as necessary to ensure + data is on disk before proceeding. + +Data writes may be cancelled by the client if they are in flight at the +time the OSD full flag is sent. Clients update the ``osd_epoch_barrier`` +when releasing capabilities on files affected by cancelled operations, in +order to ensure that these cancelled operations do not interfere with +subsequent access to the data objects by the MDS or other clients. For +more on the epoch barrier mechanism, see :doc:`eviction`.
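The deferred nature of the error can be seen from a client; a small demonstration (a sketch only: the mount point is illustrative and the exact error text varies with the dd version):

::

    # on a full filesystem the writes may appear to succeed; ENOSPC only
    # surfaces when the buffered data is flushed by conv=fsync
    dd if=/dev/zero of=/mnt/cephfs/bigfile bs=4M count=100 conv=fsync
    # dd: fsync failed for '/mnt/cephfs/bigfile': No space left on device  (illustrative)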
+ +Legacy (pre-hammer) behavior +---------------------------- + +In versions of Ceph earlier than hammer, the MDS would ignore +the full status of the RADOS cluster, and any data writes from +clients would stall until the cluster ceased to be full. + +There are two dangerous conditions to watch for with this behavior: + +* If a client had pending writes to a file, then it was not possible + for the client to release the file to the MDS for deletion: this could + lead to difficulty clearing space on a full filesystem. +* If clients continued to create a large number of empty files, the + resulting metadata writes from the MDS could lead to total exhaustion + of space on the OSDs such that no further deletions could be performed. + 
diff --git a/doc/cephfs/index.rst b/doc/cephfs/index.rst index 7821701f5e0bb..67e1468b64778 100644 --- a/doc/cephfs/index.rst +++ b/doc/cephfs/index.rst @@ -7,20 +7,24 @@ a Ceph Storage Cluster to store its data. The Ceph filesystem uses the same Ceph Storage Cluster system as Ceph Block Devices, Ceph Object Storage with its S3 and Swift APIs, or native bindings (librados). -.. important:: Ceph FS is currently not recommended for production data. +.. important:: CephFS currently lacks a robust 'fsck' check and + repair function. Please use caution when storing + important data as the disaster recovery tools are + still under development. For more information about + using CephFS today, see :doc:`/cephfs/early-adopters`. .. ditaa:: +-----------------------+ +------------------------+ - | CephFS Kernel Object | | CephFS FUSE | - +-----------------------+ +------------------------+ - - +---------------------------------------------------+ - | Ceph FS Library (libcephfs) | - +---------------------------------------------------+ - - +---------------------------------------------------+ - | Ceph Storage Cluster Protocol (librados) | - +---------------------------------------------------+ + | | | CephFS FUSE | + | | +------------------------+ + | | + | | +------------------------+ + | CephFS Kernel Object | | CephFS Library | + | | +------------------------+ + | | + | | +------------------------+ + | | | librados | + +-----------------------+ +------------------------+ +---------------+ +---------------+ +---------------+ | OSDs | | MDSs | | Monitors | @@ -51,7 +55,7 @@ least one :term:`Ceph Metadata Server` running. .. raw:: html
-	<h3>Step 2: Mount Ceph FS</h3>
+	<h3>Step 2: Mount CephFS</h3>
Once you have a healthy Ceph Storage Cluster with at least one Ceph Metadata Server, you may create and mount your Ceph Filesystem. @@ -61,10 +65,10 @@ authentication keyring. .. toctree:: :maxdepth: 1 - Create Ceph FS - Mount Ceph FS - Mount Ceph FS as FUSE - Mount Ceph FS in fstab + Create CephFS + Mount CephFS + Mount CephFS as FUSE + Mount CephFS in fstab Manpage cephfs <../../man/8/cephfs> Manpage ceph-fuse <../../man/8/ceph-fuse> Manpage mount.ceph <../../man/8/mount.ceph> @@ -77,11 +81,15 @@ authentication keyring. .. toctree:: :maxdepth: 1 + CephFS Quotas Using Ceph with Hadoop libcephfs <../../api/libcephfs-java/> cephfs-journal-tool File layouts + Client eviction + Handling full filesystems Troubleshooting + Disaster recovery .. raw:: html
diff --git a/doc/cephfs/quota.rst b/doc/cephfs/quota.rst new file mode 100644 index 0000000000000..e92f05b6e68c4 --- /dev/null +++ b/doc/cephfs/quota.rst @@ -0,0 +1,70 @@ +CephFS Quotas +============= + +CephFS allows quotas to be set on any directory in the system. The +quota can restrict the number of *bytes* or the number of *files* +stored beneath that point in the directory hierarchy. + +Limitations +----------- + +#. *Quotas are cooperative and non-adversarial.* CephFS quotas rely on + the cooperation of the client who is mounting the file system to + stop writers when a limit is reached. A modified or adversarial + client cannot be prevented from writing as much data as it needs. + Quotas should not be relied on to prevent filling the system in + environments where the clients are fully untrusted. + +#. *Quotas are imprecise.* Processes that are writing to the file + system will be stopped a short time after the quota limit is + reached. They will inevitably be allowed to write some amount of + data over the configured limit. How far over the quota they are + able to go depends primarily on the amount of time, not the amount + of data. Generally speaking, writers will be stopped within tens of + seconds of crossing the configured limit. + +#. *Quotas are not yet implemented in the kernel client.* Quotas are + supported by the userspace client (libcephfs, ceph-fuse) but are + not yet implemented in the Linux kernel client. + +#. *Quotas must be configured carefully when used with path-based + mount restrictions.* The client needs to have access to the + directory inode on which quotas are configured in order to enforce + them. If the client has restricted access to a specific path + (e.g., ``/home/user``) based on the MDS capability, and a quota is + configured on an ancestor directory they do not have access to + (e.g., ``/home``), the client will not enforce it. When using + path-based access restrictions, be sure to configure the quota on + the directory the client is restricted to (e.g., ``/home/user``) + or something nested beneath it. + +Configuration +------------- + +Like most other things in CephFS, quotas are configured using virtual +extended attributes: + + * ``ceph.quota.max_files`` -- file limit + * ``ceph.quota.max_bytes`` -- byte limit + +If the attributes appear on a directory inode, a quota is +configured there. If they are not present, then no quota is set on +that directory (although one may still be configured on a parent directory).
+ +To set a quota:: + + setfattr -n ceph.quota.max_bytes -v 100000000 /some/dir # 100 MB + setfattr -n ceph.quota.max_files -v 10000 /some/dir # 10,000 files + +To view quota settings:: + + getfattr -n ceph.quota.max_bytes /some/dir + getfattr -n ceph.quota.max_files /some/dir + +Note that a value of ``0`` for either extended attribute means that +the quota is not set. + +To remove a quota:: + + setfattr -n ceph.quota.max_bytes -v 0 /some/dir + setfattr -n ceph.quota.max_files -v 0 /some/dir
diff --git a/doc/changelog/v0.80.10.txt b/doc/changelog/v0.80.10.txt new file mode 100644 index 0000000000000..be82bcc9b6991 --- /dev/null +++ b/doc/changelog/v0.80.10.txt @@ -0,0 +1,3308 @@ +commit ea6c958c38df1216bf95c927f143d8b13c4a9e70 (tag: refs/tags/v0.80.10) +Author: Jenkins +Date: Thu Jun 18 09:48:56 2015 -0700 + + 0.80.10 + +commit 9b7f09e280b4610b9f85ac34ec014018f0e2e1d3 +Author: Sage Weil +Date: Wed Jun 17 09:35:28 2015 -0700 + + qa/workunits/rados/test-upgarde-v9.0.1: fix exclude syntax + + It's -, then a list of all exclusions separated by :. There are just 2. + + Signed-off-by: Sage Weil + (cherry picked from commit 78d894a634d727a9367f809a1f57234e5e6935be) + +commit 4e28fbc52db7d808aeaede884f342beafb7fc581 +Author: Sage Weil +Date: Tue Jun 16 21:05:29 2015 -0700 + + qa/workunits/rados/test-upgrade-v9.0.1: skip one more evict test + + Signed-off-by: Sage Weil + (cherry picked from commit 3e8d60a80ce31860eac76a1f6489a35e1795a0c0) + +commit 34ba3719997fed6d0b8148f21e59b446bdf42962 +Author: Josh Durgin +Date: Mon Jun 15 15:12:43 2015 -0700 + + qa: add compatibility filtered rados api tests for upgrades + + Post-9.0.1, the evict op returns success when an object doesn't exist + in the cache tier. Skip the tests that are incompatible across + versions. + + Fixes: #11548 + Signed-off-by: Josh Durgin + (cherry picked from commit 348a3d3c9880e7d022e71a2faafe51c8f771406e) + +commit d1f478200342f422ee6e563bf7aad54ba38b3ed3 +Merge: d0f9c5f 7f1abd9 +Author: Sage Weil +Date: Wed Jun 10 15:03:48 2015 -0700 + + Merge pull request #4924 from ceph/wip-11955-firefly + + 11955: ceph.spec.in: package mkcephfs on EL6 + + Reviewed-by: Sage Weil + +commit 7f1abd9053088ded6613aeca4e7c9489c44910e4 +Author: Ken Dreyer +Date: Wed Jun 10 15:43:41 2015 -0600 + + ceph.spec.in: package mkcephfs on EL6 + + Commit efbca0465c2946e113771966df08cf7cf37b1196 added mkcephfs to the + RPM %files listing, but this /usr/sbin path is only correct for CentOS + 7. In CentOS 6, the utility is present at /sbin/mkcephfs instead. This + causes rpmbuild to fail to build the tip of the firefly branch on EL6. + + Adjust the RPM %files list so we properly package mkcephfs on both EL7 + and EL6. + + http://tracker.ceph.com/issues/11955 Refs: #11955 + + Signed-off-by: Ken Dreyer + +commit d0f9c5f47024f53b4eccea2e0fde9b7844746362 +Merge: efbca04 9930138 +Author: Orit Wasserman +Date: Thu Jun 4 19:07:03 2015 +0200 + + Merge pull request #4851 from ceph/wip-10873-firefly + + rgw: generate the "Date" HTTP header for civetweb. + +commit 99301384be96997203682679c0430ca0a53be5d3 (refs/remotes/gh/wip-10873-firefly) +Author: Radoslaw Zarzynski +Date: Wed Feb 18 15:48:43 2015 +0100 + + rgw: generate the "Date" HTTP header for civetweb.
+ + Fixes: #10873 + Backport: hammer + Signed-off-by: Radoslaw Zarzynski + (cherry picked from commit ea384f83b601f60e135c3d3f960fdb75a919dd84) + +commit efbca0465c2946e113771966df08cf7cf37b1196 +Author: Sage Weil +Date: Wed Jun 3 13:03:50 2015 -0400 + + ceph.spec: add mkcephfs to ceph.rpm + + Signed-off-by: Sage Weil + +commit e785f1a2bb4cd77b0aaaccd78060183157ad842e +Author: Ken Dreyer +Date: Tue Dec 2 13:58:10 2014 -0700 + + packaging: package ceph-disk(8) + + The ceph-disk man page was added in + a450cab2b8148cb8a9b043d629feccf89e5aabac, but this was not added to the + RPM or DEB packaging. Add it here. + + Signed-off-by: Ken Dreyer + (cherry picked from commit b743a951114b00bbb6e14fb88f1928b504bc0f8b) + +commit 819cf6ddb986b37c5d1229c4ea330010e88bb615 +Merge: e93711a 665a857 +Author: Yehuda Sadeh +Date: Tue Jun 2 19:53:03 2015 +0300 + + Merge pull request #4780 from oritwas/wip-10295-firefly + + rgw: civetweb should use unique request id + + Reviewd-by: Yehuda Sadeh + +commit e93711a40d680230b03fe722a4da49a6f9a9b667 +Merge: 071c943 50e8579 +Author: Yehuda Sadeh +Date: Tue Jun 2 19:02:07 2015 +0300 + + Merge pull request #4829 from oritwas/wip-negative_content_length-firefly + + rgw: don't allow negative / invalid content length + + Reviewed-by: Yehuda Sadeh + +commit 50e85797507a3ba13193f368cff461c08e44a9b3 +Author: Yehuda Sadeh +Date: Fri Aug 1 16:15:36 2014 -0700 + + rgw: don't allow negative / invalid content length + + Certain frontends (e.g., civetweb) don't filter such requests. + + Signed-off-by: Yehuda Sadeh + (cherry picked from commit 0e74b7a1d56733358e2f1d3df4386125a94c2966) + +commit 071c94385ee71b86c5ed8363d56cf299da1aa7b3 +Merge: 68211f6 c632ef1 +Author: Sage Weil +Date: Wed May 27 15:36:15 2015 -0700 + + Merge pull request #4766 from SUSE/wip-11673-firefly + + Debian: ceph-test and rest-bench debug packages should require their respective binary packages + + Reviewed-by: Sage Weil + +commit 665a85701177230365e43a351d7722cb2adbde93 +Author: Orit Wasserman +Date: Thu Apr 23 17:36:47 2015 +0200 + + rgw: civetweb should use unique request id + + max_req_id was moved to RGWRados and changed to atomic64_t. + + The same request id resulted in gc giving the same idtag to all objects + resulting in a leakage of rados objects. It only kept the last deleted object in + it's queue, the previous objects were never freed. + + Fixes: 10295 + Backport: Hammer, Firefly + + Signed-off-by: Orit Wasserman + (cherry picked from commit c262259) + + Conflicts: + src/rgw/rgw_main.cc + src/rgw/rgw_rados.h + +commit c632ef13e05654d0e5ddc79fc7e9553cad8cbed0 +Author: Ken Dreyer +Date: Mon May 18 10:50:58 2015 -0600 + + debian: set rest-bench-dbg ceph-test-dbg dependencies + + Debian's debug packages ought to depend on their respective binary + packages. This was the case for many of our ceph packages, but it was + not the case for ceph-test-dbg or rest-bench-dbg. + + Add the dependencies on the relevant binary packages, pinned to + "= ${binary:Version}" per convention. 
+ + http://tracker.ceph.com/issues/11673 Fixes: #11673 + + Signed-off-by: Ken Dreyer + (cherry picked from commit f898ec1e4e3472b0202280f09653a769fc62c8d3) + +commit 68211f695941ee128eb9a7fd0d80b615c0ded6cf +Merge: 7d11b19 cd8f183 +Author: Loic Dachary +Date: Mon May 18 14:25:59 2015 +0200 + + Merge pull request #4697 from ceph/wip-11622-firefly + + Wip 11622 firefly + + Reviewed-by: Loic Dachary + +commit cd8f1830eb5d7ff75b17d7f0915ee4b3b834b149 (refs/remotes/gh/wip-11622-firefly) +Author: Yehuda Sadeh +Date: Wed May 13 17:05:22 2015 -0700 + + rgw: merge manifests correctly when there's prefix override + + Fixes: #11622 + Backport: hammer, firefly + + Prefix override happens in a manifest when a rados object does not + conform to the generic prefix set on the manifest. When merging + manifests (specifically being used in multipart objects upload), we need + to check if the rule that we try to merge has a prefix that is the same + as the previous rule. Beforehand we checked if both had the same + override_prefix setting, but that might not apply as both manifests + might have different prefixes. + + Signed-off-by: Yehuda Sadeh + (cherry picked from commit 389ae6739ddc6239a4dd7c5f7f9bfc9b645b8577) + +commit eef3d2f1c400573db90936fd417769183950b6ee +Author: Yehuda Sadeh +Date: Tue May 12 16:40:10 2015 -0700 + + rgw: restore buffer of multipart upload after EEXIST + + Fixes #11604 + Backport: hammer, firefly + + When we need to restart a write of part data, we need to revert to + buffer to before the write, otherwise we're going to skip some data. + + Signed-off-by: Yehuda Sadeh + (cherry picked from commit 580ccaec12daae64c38a1616d0be907bdd70a888) + +commit 7d11b19003503a9db1572d01f7a170e35b29017d +Merge: 114f2e9 9b33965 +Author: Yehuda Sadeh +Date: Fri May 15 10:27:42 2015 -0700 + + Merge pull request #4414 from xinxinsh/wip-11125-firefly + + rgw: keystone token cache does not work correctly + +commit 114f2e9bb5665760a5be9816785508f1c97662d5 +Merge: ac7d28a f33effc +Author: Loic Dachary +Date: Fri May 15 17:03:04 2015 +0200 + + Merge pull request #4415 from xinxinsh/wip-11244-firefly + + cancel_pull: requeue waiters + + Reviewed-by: Samuel Just + +commit ac7d28a6977084da0b70a3f2d0a54c8fa55b16fa +Merge: f273792 1f6b1bb +Author: Loic Dachary +Date: Fri May 15 17:02:27 2015 +0200 + + Merge pull request #4416 from xinxinsh/wip-10976-firefly + + fix PG::all_unfound_are_queried_or_lost for non-existent osds + + Reviewed-by: Samuel Just + +commit f273792c675aca95694cd36794d8a854731bf308 +Merge: 8a6632b 019b28b +Author: Loic Dachary +Date: Fri May 15 17:02:05 2015 +0200 + + Merge pull request #4556 from xinxinsh/wip-11429-firefly + + OSD::load_pgs: we need to handle the case where an upgrade from earlier versions which ignored non-existent pgs resurrects a pg with a prehistoric osdmap + + Reviewed-by: Samuel Just + +commit 8a6632b4e5ffad305f92c89656b161a521529e45 +Merge: 156c385 a71f309 +Author: Sage Weil +Date: Thu May 14 09:50:28 2015 -0700 + + Merge pull request #4638 from SUSE/wip-11453-firefly + + run RGW as root + + Reviewed-by: Ken Dreyer + Reviewed-by: Sage Weil + +commit 019b28b6397dbc3111faa6a8eb57349a48f408d8 +Author: Samuel Just +Date: Mon Apr 20 23:45:57 2015 -0700 + + OSD: handle the case where we resurrected an old, deleted pg + + Prior to giant, we would skip pgs in load_pgs which were not present in + the current osdmap. 
Those pgs would eventually refer to very old + osdmaps, which we no longer have causing the assertion failure in 11429 + once the osd is finally upgraded to a version which does not skip the + pgs. Instead, if we do not have the map for the pg epoch, complain to + the osd log and skip the pg. + + Fixes: 11429 + Signed-off-by: Samuel Just + (cherry picked from commit fbfd50de5b9b40d71d2e768418a8eca28b1afaca) + + Conflicts: + src/osd/OSD.cc + resolved by add a new comment line + +commit 156c385ef726e56d219c3383a4015c50aec2758a +Merge: fcd0ea3 8cc4bc1 +Author: Loic Dachary +Date: Tue May 12 13:57:36 2015 +0200 + + Merge pull request #4379 from ceph/wip-11416 + + rgw: use correct objv_tracker for bucket instance + + Reviewed-by: Josh Durgin + Reviewed-by: Loic Dachary + +commit a71f3091a4ea810c02517642fb4ab9ce5516b452 +Author: Ken Dreyer +Date: Wed Apr 22 16:36:42 2015 -0600 + + init-radosgw: run RGW as root + + The ceph-radosgw service fails to start if the httpd package is not + installed. This is because the init.d file attempts to start the RGW + process with the "apache" UID. If a user is running civetweb, there is + no reason for the httpd or apache2 package to be present on the system. + + Switch the init scripts to use "root" as is done on Ubuntu. + + http://tracker.ceph.com/issues/11453 Refs: #11453 + + Reported-by: Vickey Singh + Signed-off-by: Ken Dreyer + (cherry picked from commit 47339c5ac352d305e68a58f3d744c3ce0fd3a2ac) + +commit fcd0ea3383582e7a1dc8091e7a48e1d4bbaa76ee +Author: Greg Farnum +Date: Fri May 8 11:29:44 2015 -0700 + + workunits: remove defunct cephfs set_layout test + + Signed-off-by: Greg Farnum + +commit df053b86a89edf8f390400dad8c5e654e14df327 +Merge: aef0272 62645d3 +Author: Yehuda Sadeh +Date: Wed May 6 13:07:11 2015 -0700 + + Merge pull request #4571 from ceph/wip-11256-firefly + + Wip 11256 firefly + + Reviewed-by: Yehuda Sadeh + +commit aef0272d72afaef849b5d4acbf55626033369ee8 +Author: Noah Watkins +Date: Fri Mar 27 19:34:12 2015 -0700 + + java: libcephfs_jni.so is in /usr/lib64 on rhel + + Signed-off-by: Noah Watkins + (cherry picked from commit aed3434dc7c5161c72c7d5655faa3bc693fc9777) + +commit a551a23d36e3f30ff5b0679a98ee760166ae47ae +Author: Yan, Zheng +Date: Mon Mar 2 21:04:25 2015 +0800 + + qa/workunits/fs/misc: fix filelock_interrupt.py + + Handle the case that kernel does not support fcntl.F_OFD_SETLK. + Also fix the code that checks if fnctl fails with errno == EINTR. 
+ + Signed-off-by: Yan, Zheng + (cherry picked from commit 4ececa3dc4a21b98f61a592da9e2be60a0d71625) + Reviewed-by: Greg Farnum + +commit 07031b151b4ac6677b1f663ac200d09088deff64 +Merge: 83f8d43 3fb97e2 +Author: Loic Dachary +Date: Thu Apr 30 00:32:21 2015 +0200 + + Merge pull request #4385 from xinxinsh/wip-11199-firefly + + osd: ENOENT on clone + + Reviewed-by: Samuel Just + +commit 83f8d434a5c245711922ea43a962160177aa5a40 +Merge: f1425e0 29bc9e5 +Author: Loic Dachary +Date: Thu Apr 30 00:31:48 2015 +0200 + + Merge pull request #4384 from xinxinsh/wip-11197-firefly + + pg stuck stale after create with activation delay + + Reviewed-by: Samuel Just + +commit f1425e093e00343a5ae4b9ff56911bf654a5c152 +Merge: e980e00 69d9339 +Author: Loic Dachary +Date: Thu Apr 30 00:31:32 2015 +0200 + + Merge pull request #4382 from xinxinsh/wip-10718-firefly + + osd/PGLog.h: 279: FAILED assert(log.log.size() == log_keys_debug.size()) + + Reviewed-by: Samuel Just + +commit e980e0036909afc392f9bdf5532ce500af602c2f +Merge: b36229b 110c354 +Author: Loic Dachary +Date: Thu Apr 30 00:23:04 2015 +0200 + + Merge pull request #4185 from ldachary/wip-11156-firefly + + FAILED assert(soid < scrubber.start || soid >= scrubber.end) + + Reviewed-by: Samuel Just + +commit b36229b5aeec669cd1494f47120ae207a393846f +Merge: da75978 20c2175 +Author: Sage Weil +Date: Tue Apr 28 08:11:18 2015 -0700 + + Merge pull request #4475 from ceph/wip-9538.firefly + + mon: OSDMonitor: fallback to json-pretty in case of invalid formatter + + Reviewed-by: Loic Dachary + +commit 20c2175352c9d4e4d0e709d294fe6fe0695761cc +Author: Loic Dachary +Date: Fri Sep 19 15:28:36 2014 +0200 + + mon: osd find / metadata --format plain fallback + + ceph --format plain osd find 1 (and metadata) are not implemented and + must fallback to the default (json-pretty). + + http://tracker.ceph.com/issues/9538 Fixes: #9538 + + Signed-off-by: Loic Dachary + (cherry picked from commit 13780d755115387591888f94ea6c58ac0db3ecc4) + +commit da759784ebda11275106c3b280f1d32b64ade00a +Merge: 1a10a7e e1d5773 +Author: David Zafman +Date: Mon Apr 27 10:30:49 2015 -0700 + + Merge pull request #4453 from ceph/wip-11454 + + PG::actingset should be used when checking the number of acting OSDs for... 
+ + Reviewed-by: Sage Weil + +commit 1a10a7eac190249a41b5200f0cc5e3863e76c31d +Merge: a860e2b b9da6f1 +Author: Loic Dachary +Date: Mon Apr 27 10:14:00 2015 +0200 + + Merge pull request #4245 from ceph/wip-11113-firefly + + librbd: snap_remove should ignore -ENOENT errors + + Reviewed-by: Shu, Xinxin + Reviewed-by: Loic Dachary + +commit a860e2b3c02ef4f1402b54b08a6bacd4b02cc07f +Merge: dd15e54 364563a +Author: Loic Dachary +Date: Mon Apr 27 10:12:40 2015 +0200 + + Merge pull request #4206 from ceph/wip-5488-firefly + + librbd: acquire cache_lock before refreshing parent + + Reviewed-by: Josh Durgin + +commit dd15e54b2fae134126e9795a8a973e391c628eb7 +Merge: 51ff2b6 5404fbf +Author: Loic Dachary +Date: Mon Apr 27 10:09:23 2015 +0200 + + Merge pull request #3963 from dachary/wip-10153-firefly + + Rados.shutdown() dies with Illegal instruction (core dumped) + + Reviewed-by: Yehuda Sadeh + +commit 51ff2b60dc576fb444bcfea8f6cd3e12043ed5b8 +Merge: 12143ff c0cfd6e +Author: Loic Dachary +Date: Mon Apr 27 10:08:15 2015 +0200 + + Merge pull request #4383 from xinxinsh/wip-11144-firefly + + erasure-code-profile set races with erasure-code-profile rm + + Reviewed-by: Loic Dachary + +commit 1f6b1bb51d0b40899155313165ac5e3a21dde11a +Author: Mykola Golub +Date: Tue Mar 3 08:45:58 2015 +0200 + + osd: fix PG::all_unfound_are_queried_or_lost for non-existent osds + + A common mistake upon osd loss is to remove the osd from the crush map + before marking the osd lost. This tends to make it so that the user + can no longer mark the osd lost to satisfy all_unfound_are_queried_or_lost. + + The simple solution is for all_unfound_are_queried_or_lost to ignore + the osd if it does not exist. + + Fixes: #10976 + Backports: firefly,giant + + Signed-off-by: Mykola Golub + (cherry picked from commit 5bb51320138ff714806d24312149f6275d546608) + +commit e1d57730ddfc92b17562080b10bc22fceb86539a +Author: Guang Yang +Date: Mon Sep 29 08:21:10 2014 +0000 + + PG::actingset should be used when checking the number of acting OSDs for a given PG. + Signed-off-by: Guang Yang + + (cherry picked from commit 19be358322be48fafa17b28054619a8b5e7d403b) + + Conflicts: + src/osd/PG.cc PG::get_backfill_priority() doesn't exist in firefly + Variation in code related to no "undersized" state in firefly + + Fixes: #11454 + +commit f33effccc0592fddfcd9c5c2f5c3385462aa9b84 +Author: Samuel Just +Date: Thu Mar 26 10:26:48 2015 -0700 + + ReplicatedPG::cancel_pull: requeue waiters as well + + If we are in recovery_wait, we might not recover that object as part of + recover_primary for some time. Worse, if we are waiting on a backfill + which is blocked waiting on a copy_from on the missing object in + question, it can become a deadlock. + + Fixes: 11244 + Backport: firefly + Signed-off-by: Samuel Just + (cherry picked from commit 0b2e272430dd7433e6763be99b8a4cb127d9be19) + +commit 9b3396516c00ef931bab2d4aa6288ad974ec579d +Author: Yehuda Sadeh +Date: Tue Mar 3 11:03:35 2015 -0800 + + rgw: update keystone cache with token info + + Fixes: #11125 + Backport: hammer, firefly + + Signed-off-by: Yehuda Sadeh + (cherry picked from commit 6616294aa140ceb83cc61c6ab6f9947636f5e67d) + +commit 8cc4bc162c8c1c240e62840d968b967f5f47b682 (refs/remotes/gh/wip-11416) +Author: Yehuda Sadeh +Date: Wed Dec 17 17:12:43 2014 -0800 + + rgw: use correct objv_tracker for bucket instance + + When trying to create a bucket that already existed, use the + objv_tracker of the newly created instance, and not of the original + bucket.
+ + Signed-off-by: Yehuda Sadeh + (cherry picked from commit fe158ecc25feefcea8aea4133118e4a84900a8ec) + +commit c0cfd6e56ca9d17241da159295bcee7cf44c9ba3 +Author: Loic Dachary +Date: Wed Mar 18 14:17:00 2015 +0100 + + osd: erasure-code-profile incremental rm before set + + It is possible for an incremental change to have both a rm and a set for + a given erasure code profile. It only happens when a rm is followed by a + set. When a set is followed by a rm, the rm will remove the pending set + in the incremental change. + + The logic is the same for pool create and pool delete. + + We must apply the incremental erasure-code-profile removal before the + creation otherwise rm and set in the same proposal will ignore the set. + + This fix is minimal. A better change would be that erasure-code-profile + set checks if there is a pending removal and wait_for_finished_proposal + before creating. + + http://tracker.ceph.com/issues/11144 Fixes: #11144 + + Signed-off-by: Loic Dachary + (cherry picked from commit 0d52aca0d0c302983d03b0f5213ffed187e4ed63) + + Conflicts: + src/osd/OSDMap.cc + resolved by replacing i++ with ++i + +commit 3fb97e25b194e92112077385b10381801e02ddb9 +Author: Samuel Just +Date: Tue Mar 24 15:14:34 2015 -0700 + + ReplicatedPG: trim backfill intervals based on peer's last_backfill_started + + Otherwise, we fail to trim the peer's last_backfill_started and get bug 11199. + + 1) osd 4 backfills up to 31bccdb2/mira01213209-286/head (henceforth: foo) + + 2) Interval change happens + + 3) osd 0 now finds itself backfilling to 4 (lb=foo) and osd.5 + (lb=b6670ba2/mira01213209-160/snapdir//1, henceforth: bar) + + 4) recover_backfill causes both 4 and 5 to scan forward, so 4 has an interval + starting at foo, 5 has an interval starting at bar. + + 5) Once those have come back, recover_backfill attempts to trim off the + last_backfill_started, but 4's interval starts after that, so foo remains in + osd 4's interval (this is the bug) + + 7) We serve a copyfrom on foo (sent to 4 as well). + + 8) We eventually get to foo in the backfilling. Normally, they would have the + same version, but of course we don't update osd.4's interval from the log since + it should not have received writes in that interval. Thus, we end up trying to + recover foo on osd.4 anyway. + + 9) But, an interval change happens between removing foo from osd.4 and + completing the recovery, leaving osd.4 without foo, but with lb >= foo + + Fixes: #11199 + Backport: firefly + Signed-off-by: Samuel Just + (cherry picked from commit 1388d6bd949a18e8ac0aecb0eb79ffb93d316879) + +commit 29bc9e56a978d4e18a507e71858baa21037964c4 +Author: Samuel Just +Date: Tue Mar 24 10:48:02 2015 -0700 + + PG: set/clear CREATING in Primary state entry/exit + + Previously, we did not actually set it when we got a pg creation message from + the mon. It would actually get set on the first start_peering_interval after + that point. If we don't get that far, but do send a stat update to the mon, we + can end up with 11197. Instead, let's just set it and clear it upon entry into + and exit from the Primary state. 
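Conceptually, the CREATING change above moves the flag into the state's entry and exit hooks so it can never go stale. A loose Python sketch (State, state_set and state_clear are stand-ins for the C++ boost::statechart machinery, and the guard on the set is elided):

    class State(object):
        pass

    class Primary(State):
        def on_enter(self, pg):
            # entering Primary: mark the pg creating (guard elided)
            pg.state_set('CREATING')

        def on_exit(self, pg):
            # leaving Primary for any reason clears it, so the flag no
            # longer depends on reaching start_peering_interval first
            pg.state_clear('CREATING')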
+ + Fixes: 11197 + Signed-off-by: Samuel Just + (cherry picked from commit ddf0292250dfb1040d3cad96fa2cf886fd65933c) + +commit 12143ff9b25fdd96f8d1a9cecb1329c7f354d414 +Merge: 53eff01 bf1d8e8 +Author: Yehuda Sadeh +Date: Mon Apr 6 10:24:06 2015 -0700 + + Merge pull request #4275 from jeanchlopez/wip-11160-firefly + + rgw: shouldn't need to disable rgw_socket_path if frontend is configured + +commit bf1d8e888439df75feadde76bafe7d07d0e8481a +Author: Yehuda Sadeh +Date: Wed Mar 18 20:49:13 2015 -0700 + + rgw: don't use rgw_socket_path if frontend is configured + + Fixes: #11160 + Backport: hammer, firefly + + Previously if we wanted to use the tcp fcgi socket, we needed to clear + rgw_socket_path. + + Signed-off-by: Yehuda Sadeh + +commit 53eff01f1e7ecd3783bb05c47588a134d4fededc +Merge: 899738e 80afc5e +Author: Josh Durgin +Date: Wed Apr 1 16:30:23 2015 -0700 + + Merge pull request #4247 from ceph/wip-11303 + + Fix do_autogen.sh so that -L is allowed + + Reviewed-by: Josh Durgin + +commit 80afc5eca293e5e2f168f219931a2f554040cdd8 +Author: Alfredo Deza +Date: Wed Apr 1 19:10:33 2015 -0400 + + Fix do_autogen.sh so that -L is allowed + + Signed-off-by: Alfredo Deza + +commit b9da6f1025683d240baa73a9fcccbf2780035231 +Author: Jason Dillaman +Date: Mon Mar 16 18:40:49 2015 -0400 + + librbd: snap_remove should ignore -ENOENT errors + + If the attempt to deregister the snapshot from the parent + image fails with -ENOENT, ignore the error as it is safe + to assume that the child is not associated with the parent. + + Fixes: #11113 + Signed-off-by: Jason Dillaman + (cherry picked from commit cf8094942ccdba831e03e5a79451cfa5d78a135f) + +commit b250b3cfae69f4e8354027fae26fd85a792da0df +Author: Jason Dillaman +Date: Mon Mar 16 18:35:07 2015 -0400 + + librbd: get_parent_info should protect against invalid parent + + get_parent_info should return -ENOENT if the image does not + have an associated parent image. + + Signed-off-by: Jason Dillaman + (cherry picked from commit 21afd0ef8e1fb81c5ace5fd403513c542e2413e3) + +commit 69d9339fb352342a87e1148e9b5161246b27776a +Author: Samuel Just +Date: Mon Feb 2 09:07:27 2015 -0800 + + PGLog: improve PGLog::check() debugging + + Related: 10718 + Signed-off-by: Samuel Just + (cherry picked from commit c656bcec2441c90b084ca50a17c37625d69942a1) + +commit 73cbb6f34a5a556dbbd4db2e6f81391d83447efd +Author: Samuel Just +Date: Tue Mar 17 10:07:03 2015 -0700 + + PGLog::merge_log: in tail extend case, log.log might be empty + + Even if the tail != last_update, the log might be empty due to split + moving all entries into other logs. + + Fixes: 10718 + Signed-off-by: Samuel Just + (cherry picked from commit f5a2aef3db29b99634250fd388a0c2d9b9a23d38) + +commit 62645d30b0ceee075a56bc692fd65cee85b09aa5 (refs/remotes/gh/wip-11256-firefly) +Author: Yehuda Sadeh +Date: Mon Mar 30 17:34:57 2015 -0700 + + cls_rgw: use multimap to keep pending operations in bucket index + + Fixes: #11256 + Multiple concurrent requests might be sent using the same tag, need the + entry map to be able to hold multiple entries. + + Signed-off-by: Yehuda Sadeh + +commit 7538319dd7aa80a3318c108d345dee8044cf20a8 +Author: Yehuda Sadeh +Date: Fri Mar 27 16:32:48 2015 -0700 + + rgw: generate new tag for object when setting object attrs + + Fixes: #11256 + Backport: firefly, hammer + + Beforehand we were reusing the object's tag, which is problematic as + this tag is used for bucket index updates, and we might be clobbering a + racing update (like object removal). 
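Both wip-11256 commits above revolve around pending bucket-index operations keyed by tag; the cls_rgw change amounts to letting one tag hold several entries. A Python stand-in for the C++ std::multimap (names are illustrative):

    from collections import defaultdict

    # one list per tag instead of a single slot: two concurrent requests
    # that happen to share a tag no longer clobber each other's entry
    pending_ops = defaultdict(list)

    def start_op(tag, op):
        pending_ops[tag].append(op)

    def complete_op(tag):
        ops = pending_ops.get(tag)
        if ops:
            ops.pop(0)   # retire one entry, leave any racer's entry intact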
+ + Signed-off-by: Yehuda Sadeh + +commit 364563aac979fdf5ccbb6c588051d097a26bc594 +Author: Jason Dillaman +Date: Mon Mar 16 11:04:22 2015 -0400 + + librbd: acquire cache_lock before refreshing parent + + cache_lock needs to be acquired before snap_lock to avoid + the potential for deadlock. + + Fixes: #5488 + Signed-off-by: Jason Dillaman + +commit 110c35498942ea0feec395b6e7992f802dd740ce +Author: Samuel Just +Date: Fri Mar 20 15:28:15 2015 -0700 + + ReplicatedPG::promote_object: check scrubber and block if necessary + + Otherwise, we might attempt to promote into an in-progress scrub + interval, causing 11156. I would have added a return value to + promote_object(), but could not find an existing user which + cared to distinguish the cases, even with a null op passed. + All existing users are in maybe_handle_cache. The ones which + pass a null op are for promoting the object in parallel + with a proxy -- a case where not actually performing the promote + does not really matter. + + Fixes: #11156 + Signed-off-by: Samuel Just + (cherry picked from commit 65bb4df599541cd2e0f195b905f24f529e255c00) + +commit 899738e10e82b50dcf7dfffe5cc83937179bf323 +Merge: 15acfe7 9437cb1 +Author: Loic Dachary +Date: Mon Mar 23 20:38:43 2015 +0100 + + Merge pull request #4126 from dzafman/wip-11176-firefly + + ceph-objectstore-tool: Output only unsupported features when incompatible + + Reviewed-by: Loic Dachary + +commit 15acfe7f226ca7bc2c942d5fbcd3a40bd41e5930 +Merge: 6c95cd2 cddbff9 +Author: Loic Dachary +Date: Sat Mar 21 11:38:28 2015 +0100 + + Merge pull request #4079 from dachary/wip-11157-firefly + + doc,tests: force checkout of submodules + + Reviewed-by: David Zafman + +commit 6c95cd24f6f4a1a933f8799754831e7a5c0a725d +Merge: ea79a3b 43053fc +Author: Yuri Weinstein +Date: Fri Mar 20 21:06:17 2015 -0700 + + Merge pull request #4129 from dzafman/wip-11139-firefly + + ceph-objectstore-tool: Use exit status 11 for incompatible import attemp... + +commit 43053fcd8969c406969fef67613aa37ad1cc86bf +Author: David Zafman +Date: Fri Mar 20 17:48:01 2015 -0700 + + ceph-objectstore-tool: Use exit status 11 for incompatible import attempt + + This is used so upgrade testing doesn't generate a false failure.
+ Fixes: #11139 + + Signed-off-by: David Zafman + (cherry picked from commit 175aff8afe8215547ab57f8d8017ce8fdc0ff543) + +commit 9437cb1b284ec9366b51d660396c2c8a9366b31f +Author: David Zafman +Date: Fri Mar 20 16:57:40 2015 -0700 + + ceph-objectstore-tool: Output only unsupported features when incompatible + + Fixes: #11176 + Backport: firefly, giant + + Signed-off-by: David Zafman + (cherry picked from commit 5b23f5b5892b36fb7d06efc0d77e64a24ef6e8c9) + +commit ea79a3b122917ddf5dc1972bb9dc5b20f78e2f18 +Merge: c1c20d8 07fc9f6 +Author: Loic Dachary +Date: Fri Mar 20 08:44:39 2015 +0100 + + Merge pull request #3970 from ceph/firefly-11053 + + mds: fix assertion caused by system clock backwards + + Reviewed-by: John Spray + +commit c1c20d89d511499d678fdba0667581e88b9b5d95 +Merge: 2fbb9a6 63b39db +Author: Loic Dachary +Date: Fri Mar 20 08:42:58 2015 +0100 + + Merge pull request #4021 from ceph/wip-7737-firefly + + osd: fix negative degraded objects during backfilling + + Reviewed-by: Sage Weil + +commit 2fbb9a63a82ac5136b033799aec303fc151b25e5 +Merge: 3160e8d 2c7eda6 +Author: Loic Dachary +Date: Fri Mar 20 08:38:26 2015 +0100 + + Merge pull request #3952 from dachary/wip-9986-firefly + + objecter: fix map skipping + + Reviewed-by: Sage Weil + +commit 3160e8d6918a04d1fec9fdccbc30bf007c7940c6 +Author: Yehuda Sadeh +Date: Wed Mar 18 20:55:24 2015 -0700 + + init-radosgw*: don't require rgw_socket_path to be defined + + Fixes: #11159 + Backport: hammer, firefly + + Scripts required rgw_socket_path to exist in order to start radosgw. + This is not needed. + + Reported-by: Dan Mick + Signed-off-by: Yehuda Sadeh + (cherry picked from commit 6823bcdcd0ce72cd223e809291f46d82da76115c) + +commit cddbff9a3653d6257d13e0ac411cd6e4cd71feef +Author: Loic Dachary +Date: Thu Mar 19 00:32:39 2015 +0100 + + doc,tests: force checkout of submodules + + When updating submodules, always check out even if the HEAD is the + desired commit hash (update --force) to avoid the following: + + * a directory gmock exists in hammer + * a submodule gmock replaces the directory gmock in master + * checkout master + submodule update : gmock/.git is created + * checkout hammer : the gmock directory still contains the .git from + master because it did not exist at the time and checkout won't + remove untracked directories + * checkout master + submodule update : git rev-parse HEAD is + at the desired commit although the content of the gmock directory + is from hammer + + http://tracker.ceph.com/issues/11157 Fixes: #11157 + + Signed-off-by: Loic Dachary + +commit 63b39dbd529936e60d0fd08dffd35f82b3d1729c (refs/remotes/gh/wip-7737-firefly) +Author: Guang Yang +Date: Thu Feb 26 08:13:12 2015 +0000 + + osd: fix negative degraded objects during backfilling + + When there are delete requests during backfilling, the reported number of degraded + objects could be negative, as the primary's num_objects is the latest (locally) but + the numbers for the replicas might not reflect the deletions. A simple fix is to ignore + the negative subtracted value.
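The clamp is the whole fix; as a sketch (a hypothetical helper, assuming per-replica object counts are available):

    def degraded_for_replica(primary_num_objects, replica_num_objects):
        # the primary already counts deletes a replica may not have seen,
        # so the difference can dip below zero during backfill: clamp it
        return max(0, primary_num_objects - replica_num_objects)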
+ + Signed-off-by: Guang Yang + (cherry picked from commit 14d7e36d3c978844da73d0e1c8a3a1ec863bac15) + + Conflicts: + src/osd/PG.cc + +commit 493d285508914769cba3639b601ae6c20303af0d +Merge: 9839726 8a25a51 +Author: Loic Dachary +Date: Mon Mar 16 23:20:36 2015 +0100 + + Merge pull request #3966 from theanalyst/wip-10698-backport + + rgw: fail s3 POST auth if keystone not configured + + Reviewed-by: Yehuda Sadeh + +commit 9839726f009a3189013a23e8226d2f7618f1d56b +Merge: dab85cb 651dc55 +Author: Loic Dachary +Date: Mon Mar 16 23:19:22 2015 +0100 + + Merge pull request #3934 from dachary/wip-10665-firefly + + rgw: Swift API. Dump container's custom metadata. + + Reviewed-by: Yehuda Sadeh + +commit dab85cb7fcc5770fe4d6f075cf583a33ea335aa3 +Merge: 0e55046 d0fd417 +Author: Loic Dachary +Date: Mon Mar 16 23:18:59 2015 +0100 + + Merge pull request #3929 from dachary/wip-10475-firefly + + rgw: Swift API. Support for X-Remove-Container-Meta-{key} header. + + Reviewed-by: Yehuda Sadeh + +commit 0e550461666d6e3d32981f99c8327931a6bd6d62 +Merge: 1ca6dd9 963439f +Author: Loic Dachary +Date: Mon Mar 16 23:18:24 2015 +0100 + + Merge pull request #3938 from dachary/wip-10770-firefly + + rgw: send appropriate op to cancel bucket index pending operation + + Reviewed-by: Yehuda Sadeh + +commit 1ca6dd9676014a81983bd397e3154bf53243c7f2 +Merge: 66327d6 34d19e1 +Author: Loic Dachary +Date: Mon Mar 16 23:17:56 2015 +0100 + + Merge pull request #3961 from dachary/wip-10106-firefly + + rgw: flush xml header on get acl request + + Reviewed-by: Yehuda Sadeh + +commit 66327d69a9bfa875d58b063c180a11b2769a093e +Merge: de45d9e 9074eb7 +Author: Loic Dachary +Date: Mon Mar 16 23:16:53 2015 +0100 + + Merge pull request #3958 from dachary/wip-10062-firefly + + rgw: check for timestamp for s3 keystone auth + + + Reviewed-by: Yehuda Sadeh + Reviewed-by: Abhishek Lekshmanan + Reviewed-by: Valery Tschopp + +commit de45d9e2606d8fb6ea5533521163669e748e7d01 +Merge: 308f758 8ef14fc +Author: Loic Dachary +Date: Mon Mar 16 17:00:27 2015 +0100 + + Merge pull request #3942 from dachary/wip-10844-firefly + + mon: MonCap: take EntityName instead when expanding profiles + + Reviewed-by: Joao Eduardo Luis + +commit 308f758f49d28c012f3ba765519144e5ea5794e6 +Merge: c59a081 ca42905 +Author: Loic Dachary +Date: Mon Mar 16 16:53:48 2015 +0100 + + Merge pull request #3936 from dachary/wip-10724-firefly + + backport ceph-disk man page to Firefly + + Reviewed-by: Loic Dachary + +commit c59a08135c35072fa05104e26b500553e800cdd1 +Merge: 331acae 8ec8f11 +Author: Loic Dachary +Date: Mon Mar 16 16:50:39 2015 +0100 + + Merge pull request #3940 from dachary/wip-8753-firefly + + ReplicatedPG::on_change: clean up callbacks_for_degraded_object + + Reviewed-by: Samuel Just + +commit 331acae4d6b34bb047c914945a2d9c0d9c7b9562 +Merge: fc364a7 4e32ff2 +Author: Loic Dachary +Date: Mon Mar 16 16:49:56 2015 +0100 + + Merge pull request #3827 from dachary/wip-10259-firefly + + osd_types: op_queue_age_hist and fs_perf_stat should be in osd_stat_t::o... + + Reviewed-by: Samuel Just + +commit fc364a76f85eb5016ea460e5fd9c1603df374bcf +Merge: 31d99d2 b0d0d44 +Author: Loic Dachary +Date: Mon Mar 16 16:48:02 2015 +0100 + + Merge pull request #3962 from dachary/wip-10150-firefly + + ReplicatedPG::scan_range: an object can disappear between the list and t... 
+ + Reviewed-by: Samuel Just + +commit 31d99d2de714eb4a8eeb4a431cbc0d40ca749f15 +Merge: 45f0870 5865411 +Author: Loic Dachary +Date: Mon Mar 16 16:45:32 2015 +0100 + + Merge pull request #3948 from dachary/wip-9891-firefly + + DBObjectMap: lock header_lock on sync() + + Reviewed-by: Samuel Just + +commit 45f087032e27c63dc459318717b05fe5f9888664 +Merge: 3050262 6207333 +Author: Loic Dachary +Date: Mon Mar 16 16:43:48 2015 +0100 + + Merge pull request #3949 from dachary/wip-9915-firefly + + osd: cache tiering: fix the atime logic of the eviction + + Reviewed-by: Samuel Just + +commit 3050262d7dcb8ac20b9f86544461bb59f4b87cf5 +Merge: c0abc4d 0f31388 +Author: Loic Dachary +Date: Mon Mar 16 16:40:12 2015 +0100 + + Merge pull request #3944 from dachary/wip-9193-firefly + + messages/MWatchNotify: include an error code in the message + + Reviewed-by: Samuel Just + +commit c0abc4d769afd00773a9f466ffd3feced9cdb17d +Merge: f9acd3a f856739 +Author: Loic Dachary +Date: Mon Mar 16 16:39:41 2015 +0100 + + Merge pull request #3943 from dachary/wip-8011-firefly + + ReplicatedPG: fail a non-blocking flush if the object is being scrubbed + + Reviewed-by: Samuel Just + +commit f9acd3ad7397ac1e745beeeaf21b55ecd95484a1 +Merge: f95d327 ca96b59 +Author: Loic Dachary +Date: Mon Mar 16 16:39:16 2015 +0100 + + Merge pull request #3941 from dachary/wip-10817-firefly + + WorkQueue: make wait timeout on empty queue configurable + + Reviewed-by: Samuel Just + +commit f95d327cbc0750cfb77114c66082ddd5fc458b94 +Merge: 32de8ab 3782b8b +Author: Loic Dachary +Date: Mon Mar 16 16:38:42 2015 +0100 + + Merge pull request #3937 from dachary/wip-10762-firefly + + mon: ignore osd failures from before up_from + + Reviewed-by: Samuel Just + +commit 32de8ab146f242f7b73aca211f059bc39d38f85c +Merge: 24a8c10 6fd3dfa +Author: Loic Dachary +Date: Mon Mar 16 16:38:22 2015 +0100 + + Merge pull request #3933 from dachary/wip-10617-firefly + + osd: do not ignore deleted pgs on startup + + Reviewed-by: Samuel Just + +commit 24a8c10a5a215c60b525f4170e45565390e02231 +Merge: f4e76c3 368a5a8 +Author: Loic Dachary +Date: Mon Mar 16 16:38:01 2015 +0100 + + Merge pull request #3932 from dachary/wip-10546-firefly + + mon: Monitor: fix timecheck rounds period + + Reviewed-by: Samuel Just + +commit f4e76c3db90bf383c02cdb8cca19a37cd187095b +Merge: fa518ff 66b13f2 +Author: Loic Dachary +Date: Mon Mar 16 16:37:36 2015 +0100 + + Merge pull request #3931 from dachary/wip-10512-firefly + + osd: requeue blocked op before flush it was blocked on + + Reviewed-by: Samuel Just + +commit fa518ffc5961785f01f0f91980c38a7e02686901 +Merge: ac9980e a22aa8f +Author: Loic Dachary +Date: Mon Mar 16 16:37:05 2015 +0100 + + Merge pull request #3930 from dachary/wip-10497-firefly + + librados: Translate operation flags from C APIs + + Reviewed-by: Samuel Just + Reviewed-by: Josh Durgin + +commit ac9980e074adf587b8b16af0bd08b3fc3233804d +Merge: ccebb5f d3de8a5 +Author: Loic Dachary +Date: Mon Mar 16 16:31:40 2015 +0100 + + Merge pull request #3955 from dachary/wip-10059-firefly + + PG: always clear_primary_state + + Reviewed-by: Samuel Just + +commit ccebb5f90bea0fd3bac1bd7ab93e5e07700f1301 +Merge: a30379d f4bab86 +Author: Loic Dachary +Date: Mon Mar 16 16:31:16 2015 +0100 + + Merge pull request #3954 from dachary/wip-10014-firefly + + ObjectStore: Don't use largest_data_off to calc data_align. 
+ + Reviewed-by: Samuel Just + +commit a30379d9af89cffa4b3083ae14a94e3758c2b01c +Merge: e575ca8 eb03e79 +Author: Loic Dachary +Date: Mon Mar 16 16:25:59 2015 +0100 + + Merge pull request #3947 from dachary/wip-9555-firefly + + osd: check that source OSD is valid for MOSDRepScrub + + Reviewed-by: Samuel Just + +commit e575ca84419f8be5c7fae7e2ffac549956a74f82 +Merge: cd675bb c60da2f +Author: Loic Dachary +Date: Mon Mar 16 16:25:30 2015 +0100 + + Merge pull request #3964 from dachary/wip-10157-firefly + + PGLog: include rollback_info_trimmed_to in (read|write)_log + + Reviewed-by: Samuel Just + +commit cd675bba5fdfb85689880ca7ecbd284181984a63 +Merge: 2b8e476 1a0f770 +Author: Loic Dachary +Date: Mon Mar 16 16:24:55 2015 +0100 + + Merge pull request #3960 from dachary/wip-6003-firefly + + FileJournal: fix journalq population in do_read_entry() + + Reviewed-by: Samuel Just + +commit 2b8e4762edb7c35950f7e0d69bba2b5e1d83133a +Merge: d434ead a746f7e +Author: Loic Dachary +Date: Mon Mar 16 16:24:03 2015 +0100 + + Merge pull request #3950 from dachary/wip-9985-firefly + + Get the current atime of the object in cache pool for eviction + + Reviewed-by: Samuel Just + Reviewed-by: Xinze Chi + +commit d434eadadd1df6e4df18a6914015736bc09722d1 +Merge: e79e5f8 5f1245e +Author: Loic Dachary +Date: Mon Mar 16 16:20:12 2015 +0100 + + Merge pull request #3935 from dachary/wip-10723-firefly + + Fix memory leak in python rados bindings + + Reviewed-by: Samuel Just + Reviewed-by: Josh Durgin + +commit e79e5f8fbbfba6984cf21784e20d1c46cb60a397 +Merge: a9a36de 91b2aca +Author: Loic Dachary +Date: Mon Mar 16 15:02:37 2015 +0100 + + Merge pull request #3866 from ceph/wip-cot-firefly + + Backport ceph-objectstore-tool changes to firefly + + Reviewed-by: Loic Dachary + +commit a9a36deb89ac5719787e905469fe0b1bde5d58ca +Merge: 83c571e 7e85722 +Author: Loic Dachary +Date: Mon Mar 16 15:00:56 2015 +0100 + + Merge pull request #3996 from dzafman/wip-10676 + + Fix ceph command manpage to match ceph -h + + Reviewed-by: Xinxin Shu + +commit 83c571e3067b4cad3e4567522c797d09a82d87db +Merge: baa74b8 d5c3a14 +Author: Loic Dachary +Date: Mon Mar 16 14:55:37 2015 +0100 + + Merge pull request #3927 from dachary/wip-10351-firefly + + mount.ceph: avoid spurious error message + + Reviewed-by: Yan, Zheng + +commit 7e85722fd4c89715fc2ed79697c82d65d7ebf287 +Author: David Zafman +Date: Fri Mar 13 16:50:13 2015 -0700 + + doc: Minor fixes to ceph command manpage + + Fixes: #10676 + + Signed-off-by: David Zafman + +commit 9ac488c1eb0e30511079ba05aaf11c79615b3940 +Author: David Zafman +Date: Thu Mar 12 11:39:52 2015 -0700 + + doc: Fix ceph command manpage to match ceph -h (firefly) + + Improve synopsis section + Fixes: #10676 + + Signed-off-by: David Zafman + +commit 5f1245e131e33a98572408c8223deed2c7cf7b75 +Author: Josh Durgin +Date: Mon Feb 9 20:50:23 2015 -0800 + + rados.py: keep reference to python callbacks + + If we don't keep a reference to these, the librados aio calls will + segfault since the python-level callbacks will have been garbage + collected. Passing them to aio_create_completion() does not take a + reference to them. Keep a reference in the python Completion object + associated with the request, since they need the same lifetime. + + This fixes a regression from 60b019f69aa0e39d276c669698c92fc890599f50.
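The lifetime rule the rados.py fix enforces can be sketched like this (a simplified stand-in, with a hypothetical Completion class and callback signature, not the actual bindings code):

    from ctypes import CFUNCTYPE, c_void_p

    RADOS_CB = CFUNCTYPE(None, c_void_p, c_void_p)

    class Completion(object):
        def __init__(self, oncomplete):
            def _cb(completion_ptr, arg):
                oncomplete(self)
            # pinning the wrapper on self gives it the same lifetime as
            # the request; a bare temporary would be garbage collected
            # while librados still holds the raw function pointer
            self.complete_cb = RADOS_CB(_cb)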
+ + Fixes: #10775 + Backport: dumpling, firefly, giant + Signed-off-by: Josh Durgin + (cherry picked from commit 36d37aadbbbece28d70e827511f1a473d851463d) + +commit cf366fc3b21ff6f98530dbadb75a430c25672d56 +Author: Nilamdyuti Goswami +Date: Thu Dec 18 17:11:22 2014 +0530 + + doc: Changes format style in ceph to improve readability as html. + + Signed-off-by: Nilamdyuti Goswami + (cherry picked from commit 8b796173063ac9af8c21364521fc5ee23d901196) + +commit 07fc9f66a69aa31d2cf8bf7a277d3e14ad6209be +Author: Yan, Zheng +Date: Thu Mar 12 11:01:46 2015 +0800 + + mds: fix assertion caused by system clock backwards + + Fixes: #11053 + Signed-off-by: Yan, Zheng + +commit 8a25a51e42cdaed2c66dc25a6c6d0245441123a3 +Author: Yehuda Sadeh +Date: Fri Jan 30 07:03:30 2015 -0800 + + rgw: fail s3 POST auth if keystone not configured + + Fixes: #10698 + This fixes an issue introduced in 8b3dfc9472022ea45ad24e02e0aa21dfdad798f8, + where if the user does not exist, we try keystone authentication. However, + if keystone is not configured we just fall through without failing. + This would have failed later on due to bucket permissions, unless the bucket + had public write permissions. + + Backports: Firefly + Reported-by: Valery Tschopp + Signed-off-by: Yehuda Sadeh + + Conflicts: + src/rgw/rgw_rest_s3.cc + +commit eb03e792040bd22c1ae8b7dd73d94fbfd6208eec +Author: Sage Weil +Date: Sat Dec 13 08:06:31 2014 -0800 + + osd: check that source OSD is valid for MOSDRepScrub + + Make sure the message we got from the peer OSD is valid. Specifically, + this avoids a race like this: + + - A marks down B + - B sends MOSDRepScrub + - A accepts connection from B as new + - A replies to scrub + - B crashes because msgr seq 1 < expected seq 1000+ + + See #8880 for the most recent fix for requests. + + Fixes: #9555 + Backport: giant, firefly + Signed-off-by: Sage Weil + (cherry picked from commit 847e5e102522d651aa9687a54aaafcebf3afc596) + + Conflicts: + src/osd/OSD.cc: require functions' first argument is now a reference + +commit ae18707b3caa115dc510aff38b77f8afe8555c61 +Author: Sage Weil +Date: Sat Dec 13 07:56:39 2014 -0800 + + osd: pass Message* to most require_* helpers + + These do nothing but op->get_req(); pass the Message* explicitly so that + non-OpRequest callers can use them.
+ + Signed-off-by: Sage Weil + (cherry picked from commit 707a111d53efb09b3471dd3788b86d2bfee4e96f) + + Conflicts: + src/osd/OSD.cc + src/osd/OSD.h + significant changes had to be made, but in a simple way that + stays within the scope of the original commit + +commit c60da2f3c34e7325c748d2d6e55140a0a30013fd +Author: Samuel Just +Date: Thu Nov 20 15:15:08 2014 -0800 + + PGLog: include rollback_info_trimmed_to in (read|write)_log + + Fixes: #10157 + Backport: firefly, giant + Signed-off-by: Samuel Just + (cherry picked from commit 1fe8b846641486cc294fe7e1d2450132c38d2dba) + + Conflicts: + src/osd/PGLog.cc + in the context coll_t::META_COLL was replaced with META_COLL + +commit 1a0f770eea18af6b276a31157f201a93166eb038 +Author: Samuel Just +Date: Fri Feb 6 09:52:29 2015 -0800 + + FileJournal: fix journalq population in do_read_entry() + + Fixes: 6003 + Backport: dumpling, firefly, giant + Signed-off-by: Samuel Just + (cherry picked from commit bae1f3eaa09c4747b8bfc6fb5dc673aa6989b695) + + Conflicts: + src/os/FileJournal.cc + because reinterpret_cast was added near two hunks after firefly + +commit a746f7e5985198024067cb6e123569c09169b356 +Author: Sage Weil +Date: Fri Oct 31 19:33:59 2014 -0700 + + osd/ReplicatedPG: fix compile error + + From 1fef4c3d541cba360738437420ebfa2447d5802e. + + Signed-off-by: Sage Weil + (cherry picked from commit 4a9ad7dc2da6f4fa6a64235776a3f1d2799aef60) + +commit 5404fbfdd9b18cdb3fe5bed67146c769ec3acfa0 +Author: Federico Simoncelli +Date: Sat Nov 15 14:14:04 2014 +0000 + + common: do not unlock rwlock on destruction + + According to pthread_rwlock_unlock(3p): + + Results are undefined if the read-write lock rwlock is not held + by the calling thread. + + and: + + https://sourceware.org/bugzilla/show_bug.cgi?id=17561 + + Calling pthread_rwlock_unlock on an rwlock which is not locked + is undefined. + + Calling pthread_rwlock_unlock on RWLock destruction could cause + unknown behavior for two reasons: + + - the lock is acquired by another thread (undefined) + - the lock is not acquired (undefined) + + Moreover, since glibc-2.20, calling pthread_rwlock_unlock on a + rwlock that is not locked results in a SIGILL that kills the + application. + + This patch removes the pthread_rwlock_unlock call on destruction + and replaces it with an assertion to check that the RWLock is + not in use. + + Any code that relied on the implicit release is now going to + break the assertion, e.g.: + + { + RWLock l; + l.get(for_write); + } // implicit release, wrong. + + Signed-off-by: Federico Simoncelli + (cherry picked from commit cf2104d4d991361c53f6e2fea93b69de10cd654b) + +commit a73a4cb3889a6da21c3cfa4ddfa16d1a7059d20c +Author: Yehuda Sadeh +Date: Sat May 3 08:32:19 2014 -0700 + + common/RWLock: track read/write locks via atomics for assertion checks + + Signed-off-by: Yehuda Sadeh + (cherry picked from commit 92615ea95a31d9fd22c3d11c860e0f502dc52c26) + +commit b0d0d44018de6289b64b6334edd4959ca4a0cc48 +Author: Samuel Just +Date: Thu Dec 11 13:05:54 2014 -0800 + + ReplicatedPG::scan_range: an object can disappear between the list and the attr get + + The first item in the range is often last_backfill, upon which writes + can be occurring. It's trimmed off on the primary side anyway.
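The scan_range pattern is the usual list-then-stat race tolerance: treat ENOENT on the attribute fetch as "skip this object", not as an error. A Python sketch (store, getattr and ObjectNotFound are hypothetical stand-ins for the C++ object store API):

    class ObjectNotFound(Exception):
        pass

    def scan_range(store, objects):
        result = {}
        for hoid in objects:
            try:
                result[hoid] = store.getattr(hoid, '_')
            except ObjectNotFound:
                continue   # deleted since the listing: skip, don't fail
        return result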
+ + Fixes: 10150 + Backport: dumpling, firefly, giant + Signed-off-by: Samuel Just + (cherry picked from commit dce6f288ad541fe7f0ef8374301cd712dd3bfa39) + +commit 34d19e1501b242fd8fc7cc95656592b5982f29a6 +Author: Yehuda Sadeh +Date: Fri Jan 30 18:42:40 2015 -0800 + + rgw: flush xml header on get acl request + + Fixes: #10106 + Backport: firefly, giant + + dump_start() updates the formatter with the appropriate prefix, however, + we never flushed the formatter. + + Signed-off-by: Yehuda Sadeh + (cherry picked from commit eb45f861343162e018968b8c56693a8c6f5b2cab) + +commit 9074eb7c46589aa1e5d10a2b9a8534f22dff2154 +Author: Abhishek Lekshmanan +Date: Mon Nov 17 17:37:00 2014 +0530 + + rgw: check for timestamp for s3 keystone auth + + This commit ensures that we check that the timestamp of an s3 request is + within the acceptable grace time of radosgw + Addresses some failures in #10062 + Fixes: #10062 + Signed-off-by: Abhishek Lekshmanan + + (cherry picked from commit 4b35ae067fef9f97b886afe112d662c61c564365) + +commit d3de8a5722a68d69023cf60c9076c2fb19058bf9 +Author: Samuel Just +Date: Wed Nov 19 08:20:16 2014 -0800 + + PG: always clear_primary_state on new interval, but only clear pg temp if not primary + + Signed-off-by: Samuel Just + (cherry picked from commit f692bfe076b8ddb679c6d1a6ea78cc47f0876326) + +commit 8b07236cd799b4752cbe620b737343a02a703a17 +Author: Samuel Just +Date: Fri Nov 14 15:44:20 2014 -0800 + + PG: always clear_primary_state when leaving Primary + + Otherwise, entries from the log collection process might leak into the next + epoch, where we might end up choosing a different authoritative log. In this + case, it resulted in us not rolling back to log entries on one of the replicas + prior to trying to recover from an affected object due to the peer_missing not + being cleared. + + Fixes: #10059 + Backport: giant, firefly, dumpling + Signed-off-by: Samuel Just + (cherry picked from commit c87bde64dfccb5d6ee2877cc74c66fc064b1bcd7) + +commit f4bab86fe3b218d66c14d06883c297836d9ca19d +Author: Jianpeng Ma +Date: Mon Oct 27 11:22:13 2014 +0800 + + ObjectStore: Don't use largest_data_off to calc data_align. + + If largest_data_off % CEPH_PAGE_SIZE != 0, get_data_alignment returns an + erroneous value, which makes FileJournal::align_bl memcpy much more data. + + Tested-by: Sage Weil + Signed-off-by: Jianpeng Ma + (cherry picked from commit a1aa70f2f21339feabfe9c1b3c9c9f97fbd53c9d) + +commit 2c7eda690ae2f71c8e8e189b8f7330196c1f385f +Author: Ding Dinghua +Date: Thu Oct 30 14:58:42 2014 +0800 + + osdc/Objecter: Fix a bug of dead looping in Objecter::handle_osd_map + + If the current map epoch is less than the oldest epoch, the current map epoch + should step up to the oldest epoch.
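The Objecter fix is essentially a one-line clamp; as a sketch (a hypothetical helper, not the C++ code):

    def next_map_epoch(e, oldest):
        # epochs older than 'oldest' were trimmed cluster-wide; requesting
        # e+1, e+2, ... one by one can never succeed, so step up directly
        return max(e, oldest)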
+ + Fixes: #9986 + Signed-off-by: Ding Dinghua + (cherry picked from commit e0166a23c2cf655bfb4cf873be021a14d9b9be27) + + Conflicts: + src/osdc/Objecter.cc + the debug line that follows was re-indented + +commit 11f8017cbdf94d4a8083412b96c251ee39286541 +Author: Ding Dinghua +Date: Thu Oct 30 14:58:05 2014 +0800 + + osdc/Objecter: e shouldn't be zero in Objecter::handle_osd_map + + Signed-off-by: Ding Dinghua + (cherry picked from commit 31c584c8ba022cd44fe2872d221f3026618cefab) + + Conflicts: + src/osdc/Objecter.cc + the debug line that follows was re-indented + +commit 7831582026441abbd6066dd951db4b63ffb45402 +Author: Xinze Chi +Date: Wed Oct 29 07:11:11 2014 +0000 + + Get the current atime of the object in cache pool for eviction + + Because if there are multiple atimes in agent_state for the same object, we should use the most recent one. + + Signed-off-by: Xinze Chi + (cherry picked from commit 1fef4c3d541cba360738437420ebfa2447d5802e) + +commit 620733386bd0694960cecac8f32bd1538382d5bb +Author: Zhiqiang Wang +Date: Tue Oct 28 09:37:11 2014 +0800 + + osd: cache tiering: fix the atime logic of the eviction + + Reported-by: Xinze Chi + Signed-off-by: Zhiqiang Wang + (cherry picked from commit 622c5ac41707069ef8db92cb67c9185acf125d40) + +commit 5865411360f722ec511f2df6656d4ba975bef8eb +Author: Samuel Just +Date: Fri Feb 20 13:43:46 2015 -0800 + + DBObjectMap: lock header_lock on sync() + + Otherwise, we can race with another thread updating state.seq + resulting in the old, smaller value getting persisted. If there + is a crash at that time, we will reuse a sequence number, resulting + in an inconsistent node tree and bug #9891. + + Fixes: 9891 + Backport: giant, firefly, dumpling + Signed-off-by: Samuel Just + (cherry picked from commit 2b63dd25fc1c73fa42e52e9ea4ab5a45dd9422a0) + + Conflicts: + src/os/DBObjectMap.cc + because we have state.v = 1; instead of state.v = 2; + +commit baa74b88a4f4b6073b99b7a9774692b37405b59e +Merge: be6559a 420ea03 +Author: Loic Dachary +Date: Wed Mar 11 09:09:23 2015 +0100 + + Merge pull request #3828 from dachary/wip-10425-firefly + + librados: fix resource leakage in RadosClient::connect() + + Reviewed-by: Radoslaw Zarzynski + +commit be6559a423badde3b573b0c9055056999baae104 +Merge: f91d355 6e58732 +Author: Loic Dachary +Date: Wed Mar 11 09:06:27 2015 +0100 + + Merge pull request #3826 from dachary/wip-10257-firefly + + mon: PGMonitor: several stats output error fixes + + Reviewed-by: Joao Eduardo Luis + +commit f91d355306620cc543113ed21fddf84f4c170d6e +Merge: f5525a1 1e58bb4 +Author: Loic Dachary +Date: Wed Mar 11 09:05:13 2015 +0100 + + Merge pull request #3824 from dachary/wip-10353-firefly + + crush: set_choose_tries = 100 for erasure code rulesets + + Reviewed-by: Loic Dachary + +commit f5525a1f41c9154b48f3ad0ccd899b5203c897bd +Merge: b7e3614 27dbbb3 +Author: Loic Dachary +Date: Wed Mar 11 09:04:33 2015 +0100 + + Merge pull request #3823 from dachary/wip-10787-firefly + + mon/OSDMonitor: do not trust small values in osd epoch cache + + Reviewed-by: Joao Eduardo Luis + +commit b7e3614ff38379aeea66ba00c64bc88ffa080963 +Merge: b5a67f0 7ed92f7 +Author: Loic Dachary +Date: Wed Mar 11 09:02:16 2015 +0100 + + Merge pull request #3915 from dachary/wip-10080-firefly + + SimpleMessenger: allow RESETSESSION whenever we forget an endpoint + + Reviewed-by: Greg Farnum + +commit 8ef14fcca715a33be8c611a37628c90d7fafca43 +Author: Joao Eduardo Luis +Date: Wed Feb 11 23:36:01 2015 +0000 + + mon: MonCap: take EntityName instead when expanding profiles + + entity_name_t is
tightly coupled to the messenger, while EntityName is + tied to auth. When expanding profiles we want to tie the profile + expansion to the entity that was authenticated. Otherwise we may run + into weird behavior, such as caps validation failing because a given + client messenger inst does not match the auth entity it used. + + e.g., running + + ceph --name osd.0 config-key exists foo daemon-private/osd.X/foo + + has entity_name_t 'client.12345' and EntityName 'osd.0'. Using + entity_name_t during profile expansion would not allow the client access + to daemon-private/osd.X/foo (client.12345 != osd.X). + + Fixes: #10844 + Backport: firefly,giant + + Signed-off-by: Joao Eduardo Luis + (cherry picked from commit 87544f68b88fb3dd17c519de3119a9ad9ab21dfb) + +commit 370f37f452aff3b48f9ae8a33b7ef26b572b41c8 +Author: Joao Eduardo Luis +Date: Fri Nov 14 21:03:54 2014 +0000 + + mon: Monitor: stash auth entity name in session + + Backport: giant + + Signed-off-by: Joao Eduardo Luis + (cherry picked from commit ca8e1efc0be9bffcfbdce5593526d257aa498062) + +commit 0f31388eb2bb68c09ab270f871b421a9c368af97 +Author: Sage Weil +Date: Sun Aug 10 17:51:08 2014 -0700 + + messages/MWatchNotify: include an error code in the message + + Document the fields, while we are here. + + Signed-off-by: Sage Weil + (cherry picked from commit 7c7bf5fee7be397ef141b947f532a2a0b3567b42) + +commit f856739824bc271405a6fa35bdefc2bdc42c2f02 +Author: Samuel Just +Date: Thu Nov 20 14:27:39 2014 -0800 + + ReplicatedPG: fail a non-blocking flush if the object is being scrubbed + + Fixes: #8011 + Backport: firefly, giant + Signed-off-by: Samuel Just + (cherry picked from commit 9b26de3f3653d38dcdfc5b97874089f19d2a59d7) + +commit ca96b59db529ffbba0c834795800b6e90a7e4fce +Author: Samuel Just +Date: Mon Feb 9 17:11:38 2015 -0800 + + WorkQueue: make wait timeout on empty queue configurable + + Fixes: 10817 + Backport: giant, firefly, dumpling + Signed-off-by: Samuel Just + (cherry picked from commit 5aa6f910843e98a05bfcabe6f29d612cf335edbf) + +commit 8ec8f1175a129624dffb511782664d74966d3c42 +Author: Samuel Just +Date: Mon Feb 9 10:37:15 2015 -0800 + + ReplicatedPG::on_change: clean up callbacks_for_degraded_object + + Backport: dumpling, firefly, giant + Fixes: 8753 + Signed-off-by: Samuel Just + (cherry picked from commit a5ecaa189d47a69466a6cd55fa4180e5c3092dc2) + +commit 963439f1038000c3f28c728350a2e9d351341e0b +Author: Yehuda Sadeh +Date: Thu Feb 5 09:33:26 2015 -0800 + + rgw: send appropriate op to cancel bucket index pending operation + + Fixes: #10770 + Backport: firefly, giant + + Reported-by: baijiaruo + Signed-off-by: Yehuda Sadeh + (cherry picked from commit dfee96e3aebcaeef18c721ab73f0460eba69f1c7) + + Conflicts: + src/rgw/rgw_rados.cc + +commit 3782b8b3652c0eb3fb8f65193ecfe09e92925170 +Author: Sage Weil +Date: Thu Feb 5 03:07:50 2015 -0800 + + mon: ignore osd failures from before up_from + + If the failure was generated for an instance of the OSD prior to when + it came up, ignore it. + + This probably causes a fair bit of unnecessary flapping in the wild... + + Backport: giant, firefly + Fixes: #10762 + Reported-by: Dan van der Ster + Signed-off-by: Sage Weil + (cherry picked from commit 400ac237d35d0d1d53f240fea87e8483c0e2a7f5) + +commit ca42905a6fca8b2b404500a6f74951ae20c8a488 +Author: Nilamdyuti Goswami +Date: Wed Nov 26 22:21:32 2014 +0530 + + doc: Adds the updated man page for ceph-disk utility.
+ + Signed-off-by: Nilamdyuti Goswami + (cherry picked from commit 016080d2c39919d73956e5e138ba5f079327aa44) + +commit 6602ab4581f27e066484b4c419d8206fcc20e045 +Author: Nilamdyuti Goswami +Date: Wed Nov 26 22:19:01 2014 +0530 + + doc: Updates the man page for ceph-disk utility with some changes. + + Signed-off-by: Nilamdyuti Goswami + (cherry picked from commit 8a48847cd46d4807ca1dbd13d3a561a693cdc877) + +commit 459807021dd3bf86ce31d30a3b576e783da67e3f +Author: Nilamdyuti Goswami +Date: Tue Nov 25 21:23:08 2014 +0530 + + doc: Adds man page for ceph-disk utility. + + Signed-off-by: Nilamdyuti Goswami + (cherry picked from commit 9a118d56d4a5b0a0456e9f092f5ae9293f7bf3f3) + +commit 430d821b7592148ad9b2656bb3031a0484763f33 +Author: Nilamdyuti Goswami +Date: Tue Nov 25 21:16:16 2014 +0530 + + doc: Removes ceph-deploy usage instances from ceph-disk man page. + + Signed-off-by: Nilamdyuti Goswami + (cherry picked from commit 242dd1c0bbb728475a94f47740790b8a196d9804) + +commit 21c3256c851065521e34a179dc05d48fcc0a6e0f +Author: Nilamdyuti Goswami +Date: Tue Nov 25 02:06:39 2014 +0530 + + doc: Updates man page for ceph-disk utility. + + Signed-off-by: Nilamdyuti Goswami + (cherry picked from commit 7dcc85042b0c0a26e495f7574ce144d1083d15f8) + +commit 1a6490e38d7f5fd3ff640a810c3b911699cd4884 +Author: Nilamdyuti Goswami +Date: Mon Nov 24 22:05:11 2014 +0530 + + doc: Adds man page for ceph-disk utility. + + Signed-off-by: Nilamdyuti Goswami + (cherry picked from commit a450cab2b8148cb8a9b043d629feccf89e5aabac) + +commit 3bab47054dc77b9a00d3f47fa73f458ede7d4ab4 +Author: Billy Olsen +Date: Mon Feb 2 16:24:59 2015 -0700 + + Fix memory leak in python rados bindings + + A circular reference was inadvertently created when using the + CFUNCTYPE binding for callbacks for the asynchronous i/o callbacks. + This commit refactors the usage of the callbacks such that the + Ioctx object does not have a class reference to the callbacks. + + Fixes: #10723 + Backport: giant, firefly, dumpling + Signed-off-by: Billy Olsen + Reviewed-by: Dan Mick + Reviewed-by: Josh Durgin + (cherry picked from commit 60b019f69aa0e39d276c669698c92fc890599f50) + +commit d0fd417e872a73033903fb36144fe7a39e90fc9a +Author: Dmytro Iurchenko +Date: Tue Feb 3 17:54:38 2015 +0200 + + rgw: Swift API. Support for X-Remove-Container-Meta-{key} header. + + Fixes: #10475 + Backport: hammer, firefly + Reported-by: Josh Durgin + Signed-off-by: Dmytro Iurchenko + (cherry picked from commit f67bfa24fd6f69c2fcc0987eba8b6b426dd78320) + + Conflicts: + src/rgw/rgw_rest.h + trivial merge: prototype of an unrelated function changed + s/is_object_op/!(s->object == NULL)/ + +commit 651dc556047aa4ee9e95fe9fc7bcd11488973872 +Author: Dmytro Iurchenko +Date: Mon Feb 2 11:27:00 2015 +0200 + + rgw: Swift API. Dump container's custom metadata. + + Fixes: #10665 + Backport: hammer, firefly + Reported-by: Ahmad Faheem + Signed-off-by: Dmytro Iurchenko + (cherry picked from commit 2f8d31e9b1c4b09506bd1b0dad635c6e820783b2) + + Conflicts: + src/rgw/rgw_rest_swift.cc + The first hunk conflicts because X-Storage-Policy was added + after firefly. The second hunk conflicts because the type of the + s->object data member changed after firefly but it is irrelevant + because the patch does not need to access s->object anymore. + +commit 6fd3dfa7224f0af3101fd1614506e8cb2fc7f6a0 +Author: Sage Weil +Date: Fri Jan 23 10:47:44 2015 -0800 + + osd: do not ignore deleted pgs on startup + + These need to get instantiated so that we can complete the removal process. 
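In sketch form, the deleted-pgs change means instantiating rather than skipping (all helper names here are hypothetical; the point is only that removal can resume after a restart):

    def load_pgs(store):
        for pgid in store.list_pg_dirs():
            pg = instantiate_pg(store, pgid)   # even if marked deleted
            if pg.is_deleted():
                # the pg must exist in memory for its removal to be
                # resumed and completed after the restart
                pg.queue_for_removal()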
+ + Fixes: #10617 + Signed-off-by: Sage Weil + (cherry picked from commit 879fd0c192f5d3c6afd36c2df359806ea95827b8) + +commit 368a5a8a8739e98ffdb8ac1210d111092e31be9e +Author: Joao Eduardo Luis +Date: Fri Jan 30 11:37:28 2015 +0000 + + mon: Monitor: fix timecheck rounds period + + Fixes: #10546 + Backports: dumpling?,firefly,giant + + Signed-off-by: Joao Eduardo Luis + (cherry picked from commit 2e749599ac6e1060cf553b521761a93fafbf65bb) + +commit 66b13f2beff702c3b0bcb0aaa8da4e12d594eddf +Author: Sage Weil +Date: Sun Jan 11 17:28:04 2015 -0800 + + osd: requeue blocked op before flush it was blocked on + + If we have request A (say, cache-flush) that blocks things, and then + request B that gets blocked on it, and we have an interval change, then we + need to requeue B first, then A, so that the resulting queue will keep + A before B and preserve the order. + + This was observed on this firefly run: + + ubuntu@teuthology:/a/sage-2015-01-09_21:43:43-rados-firefly-distro-basic-multi/694675 + + Backport: giant, firefly + Fixes: #10512 + Signed-off-by: Sage Weil + (cherry picked from commit 11bdfb4131ecac16d4a364d651c6cf5d1d28c702) + +commit a22aa8f3228e0baf2ba08d9e79ee4924cd341a07 +Author: Matt Richards +Date: Thu Jan 8 13:16:17 2015 -0800 + + librados: Translate operation flags from C APIs + + The operation flags in the public C API are a distinct enum + and need to be translated to Ceph OSD flags, like as happens in + the C++ API. It seems like the C enum and the C++ enum consciously + use the same values, so I reused the C++ translation function. + + Signed-off-by: Matthew Richards + (cherry picked from commit 49d114f1fff90e5c0f206725a5eb82c0ba329376) + + Conflicts: + src/librados/librados.cc + comes from lttng tracepoints introduced after firefly + +commit d5c3a14390f6bb2af45a1a4ad842777302dd0553 +Author: Yan, Zheng +Date: Sat Jan 3 15:29:29 2015 +0800 + + mount.ceph: avoid spurious error message + + /etc/mtab in most modern distributions is a symbol link to + /proc/self/mounts. + + Fixes: #10351 + Signed-off-by: Yan, Zheng + (cherry picked from commit bdd0e3c4bda97fe18487a58dd173a7dff752e1a2) + +commit 7ed92f7dc5f0f327b77b6f3835e3f821fc810708 +Author: Greg Farnum +Date: Tue Dec 2 15:17:57 2014 -0800 + + SimpleMessenger: allow RESETSESSION whenever we forget an endpoint + + In the past (e229f8451d37913225c49481b2ce2896ca6788a2) we decided to disable + reset of lossless Pipes, because lossless peers resetting caused trouble and + they can't forget about each other. But they actually can: if mark_down() + is called. + + I can't figure out how else we could forget about a remote endpoint, so I think + it's okay if we tell them we reset in order to clean up state. That's desirable + so that we don't get into strange situations with out-of-whack counters. 
+ + Fixes: #10080 + Backport: giant, firefly, dumpling + + Signed-off-by: Greg Farnum + (cherry picked from commit 8cd1fdd7a778eb84cb4d7161f73bc621cc394261) + +commit 91b2acaadee1b62c1fcac73147908ec4477840f3 +Author: David Zafman +Date: Thu Oct 9 11:20:13 2014 -0700 + + osd: Get pgid ancestor from last_map when building past intervals + + Fixed OSD::build_past_intervals_parallel() and PG::generate_past_intervals() + + Fixes: #10430 + + Signed-off-by: David Zafman + (cherry picked from commit 0c5b66da7a9ba516340d06d9e806beb9d1040d0e) + + Conflicts: + src/osd/OSD.cc + +commit c09b6d9d64fdcdc3842c4f89acf10080125a4adc +Author: David Zafman +Date: Tue Dec 23 12:04:26 2014 -0800 + + osd: Pass oldest_map when generating past intervals + + From load_pgs() the superblock hasn't been published yet + so we need to retrieve the value of oldest_map depending on the context. + + Fixes: #10427 + + Signed-off-by: David Zafman + (cherry picked from commit 7fb721c1ceb39b38ca2e653299bcf51e109411d7) + + Conflicts: + src/osd/OSD.cc + +commit 9235d781bf860c1ecef5af600431f1619b56fbc1 +Author: David Zafman +Date: Wed Dec 17 16:59:09 2014 -0800 + + osd: build_push_op() handle short reads so recipient doesn't crash + + Fixes: #8121 + + Signed-off-by: David Zafman + (cherry picked from commit b9a04846d18e1c6621d7f2794ec1fae02875bed2) + +commit 7ce0cb8e33a281d5f675273d7bcbc570a32e5497 +Author: Kefu Chai +Date: Thu Feb 5 16:33:08 2015 +0800 + + ceph_objectstore_tool: fix check_output on python2.6 + + * backported the subprocess.check_output from python2.7 + + Fixes: #10756 + + Signed-off-by: Kefu Chai + (cherry picked from commit 15350a088d84bc6fc664f0d3f5d09b35f58b2144) + + Conflicts: + src/test/ceph_objectstore_tool.py + +commit d5d34ddd1eac688d9422ca02c859d61be8e3e56e +Author: David Zafman +Date: Tue Jan 6 15:49:50 2015 -0800 + + ceph_objectstore_tool: For import get object_info_t available for each object + + Add object_info_t to object_begin so we have at object create time + This will be useful for importing from multiple erasure coded exports + + Signed-off-by: David Zafman + (cherry picked from commit 05d916ed12f361da48ef73953bcc0cef465fcc2a) + +commit f70590b211ba28f350921b2f0c92712ed779858e +Author: David Zafman +Date: Fri Dec 12 15:16:03 2014 -0800 + + ceph_objectstore_tool: Handle import of pg which OSD has split + + Fail import if import data doesn't include OSDMap and can't find it locally + See if local map can be read for import's epoch + Jump to current epoch like a split would if local map not present + + Fixes: #9781 + + Signed-off-by: David Zafman + (cherry picked from commit afda6e4f3b98cc1773fd014583dfb5e1f214a939) + + Conflicts: + src/tools/ceph_objectstore_tool.cc + +commit aedd324f8fbb031d5126ae158f03066c7342f4b0 +Author: David Zafman +Date: Fri Dec 12 15:38:33 2014 -0800 + + ceph_objectstore_tool: On import following a split skip objects no longer in pg + + Signed-off-by: David Zafman + (cherry picked from commit 68b27e25a3729566c3a22c0b71f70f7f3aca29a3) + + Conflicts: + src/tools/ceph_objectstore_tool.cc + +commit c3de607c2ce16a26af7a78a4e557f321ffbcb44d +Author: David Zafman +Date: Fri Dec 5 15:12:21 2014 -0800 + + ceph_objectstore_tool: Verify that object pool and pgid pool match + + Also, earlier check for invalid --pgid with import op + + Signed-off-by: David Zafman + (cherry picked from commit da3be80497a1b1878ee5d2015f8840d202a83aa2) + +commit 5ec38e54f40431a5303cafe202c5097cc400fdb6 +Author: David Zafman +Date: Wed Nov 19 11:47:36 2014 -0800 + + ceph_objectstore_tool: Check for splits 
and fail import if there were splits + + Add osdmap into metadata_section + On export put metadata_section before file data + + Fixes: #9780 + + Signed-off-by: David Zafman + (cherry picked from commit 19fdeea8b67091ed044ebce25799d3237b4d734a) + +commit aa472fa4df19d826a5af72e286365fa5ce4c71fb +Author: David Zafman +Date: Mon Dec 15 10:03:53 2014 -0800 + + ceph_objectstore_tool: Add special exit code for test infrastructure + + Signed-off-by: David Zafman + (cherry picked from commit b1f12f09c0211b608178f5ca2e292ab1765ce620) + +commit 22b7c2faee8cfad1b40cef019984f4a2d112b268 +Author: David Zafman +Date: Wed Nov 19 11:41:39 2014 -0800 + + ceph_objectstore_tool: Check that pool exists before allowing import + + Signed-off-by: David Zafman + (cherry picked from commit f24f646d870703b7b79563cdbc20920248be6142) + + Conflicts: + src/tools/ceph_objectstore_tool.cc + +commit f65db436f6eb83d7c59fdddced4d35dd9eeeb078 +Author: David Zafman +Date: Wed Oct 15 15:21:11 2014 -0700 + + ceph_objectstore_tool: Check cluster_fsid before allowing an import + + Signed-off-by: David Zafman + (cherry picked from commit 196c8112dc9a6e8780b05d6c579cd7fbd5b07589) + +commit 360f68ec69b36b172d15c6206698340a5c00aafa +Author: David Zafman +Date: Thu Oct 16 12:27:56 2014 -0700 + + ceph_objectstore_tool: Allow the metadata_section to be anywhere in the export + + Signed-off-by: David Zafman + (cherry picked from commit 62dd912f1192b28700a15b02507a8c9efd710cb5) + + Conflicts: + src/tools/ceph_objectstore_tool.cc + +commit c3fcbe636ae1a936b9180628cff939b2b5dddf7c +Author: David Zafman +Date: Fri Dec 12 15:01:24 2014 -0800 + + ceph_objectstore_tool: import-rados shouldn't import internal namespace objects + + Signed-off-by: David Zafman + (cherry picked from commit f727d2eaf50b0351feb9f12dcd65d50fb6eff7e9) + +commit 2d69076943b9724ce1d5c9f03f2f7594a77b92d2 +Author: David Zafman +Date: Fri Dec 12 14:58:54 2014 -0800 + + ceph_objectstore_tool: Get g_ceph_context available to import-rados + + Signed-off-by: David Zafman + (cherry picked from commit ddc4613ec71752e5dccbbfe6dc078b86f0f186a9) + + Conflicts: + src/tools/ceph_objectstore_tool.cc + +commit bbed3a728471292de625d922abeae8b39d290045 +Author: David Zafman +Date: Tue Dec 9 18:09:04 2014 -0800 + + ceph_objectstore_tool: Fix import-rados skipping of snapshots + + Signed-off-by: David Zafman + (cherry picked from commit fe936026ed87c9f95f3b7ad235b24c22e8de5f55) + +commit d962c79dc1f0f2189e25743c6d253fac412c004d +Author: David Zafman +Date: Thu Nov 20 13:00:10 2014 -0800 + + ceph_objectstore_tool: read_fd() doesn't handle ^D from tty stdin, don't allow + + Signed-off-by: David Zafman + (cherry picked from commit 5cb692528e3ac0ebea3f1714b3ac43f69d176888) + +commit 8b7f4cb232c2f1bde6f50c1f092cb622fe2c41f1 +Author: David Zafman +Date: Fri Dec 19 13:47:32 2014 -0800 + + ceph-objectstore-tool: Remove --pretty-format and use new --format options + + Call new_formatter() with --format specified argument + + Signed-off-by: David Zafman + (cherry picked from commit 22b71744bb0cb56434d5f6214ccea7d81f771860) + + Conflicts: + src/tools/ceph_objectstore_tool.cc + +commit ee183b8e56c6f8a88ac781cf1fedb6a7a93f1005 +Author: David Zafman +Date: Wed Oct 15 15:20:03 2014 -0700 + + ceph_objectstore_tool: Describe super_ver values + + Signed-off-by: David Zafman + (cherry picked from commit 0aeba0f216a54390288b5e3d6147deb31877f744) + +commit 5f82f824c7b1bb246bdb54b11a694133a49da70a +Author: Danny Al-Gaaf +Date: Fri Jan 2 18:36:54 2015 +0100 + + ceph_objectstore_tool.cc: reduce scope of variable + 
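Commit 7ce0cb8e above backports subprocess.check_output for python2.6; the conventional recipe, mirroring the python2.7 stdlib, looks like this (not necessarily the exact hunk merged):

    import subprocess

    def check_output(*popenargs, **kwargs):
        # subprocess.check_output only exists from python2.7 on
        process = subprocess.Popen(stdout=subprocess.PIPE,
                                   *popenargs, **kwargs)
        output, _ = process.communicate()
        retcode = process.poll()
        if retcode:
            cmd = kwargs.get("args")
            if cmd is None:
                cmd = popenargs[0]
            raise subprocess.CalledProcessError(retcode, cmd)
        return output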
+ Signed-off-by: Danny Al-Gaaf + (cherry picked from commit 78542f9a901c05e627b53b5306ea604be3bc82e8) + +commit 2f97c51b5ee5de7c1657bc8fee6ccaa474f6f478 +Author: Danny Al-Gaaf +Date: Fri Jan 2 15:48:08 2015 +0100 + + ceph_objectstore_tool.cc: prefer ++operator for non-primitive iterators + + Signed-off-by: Danny Al-Gaaf + (cherry picked from commit 252fc03ba21c7b09922598a8d40997fc639bb994) + +commit 2a22bfedc4fa9f461dc8cfd42c659b9c55ddafca +Author: David Zafman +Date: Tue Nov 25 16:56:19 2014 -0800 + + ceph_objectstore_tool: Prevent tool from transitioning struct_v on rm-past-intervals + + Signed-off-by: David Zafman + (cherry picked from commit f056bdf93980a0a8e6157dbb124a79389a8f1a3c) + +commit 53aa04f95c43795da81a7d9f3117d7e5816aedcb +Author: David Zafman +Date: Thu Dec 4 18:53:08 2014 -0800 + + ceph_objectstore_tool: Accept json object with --pgid instead of array + + It isn't anticipated that anyone would use this, but it keeps backward compatibility + + Signed-off-by: David Zafman + (cherry picked from commit 59b423e2e8846b098326fdec440de46b8e3d2769) + +commit 60e9a8120b292628ee4e5ef33fe933222609b861 +Author: David Zafman +Date: Thu Dec 4 18:27:50 2014 -0800 + + ceph_objectstore_tool: Improve object spec parsing error messages + + Signed-off-by: David Zafman + (cherry picked from commit de6384fda183801c16af1b61ed36eaed289bb4f6) + +commit 4f95409c0dadeed18334c00630ddc6d7c99d2819 +Author: David Zafman +Date: Thu Dec 4 17:48:28 2014 -0800 + + ceph_objectstore_tool: Fix error messages in newer code + + Signed-off-by: David Zafman + (cherry picked from commit eae7c02fded460f6c8aaf18aa83d2730b89e0eb1) + +commit 1703867735c8f8ab1c83aa526c84b278436f38d5 +Author: David Zafman +Date: Thu Dec 4 16:00:40 2014 -0800 + + ceph_objectstore_tool: Remove extraneous endl on error throw messages + + Signed-off-by: David Zafman + (cherry picked from commit 32c832f0c62259a492d1c934c56ac165496763a0) + +commit f2d2110a8cb1a1b1216c1083b54ea6212138dc93 +Author: David Zafman +Date: Thu Dec 4 14:01:39 2014 -0800 + + ceph_objectstore_tool: Add --format and --pretty-format support + + --pretty-format defaults true + Add --format so xml output can be requested + --op list defaults to single line of json per object + To override this more human readable output use --pretty-format=false + Add testing of --op list special handling + + Signed-off-by: David Zafman + (cherry picked from commit cca85a534fe488ae314400e8faad55a758609467) + +commit a5544c12c3e982edf66f55a8edfb7fc69300520b +Author: David Zafman +Date: Wed Dec 3 17:53:11 2014 -0800 + + ceph_objectstore_tool: Strip _* (always _head) from pgid in list entry output + + Signed-off-by: David Zafman + (cherry picked from commit 5eacd3c5f39766cb8be6b3251d139d16431cf6b6) + +commit 4a0a42f3e6b9a1b7ed4df3d4b6c417acfc00050a +Author: David Zafman +Date: Wed Dec 3 16:39:04 2014 -0800 + + ceph_objectstore_tool: BUG: --op list wasn't including snapshots + + Signed-off-by: David Zafman + (cherry picked from commit b617ee2d45886ec6b3dc0db0edbf814ea5748083) + +commit 06579b9f269dc9864e920368c5bea9bcb9dd8be9 +Author: David Zafman +Date: Wed Dec 3 16:38:22 2014 -0800 + + ceph_objectstore_tool: For terminal output of list one object per line + + Instead of a parsable array make it easier to cut and paste listed objects + + Signed-off-by: David Zafman + (cherry picked from commit 5a66db9418aeed31fec98999c5053dab357d9c1e) + +commit 22c6bf410de1b0c81f131aa4d5682f5162dd1a56 +Author: David Zafman +Date: Wed Dec 3 16:35:09 2014 -0800 + + ceph_objectstore_tool: In error case umount
objectstore
+
+    Signed-off-by: David Zafman
+    (cherry picked from commit bc6ea9cb8a2b86c73f7f15bc46145177ccf91f4b)
+
+commit d802ab69a4a0f37ed9328ec385746f59643e1420
+Author: Loic Dachary
+Date: Thu Nov 27 01:24:03 2014 +0100
+
+    objectstore_tool: test --op list variants
+
+    Signed-off-by: Loic Dachary
+    (cherry picked from commit ddba2676c4c48c2a556f5d4ffd817bfe134a9153)
+
+commit f30e053fe7b3e5efc679b20cf1b3e2f7f8ed7e54
+Author: Loic Dachary
+Date: Thu Nov 27 00:11:45 2014 +0100
+
+    objectstore_tool: parse new object description format
+
+    The object format changed from
+
+        {json object}
+
+    to
+
+        [pgid,{json object}]
+
+    The parser is updated accordingly. If --pgid is present, check that
+    it equals the pgid from the object description.
+
+    Signed-off-by: Loic Dachary
+    (cherry picked from commit df9d5c5cfd8b0ff793647a592c7661965cef5c92)
+
+commit fce87c9d7dbb51989826d068d6b6657a2f3d129d
+Author: Loic Dachary
+Date: Wed Nov 26 23:35:21 2014 +0100
+
+    objectstore_tool: filter --op list and explore all PGs
+
+    The positional object name is used to filter the output of --op list and
+    only show the objects with a matching name. If both the object name and
+    the pgid are omitted, all objects from all PGs are displayed.
+
+    The output format is changed from
+
+        {"oid":"GROUP","key":"","snapid":-2,
+         "hash":2659194943,"max":0,"pool":0,"namespace":""}
+
+    to
+
+        [["0.7_head",{"oid":"GROUP","key":"","snapid":-2,
+                      "hash":2659194943,"max":0,"pool":0,
+                      "namespace":""}]]
+
+    where the first member is the pgid where the object is found.
+
+    Signed-off-by: Loic Dachary
+    (cherry picked from commit c69aaceac7f370e5369d511bf17898adc338ae43)
+
+commit 2f874fd3715d216a68658e94a5c741e665c76632
+Author: Loic Dachary
+Date: Wed Nov 26 23:34:22 2014 +0100
+
+    objectstore_tool: lookup objects by name
+
+    If the object is not a parsable JSON string, assume an object name and
+    look it up in all the PGs. If multiple objects have the same name, only
+    apply the command to one of them. It is primarily useful in a test
+    environment where the names of the test objects are known and only a
+    small number of objects exist. It replaces the following:
+
+        path='--data-path dev/osd0 --journal-path dev/osd0.journal'
+        for pgid in $(./ceph_objectstore_tool $path --op list-pgs) ; do
+            object=$(./ceph_objectstore_tool $path --pgid $pgid --op list |
+                     grep '"oid":"NAME"')
+            test -n "$object" && break
+        done
+        ./ceph_objectstore_tool $path --pgid $pgid "$object" remove
+
+    with:
+
+        ./ceph_objectstore_tool $path NAME remove
+
+    http://tracker.ceph.com/issues/10192 Fixes: #10192
+
+    Signed-off-by: Loic Dachary
+    (cherry picked from commit 7c1165f96391821c00cca1ac04b3433dbec6bb6e)
+
+    Conflicts:
+        src/tools/ceph_objectstore_tool.cc
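+    For illustration only (not part of any commit above): a minimal Python
+    sketch of a consumer of the new one-JSON-document-per-line listing, where
+    each line is a [pgid, {json object}] pair; the helper name find_object
+    is hypothetical.
+
+        import json
+        import subprocess
+
+        def find_object(path_args, name):
+            # return (pgid, obj) for the first object whose oid matches name
+            out = subprocess.check_output(
+                ["./ceph_objectstore_tool"] + path_args + ["--op", "list"])
+            for line in out.decode().splitlines():
+                pgid, obj = json.loads(line)  # [pgid, {json object}]
+                if obj.get("oid") == name:
+                    return pgid, obj
+            return None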
+commit eb48aba2119959c53ea8a103b53f0c2e07c52acb
+Author: Loic Dachary
+Date: Wed Nov 26 17:30:30 2014 +0100
+
+    objectstore_tool: refactor list-lost and fix-lost
+
+    Abstract out the PG exploration loops and encapsulate the list-lost and
+    fix-lost semantics in a callable object.
+
+    Signed-off-by: Loic Dachary
+    (cherry picked from commit d9e747b1bdb53d1fe543ef311e3db35fb78d8051)
+
+commit eab3226edc8ecc6d2d93f463e019ef4eacb9f468
+Author: Loic Dachary
+Date: Wed Nov 26 17:26:54 2014 +0100
+
+    objectstore_tool: update usage strings
+
+    Signed-off-by: Loic Dachary
+    (cherry picked from commit a90233c8b79ae6c035b5169c7f8809f853631689)
+
+    Conflicts:
+        src/tools/ceph_objectstore_tool.cc
+
+commit 5bbe41cb1e7faf40a9c53a9bdc01db8c35670a36
+Author: David Zafman
+Date: Tue Nov 18 13:00:15 2014 -0800
+
+    ceph_objectstore_tool: Add feature called set-allow-sharded-objects
+
+    Uses --op set-allow-sharded-objects option
+    This operation will be rejected if on the target OSD's osdmap there is
+    at least one OSD which does not support ERASURE CODES.
+    Prompt the user that they could import if sharded state were allowed
+    Prompt the user to use the new feature if sharded state is found inconsistent
+
+    Fixes: #10077
+
+    Signed-off-by: David Zafman
+    (cherry picked from commit f3dab446fc8e58b3b3d9334b8c38722e73881b9e)
+
+    Conflicts:
+        src/tools/ceph_objectstore_tool.cc
+
+commit 4e30d8c53c0a4952f81cc3b5680c7e92fffb1355
+Author: David Zafman
+Date: Tue Nov 18 11:59:18 2014 -0800
+
+    ceph_objectstore_tool: Add utility routine get_osdmap()
+
+    Signed-off-by: David Zafman
+    (cherry picked from commit b3021b0d3362000b5938d406ed2e032a8eb38405)
+
+    Conflicts:
+        src/tools/ceph_objectstore_tool.cc
+
+commit f997b126afaeadebe6b6d76954fbe2008bd2a7d8
+Author: David Zafman
+Date: Wed Nov 12 21:14:11 2014 -0800
+
+    ceph_objectstore_tool: Clear ...INCOMPAT_SHARDS from feature if exporting replicated pg
+
+    Don't require importing OSD to have shards feature for replicated pg
+
+    http://tracker.ceph.com/issues/10077 Fixes: #10077
+
+    Signed-off-by: David Zafman
+    (cherry picked from commit 86baf2d38170ef19de2dd5e9ce3f280237d8474d)
+
+commit 3f862da57945f821ed459f5a77f8032331c9cb20
+Author: David Zafman
+Date: Tue Nov 18 00:10:41 2014 -0800
+
+    tests: ceph_objectstore_tool.py test all variants of export/import
+
+    Handle change of error message text
+
+    Signed-off-by: David Zafman
+    (cherry picked from commit 9e53c3554c71121f2e2dd0234b5232da37ad5a1b)
+
+commit 2a58bdee19eb8f02a2800d728d402e76566d7f58
+Author: David Zafman
+Date: Mon Nov 17 23:23:40 2014 -0800
+
+    ceph_objectstore_tool: Make --file option consistent by treating "-" as stdout/stdin
+
+    Signed-off-by: David Zafman
+    (cherry picked from commit 8c87f3284f87d1121218cb7f41edc81b74c9df29)
+
+commit 70329be7be5d80e52d5424958cecd7f4d939add6
+Author: David Zafman
+Date: Sat Nov 15 11:43:10 2014 -0800
+
+    tests: ceph_objectstore_tool.py fix list-attr for erasure code
+
+    Adding testing of xattr for erasure coded shards
+    Fix error message when finding an unexpected xattr key
+
+    Signed-off-by: David Zafman
+    (cherry picked from commit cbecab477a70782f2f69258f035e78fb5c829810)
+
+    Conflicts:
+        src/test/ceph_objectstore_tool.py
+
+commit bc921c7ecbc061ccef4847d967986c9fa902111f
+Author: David Zafman
+Date: Sat Nov 15 11:46:15 2014 -0800
+
+    tests: ceph_objectstore_tool.py check for malformed JSON for erasure code objs
+
+    Signed-off-by: David Zafman
+    (cherry picked from commit 40717aa4c399e87d2c3e32038f78788eb213f87d)
+
+commit 99ffd137f17c438e3ee9dbbc0ab73cdcd3a45a5c
+Author: David Zafman
+Date: Sat Nov 15 11:44:54 2014 -0800
+
+    tests: ceph_objectstore_tool.py fix off-by-1 ATTR_OBJS handling
+
+    Signed-off-by: David Zafman
+    (cherry picked from commit eaf1d1e35243566c46b478788e79e0ebf7583015)
+
+commit 951c951a0f164271c5e9b96ecbd510cf5c6663b6
+Author: Loic Dachary
+Date: Fri Nov 14 11:00:17 2014 +0100
+
+    tests: ceph_objectstore_tool.py skip if /dev/tty fails
+
+    Some environments do not have a /dev/tty. When opening /dev/tty fails,
+    skip the test instead of returning an error.
+
+    Signed-off-by: Loic Dachary
+    (cherry picked from commit 4c94f1778fdf483e9e0b62f89f7e46e78aeeebf3)
+
+commit 8dc263555d7aca2befa912c78c585a43c8e7592c
+Author: Loic Dachary
+Date: Thu Nov 13 19:15:50 2014 +0100
+
+    tests: ceph_objectstore_tool.py encapsulate init-ceph stop
+
+    Call init-ceph in kill_daemons and add a call to kill_daemons when main
+    returns on error so that it never leaves daemons hanging.
+
+    Signed-off-by: Loic Dachary
+    (cherry picked from commit fc435ff3623d196dd7cc375302acd8dfc9eb59fd)
+
+commit b78d802f3e082b0762203ae37b3c3a44b7608907
+Author: Loic Dachary
+Date: Thu Nov 13 19:14:49 2014 +0100
+
+    tests: ceph_objectstore_tool.py main returns
+
+    Instead of calling sys.exit(), the main function returns the desired exit
+    code.
+
+    Signed-off-by: Loic Dachary
+    (cherry picked from commit 0f3d7b1315f2b5595047d8bd13949ed0d9194bfa)
+
+commit 04e480b09b59f5318e9b206e6c3e529d8bb22328
+Author: Loic Dachary
+Date: Thu Nov 13 17:32:14 2014 +0100
+
+    tests: ceph_objectstore_tool.py replace stop.sh with init-ceph
+
+    The stop.sh script stops all ceph-* processes. Use the init-ceph script
+    instead to selectively kill the daemons run by the vstart.sh cluster
+    used for ceph_objectstore_tool.
+
+    Signed-off-by: Loic Dachary
+    (cherry picked from commit e8f34bd62bf282144b8851fb9764cf4429a49c25)
+
+commit 51855584eb970b28dfa266ee3653963fb77a2b1e
+Author: Loic Dachary
+Date: Thu Nov 13 17:30:29 2014 +0100
+
+    tests: ceph_objectstore_tool.py use a dedicated directory
+
+    Set CEPH_DIR to a directory that is specific to ceph_objectstore_tool so
+    that it can run in parallel with other vstart.sh clusters.
+
+    Signed-off-by: Loic Dachary
+    (cherry picked from commit e303d1afde58e68c1f587962010da9e1f1278bc3)
+
+    Conflicts:
+        src/test/ceph_objectstore_tool.py
+
+commit 454ec85ae449cb20c2ddecade421262d9d9de615
+Author: Loic Dachary
+Date: Thu Nov 13 17:27:01 2014 +0100
+
+    tests: ceph_objectstore_tool.py run faster by default
+
+    By default use only a small number of objects to speed up the tests. If
+    the argument "big" is given, use a large number of objects, as it may
+    help find some problems.
+
+    Signed-off-by: Loic Dachary
+    (cherry picked from commit 235257c257aea98b770d9637957818c8aeb6c745)
+
+commit 427fe9177f90db091685b937839dcc4dfe4c8a01
+Author: Loic Dachary
+Date: Thu Nov 13 17:21:48 2014 +0100
+
+    tests: ceph_objectstore_tool.py run mon and osd on specific port
+
+    By default vstart.sh runs MDS but they are not needed for the tests,
+    so only run mon and osd instead. Instead of using the default vstart.sh
+    port, which may conflict with an already running vstart.sh, set
+    CEPH_PORT=7400, which is not used by any other test run with make check.
+
+    Signed-off-by: Loic Dachary
+    (cherry picked from commit f04d4e7539bc8c1b6cf94db815f9dcdecc52faa2)
+
+commit 28ed34a7dc32033be61cd61178eb59471de45aac
+Author: Loic Dachary
+Date: Thu Nov 13 17:16:41 2014 +0100
+
+    tests: ceph_objectstore_tool.py can use a WARN cluster
+
+    The tests do not need HEALTH_OK exclusively; a HEALTH_WARN cluster can
+    also run them successfully.
+
+    Signed-off-by: Loic Dachary
+    (cherry picked from commit 783378c019aaac36d542e1b12c0d64196ea21012)
+
+commit 8548d7c65e8743d4e743c77981acd702efab3fb2
+Author: Loic Dachary
+Date: Thu Nov 13 17:12:35 2014 +0100
+
+    tests: ceph_objectstore_tool.py use env python
+
+    Using #!/usr/bin/env python instead of a hard-coded path is more flexible;
+    it can also be used to run from a virtualenv.
+
+    Signed-off-by: Loic Dachary
+    (cherry picked from commit 74506d2506d03d05935cbe342fef9dc1d9022a13)
+
+commit 5b4c2ee0902d94e6fa5bee67188fddcf0e0c38a2
+Author: David Zafman
+Date: Wed Nov 12 15:22:04 2014 -0800
+
+    ceph_objectstore_tool: Fixes to make import work again
+
+    The is_pg() call is now true even for pgs pending removal, so fix broken
+    finish_remove_pgs() by removing the is_pg() check.
+    Need to add create_collection() to the initial transaction on import
+
+    Fixes: #10090
+
+    Signed-off-by: David Zafman
+    Reviewed-by: Sage Weil
+    (cherry picked from commit 5ce09198bf475e5c3a2df26232fa04ba9912b103)
+
+    Conflicts:
+        src/tools/ceph_objectstore_tool.cc
+
+commit e0dab883d29bd3a54b5707f8d3ea830f8a1ce516
+Author: David Zafman
+Date: Mon Oct 6 18:26:44 2014 -0700
+
+    ceph_objectstore_tool: Accept CEPH_ARGS environment arguments
+
+    Signed-off-by: David Zafman
+    (cherry picked from commit 10fe7cfe561f91717f2ac2e13aeecc06a903704e)
+
+commit 5f788eaa61054c6cb27960b5544b321078706343
+Author: David Zafman
+Date: Fri Oct 3 15:12:28 2014 -0700
+
+    ceph_objectstore_tool: MemStore needs a CephContext
+
+    Pass g_ceph_context to ObjectStore::create(), needed by MemStore
+
+    Fixes: #9661
+
+    Signed-off-by: David Zafman
+    (cherry picked from commit 0b155d00c542f0d8b8b5b0324dac4b3cf7ff37b5)
+
+commit 7470c7ca02ef6313b925be3ce4b27437a0c2e1e0
+Author: David Zafman
+Date: Tue Mar 3 10:41:28 2015 -0800
+
+    ceph_objectstore_tool: Rename generated binary to ceph-objectstore-tool
+
+    Signed-off-by: David Zafman
+    (cherry picked from commit 4f72ba545e204a24a55adead43c61cb1d4394381)
+
+    Conflicts:
+        debian/ceph-test.install
+        src/.gitignore
+        src/test/ceph-helpers.sh (doesn't exist in firefly)
+        src/test/ceph_objectstore_tool.py
+        src/tools/ceph_objectstore_tool.cc
+
+commit 4b51645fdfe8a761f7ebc0faee1a87187d498fee
+Author: Sage Weil
+Date: Wed Nov 12 13:35:43 2014 -0800
+
+    vstart.sh: warn less
+
+    Signed-off-by: Sage Weil
+    (cherry picked from commit a69b8450f642af91a352d0de4378d93828291933)
+
+commit 3075919c0dcee1f33a0beb299e98d8a88836c5f8
+Author: David Zafman
+Date: Mon Nov 17 23:02:50 2014 -0800
+
+    ceph_objectstore_tool: When exporting to stdout, don't cout messages
+
+    Fixes: #10128
+    Caused by a2bd2aa7
+
+    Signed-off-by: David Zafman
+    (cherry picked from commit 0d5262ac2f69ed3996af76a72894b1722a27b37d)
+    (cherry picked from commit 6cb9a2499cac2645e2cc6903ab29dfd95aac26c7)
+
+commit ca9df2803f3200431f5f7ea99a713005f15b7f5a
+Author: Danny Al-Gaaf
+Date: Tue Sep 2 14:56:10 2014 +0200
+
+    ceph_objectstore_tool.cc: prefer ++operator for non-primitive iterators
+
+    Signed-off-by: Danny Al-Gaaf
+    (cherry picked from commit a5468abe4459159e8a9f7a4f21d082bb414e1cdd)
+
+commit ac396f5df3c51d09b9dbf7e6be7bed1d755d2317
+Author: Zhiqiang Wang
+Date: Fri Aug 29 16:39:40 2014 +0800
+
+    Test: fixing a compile warning in ceph_objectstore_tool.cc
+
+    For the compiler's sake:
+    tools/ceph_objectstore_tool.cc:2547:15: warning: 'r' may be used
+    uninitialized in this function [-Wmaybe-uninitialized]
+
+    Signed-off-by: Zhiqiang Wang
+    (cherry picked from commit c3e1466b46076f133b62f98e2c0b712bdde0e119)
+commit b863970110e7f8e835e77864ea59b81f0b026158
+Author: David Zafman
+Date: Wed Aug 20 01:33:45 2014 -0700
+
+    ceph_objectstore_tool: Bug fixes and test improvements
+
+    ceph_objectstore_tool:
+      Fix bugs in the way collection_list_partial() was being called,
+      which caused objects to be seen over and over again.
+
+    Unit test:
+      Fix get_objs() to walk pg tree for pg with sub-directories
+      Create more objects to test object listing code
+      Limit number of larger objects
+      Limit number of objects which get attributes and omaps
+
+    Signed-off-by: David Zafman
+    (cherry picked from commit a03f719eb3a46f410550afce313e6720e0c27946)
+
+commit 4f83005bb8a615df370de7b6dfe5d926c7cfef7f
+Author: David Zafman
+Date: Tue Aug 5 18:26:11 2014 -0700
+
+    ceph_objectstore_tool, test: Implement import-rados feature and unit test code
+
+    Fixes: #8276
+
+    Signed-off-by: David Zafman
+    (cherry picked from commit 23ec93a86140c4b271b45d87c62682288079cbba)
+
+commit df334617705fe862b820ef5de674ae54efad0cea
+Author: David Zafman
+Date: Wed Aug 6 19:53:43 2014 -0700
+
+    test: ceph_objectstore_tool unit test improvements
+
+    Add namespaces to testing
+    Increase filesize so export will have multiple chunks
+    Put json for each object into the db dict
+
+    Signed-off-by: David Zafman
+
+commit ecd25cf6ce1a1a34e536c7fd313225b3bdedd2e0
+Author: David Zafman
+Date: Thu Aug 7 13:31:48 2014 -0700
+
+    ceph_objectstore_tool: Add operation "rm-past-intervals" for testing purposes
+
+    Signed-off-by: David Zafman
+    (cherry picked from commit 3694068b67fd625495c4511390bc5bcbfbbd28f5)
+
+commit 6b50b384efc1f0735f8635a59663b50e3155de1a
+Author: David Zafman
+Date: Thu Aug 7 11:46:08 2014 -0700
+
+    ceph_objectstore_tool: Add past_intervals to export/import code
+
+    Signed-off-by: David Zafman
+    (cherry picked from commit 9325ec15d4b89c5537cbcbda4c7594e25dc6e7df)
+
+commit 63529079b97c99cbaa863e1be865e2693e90c556
+Author: David Zafman
+Date: Thu Aug 7 14:11:21 2014 -0700
+
+    ceph_objectstore_tool: Minor improvements
+
+    Make all non-error, non-debug output go to stdout
+    Fix a message
+
+    Signed-off-by: David Zafman
+    (cherry picked from commit a2bd2aa7babb4ad45ba21c70f9d179fda27742aa)
+
+commit 5e3f89ece7bdd09ed06ca4208cfa0a0b3104f109
+Author: David Zafman
+Date: Tue Aug 5 12:26:42 2014 -0700
+
+    ceph_objectstore_tool, test: Add list-pgs operations and unit test case
+
+    Signed-off-by: David Zafman
+    (cherry picked from commit f01e334c697057158354f0ce5ecff6d6ba8e2704)
+
+commit edc9f71efeebe5241004c669cc58089905907634
+Author: David Zafman
+Date: Wed Jul 30 12:39:49 2014 -0700
+
+    Complete replacement of ceph_filestore_tool and ceph_filestore_dump
+    with unified ceph_objectstore_tool
+
+    Move list-lost-objects and fix-lost-objects features from
+    ceph_filestore_tool to ceph_objectstore_tool as list-lost, fix-lost
+    Change --type to --op for info, log, export...operations
+    Add --type for the ObjectStore type (defaults to filestore)
+    Change --filestore-path to --data-path
+    Update installation, Makefile.am, and .gitignore
+    Fix and rename test case to match
+    Add some additional invalid option checks
+
+    Signed-off-by: David Zafman
+    (cherry picked from commit 83fbc91e5c4e52cc1513f34908f99d2ac3b930df)
+
+commit a42273ecd955470105cba1cc4ac7eb782ac46833
+Author: David Zafman
+Date: Wed Jul 30 11:22:29 2014 -0700
+
+    Renames and removal towards a unified ceph_objectstore_tool
+
+    Rename ceph_filestore_dump.cc and ceph_filestore_dump.py
+    Remove ceph_filestore_tool.cc
+
+    Signed-off-by: David Zafman
+    (cherry picked from commit
77864193a1162393ade783480aee39a232934377) + +commit 9ee2c27096784efceb02b06a0df4325979385f44 +Author: David Zafman +Date: Tue May 20 11:19:19 2014 -0700 + + ceph_filestore_dump: Add set-omaphdr object command + + Signed-off-by: David Zafman + (cherry picked from commit b4d95cc85af9af64d33d541cd69c5f28fd45423b) + +commit c7d261d40367ab01a577bf255da776a622f8984a +Author: David Zafman +Date: Tue May 20 10:44:37 2014 -0700 + + ceph_filestore_dump: Add get-omaphdr object command + + Signed-off-by: David Zafman + (cherry picked from commit 30c0f3114e665acdd99e64bf0d2a7399b33e8d61) + +commit 271e3ca19b9e878c6c3c26a9ec461faf06e7a19d +Author: David Zafman +Date: Mon May 19 20:55:47 2014 -0700 + + ceph_filestore_dump: Add rm-omap object command + + Signed-off-by: David Zafman + (cherry picked from commit 0fc6bd2777edf24a044f454beacf1647cc52f9fe) + +commit 20165d101a30c6beb591ca56b56bdf5505f70cf3 +Author: David Zafman +Date: Mon May 19 20:47:14 2014 -0700 + + ceph_filestore_dump: Add set-omap object command + + Signed-off-by: David Zafman + (cherry picked from commit 50cd57e902fe508f98f63fea30626780b07561d9) + +commit 7547f3d17bc89437c529aa96413b0bebb808da5e +Author: David Zafman +Date: Mon May 19 20:37:01 2014 -0700 + + ceph_filestore_dump: Add get-omap object command + + Signed-off-by: David Zafman + (cherry picked from commit b50c43ce5e52f5bbcb3684f6793d50f34ed741d1) + +commit f391feb1f50632adbe94c3e8cdc4b75091d6b8e1 +Author: David Zafman +Date: Mon May 19 18:33:24 2014 -0700 + + ceph_filestore_dump: Add rm-attr object command + + Signed-off-by: David Zafman + (cherry picked from commit 465d77733c7499fbd65bebe7141895714c625e0d) + +commit 7bd2dd3a1d022df6d3f886ad12a191d0cfcef1d6 +Author: David Zafman +Date: Mon May 19 18:17:27 2014 -0700 + + ceph_filestore_dump: Add set-attr object command + + Signed-off-by: David Zafman + (cherry picked from commit 95554e03dcb74b6d74b2f1b2891b3570abb187b8) + +commit 6724da821158ddee6ef6ee7b5bac9e97dcfc2292 +Author: David Zafman +Date: Thu May 15 15:50:48 2014 -0700 + + ceph_filestore_dump: Add get-attr object command + + Signed-off-by: David Zafman + (cherry picked from commit 55d43c0e20fc853daec134449b9954248fd7ef31) + +commit 55c21b898834d77234227d3fc14c8580ef698663 +Author: David Zafman +Date: Wed May 14 17:52:09 2014 -0700 + + ceph_filestore_dump: Add set-bytes object command + + Signed-off-by: David Zafman + (cherry picked from commit 3c24d1f46a624d0a053ad234997a1f8c8b036db5) + +commit 2f1926d2f57082666350d8223b09f61da5f95b6f +Author: David Zafman +Date: Wed May 14 17:51:29 2014 -0700 + + ceph_filestore_dump: Add get-bytes object command + + Signed-off-by: David Zafman + (cherry picked from commit 869dd92cc8ec29a3a684f88c335d359f225bba24) + +commit fcabb8133af3b90d5d9e976ce658ceccfc5b89c5 +Author: David Zafman +Date: Wed May 14 17:50:16 2014 -0700 + + ceph_filestore_dump: Add list-omap object command + + Signed-off-by: David Zafman + (cherry picked from commit 48890c7741d76cf92b5f589f49378ca57292e88b) + +commit 303e4cedd91ca3553e956eec495a05e3136b3c56 +Author: David Zafman +Date: Wed May 14 18:32:42 2014 -0700 + + ceph_filestore_dump: Add list-attrs object command + + Signed-off-by: David Zafman + (cherry picked from commit 00c6b0673288ca76fe144575b7af76eaa36f5857) + +commit aff2c995f67bdde1a592a9b24e4e96e85735d500 +Author: David Zafman +Date: Wed May 14 17:39:17 2014 -0700 + + ceph_filestore_dump: Add --type list to output objects in a pg in json + + Signed-off-by: David Zafman + (cherry picked from commit 844dabb7f311e68eba0293ae9ca4c68521745877) + +commit 
7cda8b7a0b43d709b73d875088ecd169f47d59ab
+Author: David Zafman
+Date: Wed May 14 17:44:31 2014 -0700
+
+    ceph_filestore_dump: Add remove object command and usage for new commands
+
+    Signed-off-by: David Zafman
+    (cherry picked from commit 605caec64b036f8ab5ae451d7e9e7515d414f28e)
+
+commit a4694643ae4503746d3fac8a0feac706ddc13a16
+Author: David Zafman
+Date: Fri Jun 6 17:05:53 2014 -0700
+
+    ceph_filestore_dump: Add utility function get_fd_data()
+
+    Signed-off-by: David Zafman
+    (cherry picked from commit d4a9dafe442f139562497d746f80ba49faa954e8)
+
+commit 62dc823fb3b969c0ad52594419e8a86a3ff1e6ef
+Author: David Zafman
+Date: Mon May 19 18:16:52 2014 -0700
+
+    ceph_filestore_dump: Fail import/export with a tty for stdin/stdout
+
+    Signed-off-by: David Zafman
+    (cherry picked from commit 7520e504cf2cdd3de2f236acb2cbf8a5016e6698)
+
+commit 9816f872ad59bcaa1a125b297f3991b333aad39c
+Author: David Zafman
+Date: Tue May 20 11:56:20 2014 -0700
+
+    ceph_filestore_dump: Save if stdout is a tty and add routine to clean binary strings
+
+    Signed-off-by: David Zafman
+    (cherry picked from commit 3a574cc78b0e3ec6d8dd0c39ee20e7a54ad64056)
+
+commit d4aedeb833f23bf4ce6187cb82910ab2e71d48e5
+Author: David Zafman
+Date: Wed May 14 15:30:11 2014 -0700
+
+    common: Add missing ghobject_t::decode() for json
+
+    Signed-off-by: David Zafman
+    (cherry picked from commit c05f895d15a1d0e78ff5e9ae1a83f0a5424103d0)
+
+    Changes:
+        Adjusted for older shard_t
+
+commit dadecb1e05e528093642ba356fa7a70a0b546727
+Author: David Zafman
+Date: Wed May 14 15:37:17 2014 -0700
+
+    ceph_filestore_dump: Add --skip-journal-replay and --skip-mount-omap
+
+    Signed-off-by: David Zafman
+    (cherry picked from commit 2e9dcb256509e7c921556202052f0cc1dcd91398)
+
+commit c6369987b1e1e55a9d0ab0bc328f61f52fc608d0
+Author: David Zafman
+Date: Wed May 14 15:41:15 2014 -0700
+
+    os: Add optional flags to generic ObjectStore creation (SKIP_JOURNAL_REPLAY
+    and SKIP_MOUNT_OMAP)
+
+    Only FileStore cares about these flags, so passed on during create()
+
+    Signed-off-by: David Zafman
+    (cherry picked from commit 3d9fde9d92322cd8ac3e3d8bcbf5b0a01ef0528b)
+
+    Conflicts:
+        src/os/FileStore.cc
+
+commit 3381aebb113d14249f6998a86ebf6b4ec6adc42d
+Author: David Zafman
+Date: Fri May 16 18:20:11 2014 -0700
+
+    ceph_filestore_dump: Improve debug output by showing actual offset
+
+    Signed-off-by: David Zafman
+    (cherry picked from commit 44b261d5d1b36528bfbcb37dbd866b615e14be99)
+
+commit 1164b2e1610028a40cddbed09f9da5649bd2023a
+Author: David Zafman
+Date: Wed May 14 12:36:37 2014 -0700
+
+    ceph_filestore_dump: Use cerr now that we aren't closing stderr
+
+    Signed-off-by: David Zafman
+    (cherry picked from commit 087c0f9d31e0f3d5bae7eac6231978105a71677e)
+
+commit f224429aa4fcba897be5e438bbb49d1025e2c482
+Author: David Zafman
+Date: Wed May 14 12:42:21 2014 -0700
+
+    common,ceph_filestore_dump: Add ability for utilities to suppress library dout output
+
+    Suppress dout output with CODE_ENVIRONMENT_UTILITY_NODOUT
+    ceph_filestore_dump turns on dout output if --debug is specified
+    When used it can still be enabled with --log-to-stderr --err-to-stderr
+
+    Signed-off-by: David Zafman
+    (cherry picked from commit f7f9b251fc377651d8da4cbfd1942c3b86f3247e)
+
+commit 3f4cabdb84e58fcec0c3f508f980881c59fba948
+Author: David Zafman
+Date: Tue May 13 18:27:30 2014 -0700
+
+    ceph_filestore_dump: Export omap in batches for large omap case
+
+    New function get_omap_batch()
+    Create a TYPE_OMAP section for each batch
+
+    Signed-off-by: David Zafman
+    (cherry picked from commit 501dd3c05b8983159a289b021943cb828b908f53)
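+    For illustration only (not from the patch): a rough Python sketch of the
+    batching idea behind get_omap_batch(); the helper names here are
+    hypothetical.
+
+        def export_omap_in_batches(omap_iter, write_section, batch_size=1000):
+            # emit one TYPE_OMAP section per batch so a very large omap is
+            # never held in memory all at once
+            batch = {}
+            for key, value in omap_iter:
+                batch[key] = value
+                if len(batch) >= batch_size:
+                    write_section("TYPE_OMAP", batch)
+                    batch = {}
+            if batch:
+                write_section("TYPE_OMAP", batch)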
+commit 0b757af5be338b65fd9124ac5158bfe02ad5f899
+Author: David Zafman
+Date: Mon May 12 15:50:34 2014 -0700
+
+    ceph_filestore_dump: Remove unused bufferlist databl
+
+    Signed-off-by: David Zafman
+    (cherry picked from commit 398b418e2b9f8260bcfacac8bcebea5beffcceca)
+
+commit 4a742fe29b6d959912a38d132344c695f89dd34f
+Author: Danny Al-Gaaf
+Date: Wed May 7 14:12:15 2014 +0200
+
+    ceph_filestore_dump.cc: cleanup includes
+
+    Signed-off-by: Danny Al-Gaaf
+    (cherry picked from commit 8620609884243596d35b69c571d2da751e63cf2b)
+
+commit 420ea03aa3cd52bd035d31ba111c3d6d0745352d
+Author: Radoslaw Zarzynski
+Date: Thu Jan 29 18:19:16 2015 +0100
+
+    librados: rectify the guard in RadosClient::wait_for_osdmap().
+
+    RadosClient::wait_for_osdmap() did not signal the lack of a connection
+    via the -ENOTCONN error code when the Objecter instance was allocated.
+    The proper way is to check the connection state explicitly.
+
+    Signed-off-by: Radoslaw Zarzynski
+    (cherry picked from commit 34473f78f101d87d2606e0e7112682a47ff24830)
+
+    Conflicts:
+        src/librados/RadosClient.cc
+            the modified guard was not present: add the new guard instead
+            of modifying it
+
+commit 1b2667211f90a1b630d2ddffe99b0fb00bb3c07c
+Author: Radoslaw Zarzynski
+Date: Mon Jan 19 15:07:21 2015 +0100
+
+    librados: fix resource leaks in RadosClient::connect().
+
+    If RadosClient::connect was called a second time (which could
+    happen as a part of recovery from failure), the instances
+    of Objecter and Messenger allocated by the first call were leaked.
+
+    Additionally, the implementation of the method wrongly reported
+    memory allocation problems -- it threw a std::bad_alloc exception
+    instead of returning the -ENOMEM error code.
+
+    Fixes: #10425
+    Signed-off-by: Radoslaw Zarzynski
+    (cherry picked from commit 624c056da093c8741242892413438a291c03c7d5)
+
+    Conflicts:
+        src/librados/RadosClient.cc
+            resolve adding (std::nothrow) that failed because the
+            prototype of the constructor is not the same
+
+commit 4e32ff2b60549742d01b18429810c89f5f707548
+Author: Samuel Just
+Date: Fri Dec 5 15:29:52 2014 -0800
+
+    osd_types: op_queue_age_hist and fs_perf_stat should be in osd_stat_t::operator==
+
+    Fixes: #10259
+    Backport: giant, firefly, dumpling
+    Signed-off-by: Samuel Just
+    (cherry picked from commit 1ac17c0a662e6079c2c57edde2b4dc947f547f57)
+
+commit 6e58732f546ec6241b198d2473902d66327cdc36
+Author: Joao Eduardo Luis
+Date: Mon Jan 19 18:49:15 2015 +0000
+
+    mon: PGMonitor: skip zeroed osd stats on get_rule_avail()
+
+    Fixes: #10257
+
+    Signed-off-by: Joao Eduardo Luis
+    (cherry picked from commit b311e7c36273efae39aa2602c1f8bd90d39e5975)
+
+    Conflicts:
+        src/mon/PGMonitor.cc
+            ceph::unordered_map changed the context, simple resolution
+
+commit bcc8cfb24a96a7874a24760771755088a231a8d7
+Author: Joao Eduardo Luis
+Date: Fri Jan 16 18:13:05 2015 +0000
+
+    mon: PGMonitor: available size 0 if no osds on pool's ruleset
+
+    get_rule_avail() may return < 0, which we were blindly using while
+    assuming it would always return an unsigned value. We would end up
+    with weird values if the ruleset had no osds.
+
+    Signed-off-by: Joao Eduardo Luis
+    (cherry picked from commit 8be6a6ab2aa5a000a39c73a98b11a0ab32fffa1c)
+
+commit 894c8ad26fd2da203dcbf8eb0ad5e2af0223d5a9
+Author: Joao Eduardo Luis
+Date: Fri Jan 16 18:12:42 2015 +0000
+
+    mon: PGMonitor: fix division by zero on stats dump
+
+    Signed-off-by: Joao Eduardo Luis
+    (cherry picked from commit 50547dc3c00b7556e26b9a44ec68640c5c3a2384)
+
+commit 1e58bb49b99118d064c1ca92e42cbfb2786fdaff
+Author: Loic Dachary
+Date: Wed Dec 17 16:06:55 2014 +0100
+
+    crush: set_choose_tries = 100 for erasure code rulesets
+
+    It is common for people to try to map 9 OSDs out of a cluster with
+    9 OSDs total. The default tries (50) will frequently lead to bad
+    mappings for this use case. Changing it to 100 makes no significant
+    CPU performance difference, as tested manually by running crushtool
+    on one million mappings.
+
+    http://tracker.ceph.com/issues/10353 Fixes: #10353
+
+    Signed-off-by: Loic Dachary
+    (cherry picked from commit 2f87ac807f3cc7ac55d9677d2051645bf5396a62)
+
+    Conflicts:
+        src/erasure-code/lrc/ErasureCodeLrc.cc
+            safely ignored because the file does not exist
+
+commit 27dbbb3c312ea68a96b011ccb12394c75c0fb0f3
+Author: Sage Weil
+Date: Thu Feb 12 13:49:50 2015 -0800
+
+    mon/OSDMonitor: do not trust small values in osd epoch cache
+
+    If the epoch cache says the osd has epoch 100 and the osd is asking for
+    epoch 200+, do not send it 100+.
+
+    Fixes: #10787
+    Backport: giant, firefly
+    Signed-off-by: Sage Weil
+    (cherry picked from commit a5759e9b97107488a8508f36adf9ca1aba3fae07)
diff --git a/doc/changelog/v0.80.8.txt b/doc/changelog/v0.80.8.txt
new file mode 100644
index 0000000000000..849e933b1f79d
--- /dev/null
+++ b/doc/changelog/v0.80.8.txt
@@ -0,0 +1,2547 @@
+commit 69eaad7f8308f21573c604f121956e64679a52a7 (tag: refs/tags/v0.80.8)
+Author: Jenkins
+Date: Tue Jan 13 06:28:08 2015 -0800
+
+    0.80.8
+
+commit 3c7cacf00b66a0e17272cf67fe0823ee863dfa7c
+Author: Sage Weil
+Date: Tue Aug 19 16:43:02 2014 -0700
+
+    mon/OSDMonitor: fix double-free on old MOSDBoot
+
+    send_latest() does an m->put().
+
+    Backport: firefly, dumpling
+    Signed-off-by: Sage Weil
+    (cherry picked from commit 97f9b6df668315fba6a5924b79024c7a986f4110)
+
+commit 852d7b5b3c019c02c042b767fc88916088e1a94d
+Author: Sage Weil
+Date: Thu Jan 8 11:17:03 2015 -0800
+
+    osd: requeue PG when we skip handling a peering event
+
+    If we don't handle the event, we need to put the PG back into the peering
+    queue or else the event won't get processed until the next event is
+    queued, at which point we'll be processing events with a delay.
+
+    The queue_null is not necessary (and is a waste of effort) because the
+    event is still in pg->peering_queue and the PG is queued.
+
+    This is a firefly-specific patch; a (very) similar one will appear in master
+    in 492ccc900c3358f36b6b14a207beec071eb06707.
+ + Backport: giant, firefly + Signed-off-by: Sage Weil + +commit c26ebd38335bb361aade5aacd05ba3217e602b9c +Merge: b67b7e7 96ba529 +Author: Sage Weil +Date: Thu Jan 8 18:31:26 2015 -0800 + + Merge pull request #3217 from boydc2014/firefly + + clear data and payload after removed from ops_in_flight + + Reviewed-by: Sage Weil + +commit b67b7e7ad72a1af96f9fb26ade815e65f72b4cb0 +Merge: db92718 7faae89 +Author: Loic Dachary +Date: Fri Jan 9 01:32:17 2015 +0100 + + Merge pull request #3127 from ktdreyer/firefly-no-epoch + + Revert "ceph.spec.: add epoch" + + Reviewed-by: Ken Dreyer + Reviewed-by: Sage Weil + Reviewed-by: Samuel Just + Reviewed-by: Loic Dachary + +commit db927186288cd4c63a3483b42e9eb9e016c96156 +Merge: 0d4abda 820dbfd +Author: Loic Dachary +Date: Fri Jan 9 01:31:29 2015 +0100 + + Merge pull request #3128 from dachary/wip-10281-make-check-fedora-20 + + tests: fixes to run make check on fedora 20 + + + Reviewed-by: Sage Weil + Reviewed-by: Samuel Just + Reviewed-by: Loic Dachary + +commit 0d4abdaf80f1fedff7975d595abaac0a620c8035 +Merge: 1fdcb52 11995b3 +Author: Loic Dachary +Date: Fri Jan 9 01:30:59 2015 +0100 + + Merge pull request #3169 from ceph/wip-8797-firefly + + Wip 8797 firefly + + Reviewed-by: Sage Weil + Reviewed-by: Samuel Just + Reviewed-by: Loic Dachary + +commit 1fdcb524411a02b5627be66d9fd821a473223e9d +Merge: 4b7b1b0 465eede +Author: Loic Dachary +Date: Fri Jan 9 01:28:49 2015 +0100 + + Merge pull request #3179 from dachary/wip-9998-crush-underfloat-firefly + + crush: fix weight underfloat issue (firefly) + + + Reviewed-by: Sage Weil + Reviewed-by: Samuel Just + Reviewed-by: Loic Dachary + +commit 4b7b1b03bfcb7cb056783555884f211009ea5d46 +Merge: 4897ba4 f55b097 +Author: Loic Dachary +Date: Fri Jan 9 01:28:11 2015 +0100 + + Merge pull request #3220 from ceph/wip-mon-backports.firefly + + mon: backports for #9987 against firefly + + Reviewed-by: Joao Eduardo Luis + Reviewed-by: Sage Weil + Reviewed-by: Samuel Just + Reviewed-by: Loic Dachary + +commit 4897ba4a304bcac548b2121312cd7235c34dd5aa +Merge: efe801f f20225c +Author: Loic Dachary +Date: Fri Jan 9 01:26:30 2015 +0100 + + Merge pull request #3258 from ceph/wip-10372-firefly + + osd: fix librados pool deletion race on firefly + + Reviewed-by: Sage Weil + Reviewed-by: Samuel Just + Reviewed-by: Loic Dachary + +commit efe801ff3a0d25243da18937c07b89227edbaac4 +Author: Warren Usui +Date: Thu Dec 18 20:00:28 2014 -0800 + + If trusty, use older version of qemu + + Fixes #10319 + Signed-off-by: Warren Usui + (cherry-picked from 46a1a4cb670d30397979cd89808a2e420cef2c11) + +commit 96ba529ef8ce76e07b8444c94883afb3468d6762 +Author: Lei Dong +Date: Tue Dec 30 21:02:45 2014 +0800 + + clear data and payload inside ops_in_flight_lock + + http://tracker.ceph.com/issues/9916 Fixes: #9916 + Signed-off-by: Dong Lei + +commit e0648e3d30de504b096c4ae3bbe7d9c17652bdb5 +Merge: 455f940 3624006 +Author: Sage Weil +Date: Mon Dec 29 14:31:23 2014 -0800 + + Merge pull request #3264 from dachary/wip-jerasure-firefly + + erasure-code: update links to jerasure upstream + +commit 455f940908f242b1e34983af61351fd3045ce8ab +Merge: b1ab685 aa95a2d +Author: Sage Weil +Date: Mon Dec 29 10:55:39 2014 -0800 + + Merge pull request #3268 from ceph/firefly-10415 + + libcephfs/test.cc: close fd before umount + +commit 362400667aad0b5098fbe8dbec1b0bde059f84a6 +Author: Loic Dachary +Date: Sun Dec 28 10:29:54 2014 +0100 + + erasure-code: update links to jerasure upstream + + It moved from bitbucket to jerasure.org + + Signed-off-by: Loic Dachary + (cherry 
picked from commit 8e86f901939f16cc9c8ad7a4108ac4bcf3916d2c)
+
+commit aa95a2d20dbba2f3a775f709493c987d0d001e9c
+Author: Yan, Zheng
+Date: Tue Dec 23 10:22:00 2014 +0800
+
+    libcephfs/test.cc: close fd before umount
+
+    Fixes: #10415
+    Signed-off-by: Yan, Zheng
+    (cherry picked from commit d3fb563cee4c4cf08ff4ee01782e52a100462429)
+
+commit f20225cb99a0d2d08fccfdf88dc89d758ecba077 (refs/remotes/gh/wip-10372-firefly)
+Author: Sage Weil
+Date: Tue Dec 23 15:49:26 2014 -0800
+
+    osdc/Objecter: handle reply race with pool deletion
+
+    We need to handle this scenario:
+
+    - send request in epoch X
+    - osd replies
+    - pool is deleted in epoch X+1
+    - client gets map X+1, sends a map check
+    - client handles reply
+      -> asserts that no map checks are in flight
+
+    This isn't the best solution. We could infer that a map check isn't needed
+    since the pool existed earlier and doesn't now. But this is firefly and
+    the fix is no more expensive than the old assert.
+
+    Fixes: #10372
+    Signed-off-by: Sage Weil
+
+commit 820dbfd9947455d07426981b7152861c3c216080
+Author: Sage Weil
+Date: Tue Aug 5 15:11:18 2014 -0700
+
+    test/ceph-disk.sh: mkdir -p
+
+    Signed-off-by: Sage Weil
+    (cherry picked from commit c2f58e6694a2457200ab3d59e037ad17b9c82028)
+
+commit 77d393024f9d867b574b8ec8e15ec48a1a291511
+Author: Danny Al-Gaaf
+Date: Thu Sep 4 12:23:27 2014 +0200
+
+    test/ceph-disk.sh: resolve symlinks before check
+
+    Make sure symlinks are resolved in command_fixture() before comparing
+    the result of the which command and the current path.
+
+    Signed-off-by: Danny Al-Gaaf
+    (cherry picked from commit 8ea86dfa7c4a3d7e089cf9d4e49586657875f851)
+
+commit ed6ec2936513d7dd6c45bccd8edf69a12c71dc7b
+Author: Danny Al-Gaaf
+Date: Tue Jun 24 19:54:17 2014 +0200
+
+    test/ceph-disk.sh: fix for SUSE
+
+    On SUSE 'which' always returns the full path of (shell) commands and
+    not e.g. './ceph-conf' as on Debian. Also check for the full path
+    returned by which.
+
+    Signed-off-by: Danny Al-Gaaf
+    (cherry picked from commit 39530536ff923b91899f6303507c283b78040a20)
+
+commit 754363f4563e7dbda1ef23fadc8d6ef1a3fdd0af
+Author: Loic Dachary
+Date: Fri Jun 13 14:41:39 2014 +0200
+
+    tests: prevent kill race condition
+
+    When trying to kill a daemon, keep its pid in a variable instead of
+    retrieving it from the pidfile multiple times. It prevents the following
+    race condition:
+
+    * try to kill ceph-mon
+    * ceph-mon is in the process of dying and removed its pidfile
+    * try to kill ceph-mon fails because the pidfile is not found
+    * another ceph-mon is spawned and fails to bind the port
+      because the previous ceph-mon is still holding it
+
+    Signed-off-by: Loic Dachary
+    (cherry picked from commit a1c13c57ba20fc329d943ea57523913e11067dc7)
+
+commit 5be6f2f60e3225bf3d214432044721fe474d55d7
+Author: Danny Al-Gaaf
+Date: Wed Jun 25 00:31:48 2014 +0200
+
+    osd/OSD.cc: parse lsb release data via lsb_release
+
+    Use the lsb_release tool to be portable, since parsing /etc/lsb-release
+    is not the same between different distributions. The old code failed
+    to parse LSB information for e.g. SUSE products.
+
+    Fixes: #8654
+
+    Signed-off-by: Danny Al-Gaaf
+    (cherry picked from commit 0b3a3987d382ff33fdf892f189b30df22be80e59)
+
+commit b62187c52324a4489c2fc1385b1d6574a058f7e8
+Author: Loic Dachary
+Date: Tue Aug 26 21:59:39 2014 +0200
+
+    tests: histogram prevent re-use of local variables
+
+    By moving the tests to separate functions.
+ + http://tracker.ceph.com/issues/9235 Fixes: #9235 + + Signed-off-by: Loic Dachary + (cherry picked from commit 4b8b25ecd128c34a386ad7c4cc89f323c4d384e1) + +commit e2741c8f1d42cfe91b18201a6a49005d90d85d98 +Author: Loic Dachary +Date: Tue Aug 26 21:59:39 2014 +0200 + + tests: histogram prevent re-use of local variables + + By moving the test to a separate function. + + http://tracker.ceph.com/issues/9235 Fixes: #9235 + + Signed-off-by: Loic Dachary + (cherry picked from commit ee02293ad2ef050672fa8c164ba17b10e8d4ceeb) + +commit cf102df7a3f3b38824c26b9e44a21664fcf979a7 +Author: Loic Dachary +Date: Wed Dec 10 00:08:57 2014 +0100 + + tests: avoid tab interpretation problem on fedora 20 + + Use . instead of tab in echo to avoid variations in how escape sequences + are interpreted by the shell. + + http://tracker.ceph.com/issues/10281 Fixes: #10281 + + Signed-off-by: Loic Dachary + +commit f55b097764beb973c12866f2d7161c6bd870aa07 (refs/remotes/gh/wip-mon-backports.firefly) +Author: Sage Weil +Date: Sun Nov 2 08:50:59 2014 -0800 + + mon/PGMap and PGMonitor: update last_epoch_clean cache from new osd keys + + We were only invalidating the cached value from apply_incremental, which + is no longer called on modern clusters. + + Fix this by storing the update epoch in the key as well (it is not part + of osd_stat_t). + + Backport: giant, firefly, dumpling(?) + Fixes: #9987 + Signed-off-by: Sage Weil + (cherry picked from commit 093c5f0cabeb552b90d944da2c50de48fcf6f564) + +commit 1d314e7a9ab7af5b693583cf2faa5db54f6beb69 +Author: Sage Weil +Date: Sun Nov 2 08:49:48 2014 -0800 + + mon/PGMap: invalidate cached min_last_epoch_clean from new-style pg keys + + We were only invalidating the cache from the legacy apply_incremental(), + which is no longer called on modern clusters. + + Fixes: #9987 + Signed-off-by: Sage Weil + (cherry picked from commit 3fb731b722c50672a5a9de0c86a621f5f50f2d06) + +commit 465eedea9f7411b1e352dc3ccee60a3f1221541d +Author: Sage Weil +Date: Sun Nov 23 18:50:51 2014 -0800 + + crush/CrushWrapper: fix create_or_move_item when name exists but item does not + + We were using item_exists(), which simply checks if we have a name defined + for the item. Instead, use _search_item_exists(), which looks for an + instance of the item somewhere in the hierarchy. This matches what + get_item_weightf() is doing, which ensures we get a non-negative weight + that converts properly to floating point. + + Backport: giant, firefly + Fixes: #9998 + Reported-by: Pawel Sadowski + Signed-off-by: Sage Weil + (cherry picked from commit 9902383c690dca9ed5ba667800413daa8332157e) + +commit cee51af9c1dbde550177c95caf6c93f612442300 +Author: Sage Weil +Date: Fri Nov 21 17:47:56 2014 -0800 + + crush/builder: prevent bucket weight underflow on item removal + + It is possible to set a bucket weight that is not the sum of the item + weights if you manually modify/build the CRUSH map. Protect against any + underflow on the bucket weight when removing items. 
+
+    Signed-off-by: Sage Weil
+    (cherry picked from commit 8c87e9502142d5b4a282b94f929ae776a49be1dc)
+
+commit 7ccd5eec11e8cd945d24bf9f6390d6c3fa4a06de
+Author: Sage Weil
+Date: Fri Nov 21 17:37:03 2014 -0800
+
+    crush/CrushWrapper: fix _search_item_exists
+
+    Reported-by: Pawel Sadowski
+    Signed-off-by: Sage Weil
+    (cherry picked from commit eeadd60714d908a3a033aeb7fd542c511e63122b)
+
+commit b1ab685e00034751a161a3d5e0325c6581999c75
+Merge: dd7c8c2 ef3773a
+Author: Sage Weil
+Date: Fri Dec 12 06:19:50 2014 -0800
+
+    Merge pull request #3124 from ceph/wip-10194-firefly
+
+    rgw: optionally call FCGX_Free() on the fcgi connection
+
+    Reviewed-by: Sage Weil
+
+commit 11995b329045341c17553269267cfd3688a51b0f
+Author: Dan Mick
+Date: Wed Dec 10 13:19:53 2014 -0800
+
+    Call Rados.shutdown() explicitly before exit
+
+    This is mostly a demonstration of good behavior, as the resources will
+    be reclaimed on exit anyway.
+
+    Signed-off-by: Dan Mick
+    (cherry picked from commit b038e8fbf9103cc42a4cde734b3ee601af6019ea)
+
+commit e00270b51896f168d5013b7dc92ec7f8b9e19da3
+Author: Dan Mick
+Date: Wed Dec 10 13:19:16 2014 -0800
+
+    rados.py: remove Rados.__del__(); it just causes problems
+
+    Recent versions of Python contain a change to thread shutdown that
+    causes ceph to hang on exit; see http://bugs.python.org/issue21963.
+    As it turns out, this is relatively easy to avoid by not spawning
+    threads on exit, as Rados.__del__() will certainly do by calling
+    shutdown(); I suspect, but haven't proven, that the problem is
+    that shutdown() tries to start() a threading.Thread() that never
+    makes it all the way back to signal start().
+
+    Also add a PendingReleaseNote and extra doc comments to clarify.
+
+    Fixes: #8797
+    Signed-off-by: Dan Mick
+    (cherry picked from commit 5ba9b8f21f8010c59dd84a0ef2acfec99e4b048f)
+
+    Conflicts:
+        PendingReleaseNotes
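+    For illustration only (not from the patch): the recommended pattern in a
+    librados Python client after this change, releasing resources with an
+    explicit shutdown() instead of relying on garbage collection at
+    interpreter exit. The conffile path is an assumption.
+
+        import rados
+
+        cluster = rados.Rados(conffile='/etc/ceph/ceph.conf')
+        cluster.connect()
+        try:
+            print(cluster.get_fsid())
+        finally:
+            # deterministic cleanup; no threads are spawned at exit
+            cluster.shutdown()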
+commit 7faae891aefa4c21c50430fa03d9204a86d082f8
+Author: Ken Dreyer
+Date: Tue Dec 9 14:52:19 2014 -0700
+
+    Revert "ceph.spec.: add epoch"
+
+    If ICE ships 0.80.8, then it will be newer than what RHCEPH ships
+    (0.80.7), and users won't be able to seamlessly upgrade via Yum.
+
+    We have three options:
+    A) Revert the "Epoch: 1" change on the Firefly branch.
+    B) Revert the "Epoch: 1" change in the ICE packages.
+    C) Bump the Epoch to "2" in Red Hat's packages.
+
+    This commit does Option A.
+
+    Option B may or may not be feasible - it would require a "downstream"
+    change in ICE, and we haven't done that sort of thing before.
+
+    Due to the RHEL release schedule, Option C is not available to us at
+    this point.
+
+    This reverts commit b890c1e4706d7cfef7ed24c9df65b439b4f7ff1d.
+
+commit ef3773ac93413c644e056babce1971f846bbc276
+Author: Yehuda Sadeh
+Date: Wed Nov 26 15:18:07 2014 -0800
+
+    rgw: optionally call FCGX_Free() on the fcgi connection
+
+    Fixes: #10194
+
+    A new configurable controls this behavior. This forces disconnection of
+    the fcgi connection when done with the request.
+
+    Signed-off-by: Yehuda Sadeh
+
+commit dd7c8c2bc6b2810ff7f483af940fa09dbe74e83a
+Merge: d759467 a597096
+Author: Gregory Farnum
+Date: Mon Dec 8 15:02:52 2014 -0800
+
+    Merge pull request #3109 from ceph/firefly-10263
+
+    mds: store backtrace for straydir
+
+    Reviewed-by: Greg Farnum
+
+commit d7594672b673796901961cc684c9e7de8dc2c95d
+Merge: adf9a75 73b47db
+Author: Samuel Just
+Date: Mon Dec 8 13:19:44 2014 -0800
+
+    Merge pull request #3009 from dachary/wip-10018-primary-erasure-code-hinfo-firefly
+
+    osd: deep scrub must not abort if hinfo is missing (firefly)
+
+    Reviewed-by: Samuel Just
+
+commit a5970963a2148697fc6da64facfbf6ab6686b9cd
+Author: Yan, Zheng
+Date: Fri Nov 7 11:38:37 2014 +0800
+
+    mds: store backtrace for straydir
+
+    Backport: giant, firefly, emperor, dumpling
+    Signed-off-by: Yan, Zheng
+    (cherry picked from commit 0d89db5d3e5ae5d552d4058a88a4e186748ab1d2)
+
+commit adf9a758000182d27a6582d516356730d02e4099
+Merge: dea38a7 b4a4b75
+Author: Sage Weil
+Date: Sat Dec 6 11:06:02 2014 -0800
+
+    Merge pull request #3089 from dachary/wip-10063-hobject-shard-firefly
+
+    common: do not omit shard when ghobject NO_GEN is set (firefly)
+
+commit dea38a7af638c833304272c324ed2bc386a40f8f
+Merge: ccc8b46 5138091
+Author: Sage Weil
+Date: Sat Dec 6 10:59:44 2014 -0800
+
+    Merge pull request #2480 from dachary/wip-9420-erasure-code-non-regression-firefly
+
+    erasure-code: store and compare encoded contents (firefly)
+
+commit ccc8b46b2cdc55c1a861f092259ef36a1296f073
+Merge: bef363c cd3447d
+Author: Sage Weil
+Date: Fri Dec 5 17:33:05 2014 -0800
+
+    Merge pull request #3096 from dachary/wip-9785-dmcrypt-keys-permissions-firefly
+
+    ceph-disk: dmcrypt file permissions (firefly)
+
+commit cd3447d04cabf6745001afeef69f25a92400cd0e
+Author: Loic Dachary
+Date: Thu Dec 4 22:21:32 2014 +0100
+
+    ceph-disk: dmcrypt file permissions
+
+    The directory in which key files are stored for dmcrypt must be 700 and
+    the file 600.
+
+    http://tracker.ceph.com/issues/9785 Fixes: #9785
+
+    Signed-off-by: Loic Dachary
+    (cherry picked from commit 58682d1776ab1fd4daddd887d921ca9cc312bf50)
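+    For illustration only (not ceph-disk's actual code): a minimal Python
+    sketch of writing key material with those permissions; the paths and the
+    helper name are hypothetical.
+
+        import os
+
+        def write_dmcrypt_key(key_dir, key_path, key):
+            if not os.path.exists(key_dir):
+                os.makedirs(key_dir)
+            os.chmod(key_dir, 0o700)  # directory must be 700
+            # create the key file as 600; key is expected to be bytes
+            fd = os.open(key_path, os.O_WRONLY | os.O_CREAT, 0o600)
+            try:
+                os.write(fd, key)
+            finally:
+                os.close(fd)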
+commit bef363c1d1fc06fbb315024145a97a809a2471cd
+Merge: cb2c83b 9f3b21d
+Author: Sage Weil
+Date: Fri Dec 5 09:04:00 2014 -0800
+
+    Merge pull request #3086 from dachary/wip-10125-radosgw-init-firefly
+
+    rgw: run radosgw as apache with systemd (firefly)
+
+commit b4a4b75e6d4deb6818681902f85baa9f63acdb4f
+Author: Loic Dachary
+Date: Fri Nov 14 01:16:10 2014 +0100
+
+    common: do not omit shard when ghobject NO_GEN is set
+
+    Do not silence the display of shard_id when generation is NO_GEN.
+    The JSON representation of erasure coded objects used by
+    ceph_objectstore_tool needs the shard_id to find the file containing
+    the chunk.
+
+    Minimal testing is added to ceph_objectstore_tool.py
+
+    http://tracker.ceph.com/issues/10063 Fixes: #10063
+
+    Signed-off-by: Loic Dachary
+    (cherry picked from commit dcf09aed121f566221f539106d10283a09f15cf5)
+
+    Conflicts:
+        src/test/ceph_objectstore_tool.py
+
+commit 9f3b21d1b70be591d68bfa57c3393e8f9af8e7df
+Author: Loic Dachary
+Date: Tue Dec 2 18:10:48 2014 +0100
+
+    rgw: run radosgw as apache with systemd
+
+    Same as sysv.
+
+    http://tracker.ceph.com/issues/10125 Fixes: #10125
+
+    Signed-off-by: Loic Dachary
+    (cherry picked from commit 7b621f4abf63456272dec3449aa108c89504a7a5)
+
+    Conflicts:
+        src/init-radosgw.sysv
+
+commit cb2c83b2f216e503f7a52115f775bda1dbfe0c6a
+Merge: e2ec37b 02d4685
+Author: Josh Durgin
+Date: Thu Dec 4 11:32:18 2014 -0800
+
+    Merge pull request #3078 from ceph/wip-10030-firefly
+
+    librbd: don't close an already closed parent image upon failure
+
+    Reviewed-by: Josh Durgin
+
+commit e2ec37bf45fa5e7f5e787db9b67dbb2a98f2fbb7
+Merge: c4c63e8 af12194
+Author: Sage Weil
+Date: Wed Dec 3 23:01:44 2014 -0800
+
+    Merge pull request #3063 from ceph/wip-10123-firefly
+
+    librbd: protect list_children from invalid child pool IoCtxs
+
+    Reviewed-by: Sage Weil
+
+commit c4c63e82cd7e4716315ca81208293a2567026c72
+Author: Samuel Just
+Date: Tue Sep 23 15:52:08 2014 -0700
+
+    ReplicatedPG: don't move on to the next snap immediately
+
+    If we have a bunch of trimmed snaps for which we have no
+    objects, we'll spin for a long time. Instead, requeue.
+
+    Fixes: #9487
+    Backport: dumpling, firefly, giant
+    Reviewed-by: Sage Weil
+    Signed-off-by: Samuel Just
+    (cherry picked from commit c17ac03a50da523f250eb6394c89cc7e93cb4659)
+
+commit 1b656450ca75b12fb98dee82bace914ef5f45c44
+Author: Sage Weil
+Date: Tue Sep 23 16:21:33 2014 -0700
+
+    osd: initialize purged_snap on backfill start; restart backfill if change
+
+    If we backfill a PG to a new OSD, we currently neglect to initialize
+    purged_snaps. As a result, the first time the snaptrimmer runs it has to
+    churn through every deleted snap for all time, and to make matters worse
+    does so in one go with the PG lock held. This leads to badness on any
+    cluster with a significant number of removed snaps that experiences
+    backfill.
+
+    Resolve this by initializing purged_snaps when we finish backfill. The
+    backfill itself will clear out any stray snaps and ensure the object set
+    is in sync with purged_snaps. Note that purged_snaps on the primary
+    that is driving backfill will not change during this period as the
+    snaptrimmer is not scheduled unless the PG is clean (which it won't be
+    during backfill).
+
+    If we by chance interrupt backfill, go clean with other OSDs,
+    purge snaps, and then let this OSD rejoin, we will either restart
+    backfill (non-contiguous log) or the log will include the result of
+    the snap trim (the events that remove the trimmed snap).
+
+    Fixes: #9487
+    Backport: firefly, dumpling
+    Signed-off-by: Sage Weil
+    (cherry picked from commit 255b430a87201c7d0cf8f10a3c1e62cbe8dd2d93)
+
+commit 02d4685c56e129cb179a5ddfb8e87aefc2fce0b5
+Author: Jason Dillaman
+Date: Thu Nov 6 05:01:38 2014 -0500
+
+    librbd: don't close an already closed parent image upon failure
+
+    If librbd is not able to open a child's parent image, it will
+    incorrectly close the parent image twice, resulting in a crash.
+
+    Fixes: #10030
+    Backport: firefly, giant
+    Signed-off-by: Jason Dillaman
+    (cherry picked from commit 61ebfebd59b61ffdc203dfeca01ee1a02315133e)
+
+commit af121942d7bdfc59fcfae0429ffb12993e7e019d
+Author: Jason Dillaman
+Date: Mon Nov 17 21:49:26 2014 -0500
+
+    librbd: protect list_children from invalid child pool IoCtxs
+
+    While listing child images, don't ignore error codes returned
+    from librados when creating an IoCtx. This will prevent seg
+    faults from occurring when an invalid IoCtx is used.
+
+    Fixes: #10123
+    Backport: giant, firefly, dumpling
+    Signed-off-by: Jason Dillaman
+    (cherry picked from commit 0d350b6817d7905908a4e432cd359ca1d36bab50)
+
+commit c982da44e0e9e0be3c3d4e8f5e0a186fb2fcebb3
+Merge: 4a148df c4e4a31
+Author: Sage Weil
+Date: Sun Nov 30 10:12:04 2014 -0800
+
+    Merge pull request #3014 from dachary/wip-9665-ceph-disk-partprobe-firefly
+
+    ceph disk zap must call partprobe
+
+commit c4e4a310f14ca3049ac90422aea95051fe0d4b15
+Author: Loic Dachary
+Date: Fri Oct 10 10:26:31 2014 +0200
+
+    ceph-disk: use update_partition in prepare_dev and main_prepare
+
+    In the case of prepare_dev the partx alternative was missing and is not
+    added because update_partition does it.
+
+    http://tracker.ceph.com/issues/9721 Fixes: #9721
+
+    Signed-off-by: Loic Dachary
+    (cherry picked from commit 23e71b1ee816c0ec8bd65891998657c46e364fbe)
+
+    Conflicts:
+        src/ceph-disk
+
+commit e70a81464b906b9a304c29f474e6726762b63a7c
+Author: Loic Dachary
+Date: Thu Oct 9 18:52:17 2014 +0200
+
+    ceph-disk: run partprobe after zap
+
+    Not running partprobe after zapping a device can lead to the following:
+
+    * ceph-disk prepare /dev/loop2
+    * links are created in /dev/disk/by-partuuid
+    * ceph-disk zap /dev/loop2
+    * links are not removed from /dev/disk/by-partuuid
+    * ceph-disk prepare /dev/loop2
+    * some links are not created in /dev/disk/by-partuuid
+
+    This assumes there is a bug in the way udev events are handled by
+    the operating system.
+
+    http://tracker.ceph.com/issues/9665 Fixes: #9665
+
+    Signed-off-by: Loic Dachary
+    (cherry picked from commit fed3b06c47a5ef22cb3514c7647544120086d1e7)
+
+commit 5a5f427bc09076ef3fb13a710dede1b47bb232e0
+Author: Loic Dachary
+Date: Fri Oct 10 10:23:34 2014 +0200
+
+    ceph-disk: encapsulate partprobe / partx calls
+
+    Add the update_partition function to reduce code duplication.
+    The action is made an argument although it always is -a because it will
+    be -d when deleting a partition.
+
+    Use the update_partition function in prepare_journal_dev
+
+    Signed-off-by: Loic Dachary
+    (cherry picked from commit 922a15ea6865ef915bbdec2597433da6792c1cb2)
+
+    Conflicts:
+        src/ceph-disk
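+    For illustration only (not the actual ceph-disk code): a minimal Python
+    sketch of such an update_partition helper; the partprobe/partx fallback
+    logic shown here is an assumption.
+
+        import subprocess
+
+        def update_partition(action, dev):
+            # action is "-a" (add) or "-d" (delete), as described above
+            try:
+                subprocess.check_call(["partprobe", dev])
+            except (OSError, subprocess.CalledProcessError):
+                subprocess.check_call(["partx", action, dev])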
+commit 73b47dbee8858f182fd2b4fd8eb5f3c786877bf4
+Author: Loic Dachary
+Date: Thu Nov 6 17:11:20 2014 +0100
+
+    osd: deep scrub must not abort if hinfo is missing
+
+    Instead it should set read_error.
+
+    http://tracker.ceph.com/issues/10018 Fixes: #10018
+
+    Signed-off-by: Loic Dachary
+    (cherry picked from commit 9d84d2e8309d26e39ca849a75166d2d7f2dec9ea)
+
+commit 5138091a4073d966a65f537280f89372e801d019
+Author: Loic Dachary
+Date: Tue Sep 23 11:38:09 2014 +0200
+
+    erasure-code: add corpus verification to make check
+
+    Signed-off-by: Loic Dachary
+
+commit 8d3d6bf59aec3877c0231e637270e371d9ed3b8c
+Author: Loic Dachary
+Date: Sat Sep 13 13:36:09 2014 +0200
+
+    erasure-code: workunit to check for encoding regression
+
+    Clone the archive of encoded objects and decode all archived objects, up
+    to and including the current ceph version.
+
+    http://tracker.ceph.com/issues/9420 Refs: #9420
+
+    Signed-off-by: Loic Dachary
+
+commit 4f4358708ed3c261ca4027cc9c3dc3f952a99470
+Author: Loic Dachary
+Date: Sat Sep 13 10:16:31 2014 +0200
+
+    erasure-code: store and compare encoded contents
+
+    Introduce ceph_erasure_code_non_regression to check and compare how an
+    erasure code plugin encodes and decodes content with a given set of
+    parameters. For instance:
+
+        ./ceph_erasure_code_non_regression \
+            --plugin jerasure \
+            --parameter technique=reed_sol_van \
+            --parameter k=2 \
+            --parameter m=2 \
+            --stripe-width 3181 \
+            --create \
+            --check
+
+    Will create an encoded object (--create) and store it into a directory
+    along with the chunks, one chunk per file. The directory name is derived
+    from the parameters. The content of the object is a random pattern of 31
+    bytes repeated to fill the object size specified with --stripe-width.
+
+    The check function (--check) reads the object back from the file,
+    encodes it and compares the result with the content of the chunks read
+    from the files. It also attempts to recover from one or two erasures.
+
+    Chunks encoded by a given version of Ceph are expected to be encoded
+    exactly in the same way by all Ceph versions going forward.
+
+    http://tracker.ceph.com/issues/9420 Refs: #9420
+
+    Signed-off-by: Loic Dachary
+
+commit 4a148df544978383c1ed7cd8b90f590adb563f3d
+Merge: c069bce 01faf13
+Author: Loic Dachary
+Date: Wed Nov 19 02:45:26 2014 +0100
+
+    Merge pull request #2961 from ceph/wip-10114-firefly
+
+    Add annotation to all assembly files to turn off stack-execute bit
+
+    Reviewed-by: Loic Dachary
+
+commit 01faf1356f648ded9acda02e7cc67c1adb9e9ee3
+Author: Dan Mick
+Date: Fri Nov 14 17:59:57 2014 -0800
+
+    Add annotation to all assembly files to turn off stack-execute bit
+
+    See discussion in http://tracker.ceph.com/issues/10114
+
+    Building with these changes allows output from readelf like this:
+
+        $ readelf -lW src/.libs/librados.so.2 | grep GNU_STACK
+          GNU_STACK      0x000000 0x0000000000000000 0x0000000000000000 0x000000
+                         0x000000 RW  0x8
+
+    (note the absence of 'X' in 'RW')
+
+    Fixes: #10114
+    Signed-off-by: Dan Mick
+    (cherry picked from commit 06a245a9845c0c126fb3106b41b2fd2bc4bc4df3)
+    (not-yet-present-in-firefly files in isa-l manually removed)
+
+commit c069bce4e8180da3c0ca4951365032a45df76468
+Merge: 0d8ad6a fac1654
+Author: Samuel Just
+Date: Thu Nov 13 10:36:12 2014 -0800
+
+    Merge pull request #2760 from ceph/wip-9835-firefly
+
+    osd: fix erasure hung op bug (9835)
+
+    Reviewed-by: Samuel Just
+
+commit fac165475031efdebbb88898ca5c12cd307a5bc3
+Author: Samuel Just
+Date: Wed Nov 5 12:12:14 2014 -0800
+
+    osd: use OSDMap helper to tell if ops are misdirected
+
+    calc_pg_role doesn't actually take into account primary affinity.
+
+    Fixes: #9835
+    Signed-off-by: Samuel Just
+
+commit 588602bf0095de5b59064123ca01345f1364bdde
+Author: Sage Weil
+Date: Mon Oct 20 13:55:33 2014 -0700
+
+    osd: discard rank > 0 ops on erasure pools
+
+    Erasure pools do not support read from replica, so we should drop
+    any rank > 0 requests.
+
+    This fixes a bug where an erasure pool maps to [1,2,3], temporarily maps
+    to [-1,2,3], sends a request to osd.2, and then remaps back to [1,2,3].
+    Because the 0 shard never appears on osd.2, the request sits in the
+    waiting_for_pg map indefinitely and causes slow request warnings.
+    This problem does not come up on replicated pools because all instances of
+    the PG are created equal.
+
+    Fix by only considering role == 0 for erasure pools as a correct mapping.
+
+    Fixes: #9835
+    Signed-off-by: Sage Weil
+
+commit 0c1c4152e6f402af7612c8c8d4719ab0f4cc6ad9
+Author: Sage Weil
+Date: Wed Nov 12 17:04:35 2014 -0800
+
+    osd/OSDMap: add osd_is_valid_op_target()
+
+    Helper to check whether an osd is a given op target for a pg. This
+    assumes that for EC we always send ops to the primary, while for
+    replicated we may target any replica.
+
+    Signed-off-by: Sage Weil
+    (cherry picked from commit 89c02637914ac7332e9dbdbfefc2049b2b6c127d)
+
+commit 0d8ad6ad3c376bcab981bea9a49e1924d7eddb68
+Author: Josh Durgin
+Date: Tue Nov 11 18:16:02 2014 -0800
+
+    qa: allow small allocation diffs for exported rbds
+
+    The local filesystem may behave slightly differently. This isn't
+    foolproof, but seems to be reliable enough on rhel7 rootfs, where
+    exact comparison was failing.
+
+    Fixes: #10002
+    Signed-off-by: Josh Durgin
+    (cherry picked from commit e94d3c11edb9c9cbcf108463fdff8404df79be33)
+
+commit 0804deeab293e09123d1b58825051ccc4dddbc0e
+Author: Sage Weil
+Date: Sun May 25 08:38:38 2014 -0700
+
+    osd: fix map advance limit to handle map gaps
+
+    The recent change in cf25bdf6b0090379903981fe8cee5ea75efd7ba0 would stop
+    advancing after some number of epochs, but did not take into consideration
+    the possibility that there are missing maps. In that case, it is impossible
+    to advance past the gap.
+
+    Fix this by increasing the max epoch as we go so that we can always get
+    beyond the gap.
+
+    Signed-off-by: Sage Weil
+    (cherry picked from commit 1e0a82fd55dede473c0af32924f4bcb5bb697a2b)
+
+commit d30d6b986433eaef920b3703cf5af3c030f8dcf4
+Merge: d241aa7 d548431
+Author: Gregory Farnum
+Date: Fri Nov 7 14:10:18 2014 -0800
+
+    Merge pull request #2880 from ceph/wip-10025-firefly
+
+    #10025/firefly -- tools: fix MDS journal import
+
+    Reviewed-by: Greg Farnum
+
+commit d548431a388da1130564d710e1f006772934224b
+Author: John Spray
+Date: Fri Nov 7 11:34:43 2014 +0000
+
+    tools: fix MDS journal import
+
+    Previously it only worked on fresh filesystems which
+    hadn't been trimmed yet, and resulted in an invalid
+    trimmed_pos when expire_pos wasn't on an object
+    boundary.
+
+    Fixes: #10025
+
+    Signed-off-by: John Spray
+    (cherry picked from commit fb29e71f9a97c12354045ad2e128156e503be696)
+
+commit d241aa7a347655242cc71b8fa3d778df6948c494
+Merge: 2c85b5d 4afb542
+Author: Samuel Just
+Date: Thu Nov 6 10:37:42 2014 -0800
+
+    Merge remote-tracking branch 'origin/wip-sam-firefly-backports' into firefly
+
+commit 2c85b5d72953d01296213185382707122e06415c
+Merge: 23cbffa 1228658
+Author: Samuel Just
+Date: Thu Nov 6 10:30:20 2014 -0800
+
+    Merge pull request #2737 from ceph/wip-9629-firefly
+
+    osd: do not clone/preserve snapdir on cache-evict (firefly backport)
+
+    Reviewed-by: Samuel Just
+
+commit 23cbffaa2936dc2707b5b42f8c0e7ce804324ae2
+Merge: 3bba938 e296685
+Author: Samuel Just
+Date: Thu Nov 6 10:26:21 2014 -0800
+
+    Merge pull request #2657 from ceph/wip-9053-9301-firefly
+
+    mon: backport two paxos fixes to firefly
+
+    Reviewed-by: Joao Luis
+
+commit 3bba9387eb123c6cf055e874db2925b998dc406c
+Merge: 3f9bf73 835f8c6
+Author: Samuel Just
+Date: Thu Nov 6 10:21:12 2014 -0800
+
+    Merge pull request #2656 from ceph/wip-9502-firefly
+
+    mon: backport mon disk full check to firefly
+
+    Reviewed-by: Samuel Just
+
+commit 3f9bf738daf47ff4ff56c9f76d1487a5afc5e30a
+Merge: a340e6d 3e17a08
+Author: Samuel Just
+Date: Thu Nov 6 10:18:27 2014 -0800
+
+    Merge pull request #2764 from ceph/wip-9851
+
+    osd: bring FileJournal in sync with giant
+
+    Reviewed-by: Samuel Just
+
+commit a340e6d0b166019f58dca0703faf30dd3178c14f
+Merge: b7d5f99 b9450b5
+Author: Samuel Just
+Date: Thu Nov 6 10:12:17 2014 -0800
+
+    Merge pull request #2776 from ceph/wip-9675.firefly
+
+    CrushWrapper: pick a ruleset same as rule_id
+
+    Reviewed-by: Samuel Just
+
+commit b7d5f99c8f4d751e83dc29305649d7a465c657b1
+Author: Sage Weil
+Date: Mon Sep 15 15:29:08 2014 -0700
+
+    ceph-disk: mount xfs with inode64 by default
+
+    We did this forever ago with mkcephfs, but ceph-disk didn't. Note that for
+    modern XFS this option is obsolete, but for older kernels it was not the
+    default.
+
+    Backport: firefly
+    Signed-off-by: Sage Weil
+    (cherry picked from commit 11496399ef318498c11e551f139d96db52d3309c)
+
+commit 1a9d000bb679a7392b9dd115373c3827c9626694
+Author: Yehuda Sadeh
+Date: Thu Oct 9 10:20:27 2014 -0700
+
+    rgw: set length for keystone token validation request
+
+    Fixes: #7796
+    Backport: giant, firefly
+    Need to set content length to this request, as the server might not
+    handle a chunked request (even though we don't send anything).
+
+    Tested-by: Mark Kirkwood
+    Signed-off-by: Yehuda Sadeh
+    (cherry picked from commit 3dd4ccad7fe97fc16a3ee4130549b48600bc485c)
+
+commit 49d27efde2ce5d282c9ee6ca9c8ea9db8f609392
+Author: Yehuda Sadeh
+Date: Tue Aug 19 13:15:46 2014 -0700
+
+    rgw: subuser creation fixes
+
+    Fixes: #8587
+    There were a couple of issues: one was that when trying to identify whether
+    the swift user exists, we weren't using the correct swift id. The second
+    problem is that we relied on the gen_access flag in the swift case, where
+    it doesn't really need to apply.
+
+    Signed-off-by: Yehuda Sadeh
+    (cherry picked from commit 1441ffe8103f03c6b2f625f37adbb2e1cfec66bb)
+
+commit 8db2f0969e1715f57088c311a33e7e3499933afb
+Merge: 9a15592 2a5d7f0
+Author: Sage Weil
+Date: Fri Oct 31 08:35:50 2014 -0700
+
+    Merge pull request #2847 from dachary/wip-9752-past-intervals-firefly
+
+    osd: past_interval display bug on acting
+
+commit 2a5d7f08303eb8b1687c5b58426443b3d40e415c
+Author: Loic Dachary
+Date: Fri Oct 31 00:49:21 2014 +0100
+
+    osd: past_interval display bug on acting
+
+    The acting array was incorrectly including the primary and up_primary.
+
+    http://tracker.ceph.com/issues/9752 Fixes: #9752
+
+    Signed-off-by: Loic Dachary
+    (cherry picked from commit c5f8d6eded52da451fdd1d807bd4700221e4c41c)
+
+commit 9a15592eae6739183049f8376392c5577145871e
+Merge: 6fd8879 c20a242
+Author: Yan, Zheng
+Date: Thu Oct 30 17:01:24 2014 -0700
+
+    Merge pull request #2840 from ceph/firefly-9869
+
+    Backport "client: cast m->get_client_tid() to compare to 16-bit Inode::flushing_cap_tid"
+
+commit c20a2421f4a5b33407d7f31806dd4587b4e9077c
+Author: Greg Farnum
+Date: Wed Oct 22 17:16:31 2014 -0700
+
+    client: cast m->get_client_tid() to compare to 16-bit Inode::flushing_cap_tid
+
+    m->get_client_tid() is 64 bits (as it should be), but Inode::flushing_cap_tid
+    is only 16 bits. 16 bits should be plenty to let the cap flush updates
+    pipeline appropriately, but we need to cast in the proper direction when
+    comparing these differently-sized versions. So downcast the 64-bit one
+    to 16 bits.
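+
+    An illustrative reduction of the comparison fix (field widths as
+    described above; the surrounding types are simplified stand-ins):
+
+        #include <cstdint>
+
+        struct InodeStub {
+          uint16_t flushing_cap_tid;  // deliberately narrow, per the commit
+        };
+
+        bool flush_ack_matches(const InodeStub& in, uint64_t client_tid) {
+          // Comparing at 64 bits would promote the 16-bit field and never
+          // match once the tid exceeds 65535; downcast the wide side instead.
+          return static_cast<uint16_t>(client_tid) == in.flushing_cap_tid;
+        }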
+ + Fixes: #9869 + Backport: giant, firefly, dumpling + + Signed-off-by: Greg Farnum + (cherry picked from commit a5184cf46a6e867287e24aeb731634828467cd98) + +commit 4afb54274bb2087da348103d0a7641b3a018d777 +Author: Samuel Just +Date: Thu Sep 11 13:46:51 2014 -0700 + + ReplicatedPG: cancel cb on blacklisted watcher + + Fixes: #8315 + Backport: firefly + Signed-off-by: Samuel Just + (cherry picked from commit 16bd45777166c29c433af3b59254a7169e512d98) + +commit 037aa342fa277351fc605da41489a6ceba81ab05 +Author: Samuel Just +Date: Sun Sep 21 10:19:43 2014 -0700 + + ReplicatedPG::on_removal: clear rollback info + + Fixes: #9293 + Backport: firefly + Signed-off-by: Samuel Just + (cherry picked from commit 544b8c7ffb4af01765b87239f2d7ab88479ee779) + +commit 8978c8dc1f153a9052b5d967ceff11d4f08d51ea +Merge: 8401e7f 4be53d5 +Author: Samuel Just +Date: Thu Oct 30 13:48:18 2014 -0700 + + Merge remote-tracking branch 'origin/wip-9574' into wip-sam-firefly-backports + +commit 8401e7ffa1768770f451143b3c110d1deae1bd40 +Author: Samuel Just +Date: Mon Sep 29 15:01:25 2014 -0700 + + PG: release backfill reservations if a backfill peer rejects + + Also, the full peer will wait until the rejection from the primary + to do a state transition. + + Fixes: #9626 + Backport: giant, firefly, dumpling + Signed-off-by: Samuel Just + (cherry picked from commit 624aaf2a4ea9950153a89ff921e2adce683a6f51) + +commit 5df09fa1b5a42992f9e41aca09e86db0d03d9fbd +Merge: a0937ef a1aa06b +Author: Samuel Just +Date: Thu Oct 30 13:47:22 2014 -0700 + + Merge remote-tracking branch 'origin/wip-9113' into wip-sam-firefly-backports + +commit a0937ef214b9221e7a5e69e7b0f0697471d56293 +Author: Sage Weil +Date: Sun Oct 12 10:05:51 2014 -0700 + + osd/osd_types: consider CRUSH_ITEM_NONE in check_new_interval() min_size check + + Fixes: #9718 + Backport: firefly + Signed-off-by: Sage Weil + (cherry picked from commit d947050c82a511f91c98e1c76e48ffa9e187eee7) + + Conflicts: + src/osd/osd_types.cc + +commit 35e8e6b2c1733cd421bf6c3916553eea3786e76e +Author: Samuel Just +Date: Mon Oct 20 14:10:58 2014 -0700 + + PG:: reset_interval_flush and in set_last_peering_reset + + If we have a change in the prior set, but not in the up/acting set, we go back + through Reset in order to reset peering state. Previously, we would reset + last_peering_reset in the Reset constructor. This did not, however, reset the + flush_interval, which caused the eventual flush event to be ignored and the + peering messages to not be sent. + + Instead, we will always reset_interval_flush if we are actually changing the + last_peering_reset value. 
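+
+    Sketched with simplified state (not the actual PG class), the guard
+    described above looks like:
+
+        #include <cstdint>
+
+        struct PeeringStub {
+          uint32_t last_peering_reset = 0;
+          bool flush_interval_pending = false;
+
+          void reset_interval_flush() { flush_interval_pending = true; }
+
+          // Reset the flush interval exactly when the value changes, so a
+          // pass through Reset cannot skip it and strand the flush event.
+          void set_last_peering_reset(uint32_t epoch) {
+            if (epoch != last_peering_reset) {
+              last_peering_reset = epoch;
+              reset_interval_flush();
+            }
+          }
+        };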
+
+    Fixes: #9821
+    Backport: firefly
+    Signed-off-by: Samuel Just
+    (cherry picked from commit d9ff3a6b789c5b9c77aefa3751bd808f5d7b8ca7)
+
+commit 3f35db42977704a12ac4b5bcad6261aaf6b6a88d
+Author: Samuel Just
+Date: Thu Oct 23 09:11:28 2014 -0700
+
+    ReplicatedPG: writeout hit_set object with correct prior_version
+
+    Fixes: #9875
+    Backport: giant, firefly
+    Signed-off-by: Samuel Just
+    (cherry picked from commit 1a3ad307f1a4c0a956d6fd31d13f01ffe411a09d)
+
+commit 6fd88792e77cdc7ad33ff0acf9b3189a7c525430
+Merge: 0975ec9 afe6bd8
+Author: Sage Weil
+Date: Sun Oct 26 20:37:52 2014 -0700
+
+    Merge pull request #2717 from dachary/wip-9747-ceph-spec-firefly
+
+    rpm: 95-ceph-osd-alt.rules is not needed for centos7 / rhel7 (firefly)
+
+commit b9450b532ab7ad23ec6e2c22ed7cf55e6e1cc4c0
+Author: Xiaoxi Chen
+Date: Wed Aug 20 15:35:44 2014 +0800
+
+    CrushWrapper: pick a ruleset same as rule_id
+
+    Originally in the add_simple_ruleset function, the ruleset_id
+    is not reused but rule_id is reused. So after some add/remove
+    against rules, the newly created rule is likely to have
+    ruleset!=rule_id.
+
+    We don't want this to happen because we are trying to hold the constraint
+    that ruleset == rule_id.
+
+    Signed-off-by: Xiaoxi Chen
+    (cherry picked from commit 78e84f34da83abf5a62ae97bb84ab70774b164a6)
+
+    Conflicts:
+        src/test/erasure-code/TestErasureCodeIsa.cc
+
+    Fixes: #9675
+
+commit 3e17a0872a3864cb6aee46959afd955ef0cbafeb
+Author: Ma Jianpeng
+Date: Mon Jul 21 15:08:55 2014 +0800
+
+    os/FileJournal: When dumping the journal, use the correct seq to avoid
+    misjudging it as corrupt.
+
+    In FileJournal::dump, it always used seq=0 as the last seq, which can
+    misjudge the journal as corrupt.
+
+    Signed-off-by: Ma Jianpeng
+    (cherry picked from commit 5f65b4db6d1dad7c2c5a09eab42af63a82ea9e9b)
+
+commit 350da8c98bc1e90cd392992aba290c7478280d88
+Author: Loic Dachary
+Date: Fri Sep 26 01:15:53 2014 +0200
+
+    os: io_event.res is the size written
+
+    And not an error code to be converted with cpp_strerror()
+
+    Signed-off-by: Loic Dachary
+    (cherry picked from commit 7827e0035e3350ad2d9230f27a1629545f53af5c)
+
+commit ecff3761f3d15061a2cbf0a595ca249a4c424f4c
+Author: Ma Jianpeng
+Date: Thu Aug 21 15:10:46 2014 +0800
+
+    os/FileJournal: For journal-aio-mode, don't use aio when closing journal.
+
+    In journal-aio-mode, when closing the journal, write_finish_thread_entry
+    may exit before write_thread_entry, in which case nobody waits for the
+    last aios to complete. On some platforms the journal header then ends up
+    corrupted. To avoid this, don't use aio when closing the journal.
+
+    Fixes: #9073
+    Reported-by: Mark Kirkwood
+    Tested-by: Mark Kirkwood
+    Signed-off-by: Ma Jianpeng
+    (cherry picked from commit e870fd09ce846e5642db268c33bbe8e2e17ffef2)
+
+commit dbc33fbab4b35e2ce1e46a881f6714262502c243
+Author: Ma Jianpeng
+Date: Thu Aug 21 21:07:51 2014 +0800
+
+    os/FileJournal: Only allocate aio-related resources when using aio.
+
+    If HAVE_LIBAIO is defined, the aio-related resources are allocated without
+    checking whether aio mode is actually in use. Allocate them only when
+    using aio.
+
+    Signed-off-by: Ma Jianpeng
+    (cherry picked from commit a66a4931d5be9ee26c0983b3154fdbe37261a51c)
+
+commit 3312c6eeca21dcd566df9bdd8de7b3fe33356b57
+Author: Ma Jianpeng
+Date: Thu Aug 21 15:49:44 2014 +0800
+
+    os/FileJournal: Tune the check logic for read_header.
+
+    When reading the journal header, first check the result of
+    pread and only then do the decode operation.
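+
+    The check-before-decode ordering, as a sketch over POSIX pread (buffer
+    and header types simplified, not the FileJournal ones):
+
+        #include <unistd.h>
+        #include <cerrno>
+        #include <cstring>
+
+        struct HeaderStub { char magic[8]; };
+
+        int read_header_stub(int fd, HeaderStub* h) {
+          char buf[sizeof(*h)];
+          ssize_t r = ::pread(fd, buf, sizeof(buf), 0);
+          if (r < 0)
+            return -errno;            // surface the I/O error first
+          if (static_cast<size_t>(r) < sizeof(buf))
+            return -EINVAL;           // short read: never decode garbage
+          std::memcpy(h, buf, sizeof(*h));  // decode only after pread succeeded
+          return 0;
+        }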
+ + Signed-off-by: Ma Jianpeng + (cherry picked from commit c8e2b89cf6bc36a0ff29887b9e76cbbeceef9f8f) + +commit b42107584449e1f85cbee97bfa486ebeb310e6a1 +Author: Sage Weil +Date: Tue Aug 19 20:50:13 2014 -0700 + + os/FileJournal: signal aio_cond even if seq is 0 + + This can happen if we write a journal but no events. + + Reported-by: Somnath Roy + Reported-by: Ma, Jianpeng + Signed-off-by: Sage Weil + (cherry picked from commit 57778e2c577c1e1bbf9525232720a2994fa36abc) + +commit 4a43ea171d84a9f3a13064030eb386fcfbe3bbb8 +Author: Ma Jianpeng +Date: Wed Jul 23 10:10:38 2014 -0700 + + os/FileJournal: Update the journal header when closing journal + + When closing journal, it should check must_write_header and update + journal header if must_write_header alreay set. + It can reduce the nosense journal-replay after restarting osd. + + Signed-off-by: Ma Jianpeng + Reviewed-by: Sage Weil + (cherry picked from commit 5bf472aefb7360a1fe17601b42e551df120badfb) + +commit 0975ec9cec1c466f7b15f5173541a7eab02dae18 +Author: Sage Weil +Date: Tue Oct 21 06:53:36 2014 -0700 + + Revert "os/FileJournal: stop aio completion thread *after* writer thread" + + This reverts commit 334631ae4641824b3df49245f36a8fd4b143bf3f. + +commit 5a10b95f7968ecac1f2af4abf9fb91347a290544 +Merge: cc69c16 d5bac46 +Author: Samuel Just +Date: Fri Oct 17 10:47:22 2014 -0700 + + Merge pull request #2716 from ceph/wip-firefly-9419 + + Backport fix for bug #9419 + +commit cc69c16c2108cb231b2434a53e3eef51b597756b +Merge: f9cdaab 334631a +Author: Samuel Just +Date: Fri Oct 17 10:44:30 2014 -0700 + + Merge pull request #2724 from dachary/wip-9073-journal-aio-mode-firefly + + os/FileJournal: stop aio completion thread *after* writer thread + +commit f9cdaabe078415d7927e2618030996f2f24be2f1 +Merge: 0b4b34a 412c277 +Author: Sage Weil +Date: Fri Oct 17 08:20:53 2014 -0700 + + Merge pull request #2742 from ceph/firefly-unknown-locktype + + mds: reply -EOPNOTSUPP for unknown lock type + +commit 412c2770c74abea73a94e10df7b83ebe11ac82ee +Author: Yan, Zheng +Date: Tue Oct 14 22:02:41 2014 +0800 + + mds: reply -EOPNOTSUPP for unknown lock type + + Signed-off-by: Yan, Zheng + (cherry picked from commit 675392335c53ff7879031fb9184e4f35bcc90fe2) + +commit 1228658871e53e350bdab3e72cdefd1caf33c291 +Author: Sage Weil +Date: Sun Sep 21 15:56:18 2014 -0700 + + osd/ReplicatedPG: do not clone or preserve snapdir on cache_evict + + If we cache_evict a head in a cache pool, we need to prevent + make_writeable() from cloning the head and finish_ctx() from + preserving the snapdir object. + + Fixes: #8629 + Backport: firefly + Signed-off-by: Sage Weil + (cherry picked from commit ce8eefca13008a9cce3aedd67b11537145e1fd77) + +commit 88e6014463e86e48d78ac419226644209f83f2a0 +Author: Sage Weil +Date: Sun Sep 21 15:54:15 2014 -0700 + + ceph_test_rados_api_tier: add EvictSnap2 test case + + Verify an evict doesn't create a snapdir object. Reproduces #8629 + + Signed-off-by: Sage Weil + (cherry picked from commit 398c74eacb1ce4e573aef0d24718a5925d90272b) + +commit 0b4b34aac497d17a6474c35891aab2bde962524b +Merge: 322958a 0a72235 +Author: Sage Weil +Date: Thu Oct 16 06:09:51 2014 -0700 + + Merge pull request #2734 from ceph/wip-firefly-undump + + mds: fix --undump-journal + + Reviewed-by: Sage Weil + +commit 0a72235b0556752fadebc3e155ad41b13a0a15e9 +Author: John Spray +Date: Thu Oct 16 11:17:40 2014 +0100 + + mds: fix --undump-journal + + This hadn't worked for a long time. This is a fix + for firefly only, as this code was refactored in giant. 
+ + Signed-off-by: John Spray + +commit 835f8c6f6121f3ebdec3a0d2d5cb1376301dc03a +Author: Joao Eduardo Luis +Date: Tue Sep 23 14:02:55 2014 +0100 + + ceph-mon: check fs stats just before preforking + + Otherwise statfs may fail if mkfs hasn't been run yet or if the monitor + data directory does not exist. There are checks to account for the mon + data dir not existing and we should wait for them to clear before we go + ahead and check the fs stats. + + Signed-off-by: Joao Eduardo Luis + (cherry picked from commit 7f71c11666b25e91dd612c58b4eda9ac0d4752f8) + + Conflicts: + src/ceph_mon.cc + +commit 1ddf435464562f70f63cdb0032da3187f34ce853 +Author: Joao Eduardo Luis +Date: Thu Sep 18 16:53:43 2014 +0100 + + ceph_mon: check available storage space for mon data dir on start + + error out if available storage space is below 'mon data avail crit' + + Fixes: #9502 + + Signed-off-by: Joao Eduardo Luis + (cherry picked from commit 2da1a2914ac7df18ce842b0aac728fffb5bed2b6) + + Conflicts: + src/ceph_mon.cc + +commit 112317791b744d9890a65adcc13554c85e90f3af +Author: Joao Eduardo Luis +Date: Thu Sep 18 16:52:34 2014 +0100 + + mon: DataHealthService: use get_fs_stats() instead + + and relieve the DataStats struct from clutter by using + ceph_data_stats_t instead of multiple fields. + + Signed-off-by: Joao Eduardo Luis + (cherry picked from commit 9996d446988768658db751a7843b13cf3d194213) + + Conflicts: + src/mon/DataHealthService.cc + +commit f0a92d72fd44542619338db7d6da98e147b6a9fc +Author: Joao Eduardo Luis +Date: Thu Sep 18 16:32:20 2014 +0100 + + common: util: add get_fs_stats() function + + simplifies the task of obtaining available/used disk space, as well as + used available percentage. + + Signed-off-by: Joao Eduardo Luis + (cherry picked from commit 3d74230d1c0fbfa15487e2a90ac60b883476e840) + +commit a8fa009fbe5d5d4d9cfa134d5ecd05c92290a8eb +Author: Joao Eduardo Luis +Date: Thu Sep 18 16:25:44 2014 +0100 + + include/util.h: prevent multiple inclusion of header + + Signed-off-by: Joao Eduardo Luis + (cherry picked from commit 76eff9503493312cb97e4a2f9236f4dbcbf931df) + +commit e296685e8f3f5158238216eefb76482bd6d55134 +Author: Sage Weil +Date: Thu Sep 18 14:23:36 2014 -0700 + + mon: re-bootstrap if we get probed by a mon that is way ahead + + During bootstrap we verify that our paxos commits overlap with the other + mons we will form a quorum with. If they do not, we do a sync. + + However, it is possible we pass those checks, then fail to join a quorum + before the quorum moves ahead in time such that we no longer overlap. + Currently nothing kicks up back into a probing state to discover we need + to sync... we will just keep trying to call or join an election instead. + + Fix this by jumping back to bootstrap if we get a probe that is ahead of + us. Only do this from non probe or sync states as these will be common; + it is only the active and electing states that matter (and probably just + electing!). + + Fixes: #9301 + Backport: giant, firefly + Signed-off-by: Sage Weil + (cherry picked from commit c421b55e8e15ef04ca8aeb47f7d090375eaa8573) + +commit 0e57767d5fc524939e8968b506ce2fb3f4f80656 +Author: Sage Weil +Date: Thu Sep 18 14:11:24 2014 -0700 + + mon/Paxos: fix off-by-one in last_ vs first_committed check + + peon last_committed + 1 == leader first_committed is okay. Note that the + other check (where I clean up whitespace) gets this correct. 
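+
+    The corrected condition, written out with bare version numbers (a
+    sketch, not the Paxos code itself):
+
+        #include <cstdint>
+
+        // A peon can be caught up iff its history overlaps or exactly abuts
+        // the leader's; last_committed + 1 == first_committed is the abutting
+        // case the off-by-one wrongly rejected.
+        bool peon_is_contiguous(uint64_t peon_last_committed,
+                                uint64_t leader_first_committed) {
+          return peon_last_committed + 1 >= leader_first_committed;
+        }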
+ + Fixes: #9301 (partly) + Signed-off-by: Sage Weil + (cherry picked from commit d81cd7f86695185dce31df76c33c9a02123f0e4a) + +commit 1f4aaf648f4aa6f6056d0e8ce629eeea05c5424d +Author: Sage Weil +Date: Wed Aug 13 16:17:02 2014 -0700 + + mon/Paxos: share state and verify contiguity early in collect phase + + We verify peons are contiguous and share new paxos states to catch peons + up at the end of the round. Do this each time we (potentially) get new + states via a collect message. This will allow peons to be pulled forward + and remain contiguous when they otherwise would not have been able to. + For example, if + + mon.0 (leader) 20..30 + mon.1 (peon) 15..25 + mon.2 (peon) 28..40 + + If we got mon.1 first and then mon.2 second, we would store the new txns + and then boot mon.1 out at the end because 15..25 is not contiguous with + 28..40. However, with this change, we share 26..30 to mon.1 when we get + the collect, and then 31..40 when we get mon.2's collect, pulling them + both into the final quorum. + + It also breaks the 'catch-up' work into smaller pieces, which ought to + smooth out latency a bit. + + Signed-off-by: Sage Weil + (cherry picked from commit c54f1e4d66b22bad715ac17e9baa72ab93e48c46) + +commit 6c5b9a666fcd94e175a8b9771368b55246957efe +Author: Sage Weil +Date: Thu Aug 14 16:55:58 2014 -0700 + + mon/Paxos: verify all new peons are still contiguous at end of round + + During the collect phase we verify that each peon has overlapping or + contiguous versions as us (and can therefore be caught up with some + series of transactions). However, we *also* assimilate any new states we + get from those peers, and that may move our own first_committed forward + in time. This means that an early responder might have originally been + contiguous, but a later one moved us forward, and when the round finished + they were not contiguous any more. This leads to a crash on the peon + when they get our first begin message. + + For example: + + - we have 10..20 + - first peon has 5..15 + - ok! + - second peon has 18..30 + - we apply this state + - we are now 18..30 + - we finish the round + - send commit to first peon (empty.. we aren't contiguous) + - send no commit to second peon (we match) + - we send a begin for state 31 + - first peon crashes (it's lc is still 15) + + Prevent this by checking at the end of the round if we are still + contiguous. If not, bootstrap. This is similar to the check we do above, + but reverse to make sure *we* aren't too far ahead of *them*. + + Fixes: #9053 + Signed-off-by: Sage Weil + (cherry picked from commit 3e5ce5f0dcec9bbe9ed4a6b41758ab7802614810) + +commit 11d2c9dd4aeb835ca73bfb41fb15b1038547adf6 +Author: Sage Weil +Date: Wed Aug 13 16:01:01 2014 -0700 + + mon/Paxos: put source mon id in a temp variable + + Signed-off-by: Sage Weil + (cherry picked from commit bb046ed01ecf58b8c87eeeee2e00a476e6fba467) + +commit 322958a5aad82c031b54592b372aa053e8993be4 +Author: Sage Weil +Date: Wed Oct 15 12:26:00 2014 -0700 + + qa/workunits/rbd/import_export.sh: be case insensitive + + Stop tripping over this change (from dumpling). + + Signed-off-by: Sage Weil + (cherry picked from commit 5558afa03dbd1b20766b76e9410ef5bc3e73784f) + +commit 334631ae4641824b3df49245f36a8fd4b143bf3f +Author: Sage Weil +Date: Fri Aug 29 19:40:29 2014 -0700 + + os/FileJournal: stop aio completion thread *after* writer thread + + The writer thread may submit a new aio to update the header in its + final moments before shutting down. 
Do not stop the aio thread until after
+    that has happened or else we may not wait for those aio completions.
+
+    Signed-off-by: Sage Weil
+    (cherry picked from commit c776a89880fdac270e6334ad8e49fa616d05d0d4)
+
+    Conflicts:
+        src/os/FileJournal.cc
+
+commit 111eec9ff6325a12fcbf066ae08f27919aeae5d8
+Merge: cf4e300 6c0127f
+Author: Sage Weil
+Date: Tue Oct 14 14:57:42 2014 -0700
+
+    Merge remote-tracking branch 'gh/firefly' into firefly-next
+
+commit cf4e30095e8149d1df0f2c9b4c93c9df0779ec84
+Author: Xiaoxi Chen
+Date: Tue Aug 5 16:12:22 2014 +0800
+
+    mon/OSDMonitor: Use user-provided ruleset for replicated pool
+
+    When creating a replicated pool, ceph currently ignores the ruleset
+    name provided by the user and uses a global default ruleset.
+
+    This patch fixes that bug, so the ruleset specified by
+    ceph osd pool create replicated
+    can be properly set.
+
+    Signed-off-by: Xiaoxi Chen
+    (cherry picked from commit bf9726a294abd32b429170284ac328a592802648)
+
+commit bfd7da10e708a6eefc6d992b2b6337b7f06fd5ed
+Author: Loic Dachary
+Date: Tue Jun 3 13:05:19 2014 +0200
+
+    documentation: update osd pool create erasure
+
+    The properties are replaced with erasure code profiles. Remove the
+    reference to properties and the documentation of each erasure-code
+    related property.
+
+    Signed-off-by: Loic Dachary
+    (cherry picked from commit 8ff4edda73abb920c91e1226a330e3659def1fbe)
+
+commit afe6bd89f8a1588fb67063d1a08a4be8c1ab2ce6
+Author: Loic Dachary
+Date: Sat Oct 11 18:20:36 2014 +0200
+
+    rpm: 95-ceph-osd-alt.rules is not needed for centos7 / rhel7
+
+    The || instead of && had it always installed. That was fixed in EPEL
+    already.
+
+    http://tracker.ceph.com/issues/9747 Fixes: #9747
+
+    Signed-off-by: Loic Dachary
+    (cherry picked from commit 5ff4a850a0d809b3f25988c6cceb82c35095ef84)
+
+commit d5bac46e06c5420f29a021b294e391b2c6694cbd
+Author: David Zafman
+Date: Wed Sep 24 16:02:21 2014 -0700
+
+    osd: Return EOPNOTSUPP if a set-alloc-hint occurs with OSDs that don't support
+
+    Add CEPH_FEATURE_OSD_SET_ALLOC_HINT feature bit
+    Collect the intersection of all peer feature bits during peering
+    When handling CEPH_OSD_OP_SETALLOCHINT check that all OSDs support it
+    by checking for CEPH_FEATURE_OSD_SET_ALLOC_HINT feature bit.
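+
+    The intersection idea in miniature (bitmask arithmetic only; the
+    feature bit value below is a placeholder, not the real constant):
+
+        #include <cstdint>
+        #include <vector>
+
+        constexpr uint64_t FEATURE_SET_ALLOC_HINT = 1ull << 40;  // placeholder
+
+        uint64_t intersect_peer_features(const std::vector<uint64_t>& peers) {
+          uint64_t acting = ~0ull;   // start from "everything supported"
+          for (uint64_t f : peers)
+            acting &= f;             // keep only bits every peer advertises
+          return acting;
+        }
+
+        bool can_set_alloc_hint(uint64_t acting_features) {
+          return (acting_features & FEATURE_SET_ALLOC_HINT) != 0;
+        }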
+ + Fixes: #9419 + Backport: firefly + + Signed-off-by: David Zafman + (cherry picked from commit 9b39033f2b2bcdd2be0f6da4dff06023d0f77499) + + Conflicts: + + src/include/ceph_features.h + src/osd/PG.cc + src/osd/PG.h + src/osd/ReplicatedPG.cc + +commit de08802dcf35aea516d013d3d6116aaa7707b923 +Author: David Zafman +Date: Fri Sep 19 15:12:55 2014 -0700 + + osd: Remove unused PG functions queue_notify(), queue_info(), queue_log() + + Signed-off-by: David Zafman + (cherry picked from commit 70ef4c11cbae669799c30c7592073ad7aa11dcd6) + +commit 5b5aba73031e901457ca27cf15600ce1ca90e258 +Merge: 345714b a1ae7f4 +Author: Gregory Farnum +Date: Fri Oct 10 06:57:06 2014 -0700 + + Merge pull request #2691 from ceph/firefly-unused-variable + + Firefly unused variable + +commit a1ae7f471c809e69d363b9145e70160533bfa48c +Author: Yan, Zheng +Date: Fri Oct 10 21:36:39 2014 +0800 + + mds: Locker: remove unused variable + + Signed-off-by: Yan, Zheng + +commit 345714b6b4d004ad03cc7952dc56c6db87664ee4 +Merge: fd20a1d 2afb6fe +Author: Yan, Zheng +Date: Fri Oct 10 09:37:53 2014 +0800 + + Merge pull request #2681 from ceph/firefly-locker-null + + mds: Locker: fix a NULL deref in _update_cap_fields + +commit 2afb6febdd8482b8fec5890d79944d656faf1382 +Author: Greg Farnum +Date: Thu Oct 9 15:12:19 2014 -0700 + + mds: Locker: fix a NULL deref in _update_cap_fields + + The MClientCaps* is allowed to be NULL, so we can't deref it unless + the dirty param is non-zero. So don't do the ahead-of-time lookup; + just call it explicitly in the if block. + + Signed-off-by: Greg Farnum + (cherry picked from commit 3cd8a7fb9683577a7d6e934f18c29b7e84415be6) + +commit fd20a1d01bde67fb1edc6058e38435af9d5d6abc +Merge: e1bd1b2 86926c6 +Author: Loic Dachary +Date: Wed Oct 8 08:44:46 2014 +0200 + + Merge pull request #2662 from dachary/wip-9677-ioprio-class-firefly + + common: ceph_ioprio_string_to_class always returns -EINVAL + +commit 86926c6089d63014dd770b4bb61fc7aca3998542 +Author: Loic Dachary +Date: Tue Oct 7 14:06:38 2014 +0200 + + common: ceph_ioprio_string_to_class always returns -EINVAL + + The l string is always empty because std::transform needs a + pre-allocated string. Replace with the in-place version. Add unit tests. 
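+
+    The pitfall in miniature: writing through l.begin() on an empty string is
+    undefined behavior and copies nothing, so the lowercased string stayed
+    empty and the class lookup always failed. A sketch of both forms:
+
+        #include <algorithm>
+        #include <cctype>
+        #include <string>
+
+        std::string to_lower_broken(const std::string& s) {
+          std::string l;  // empty: no room for the transformed output!
+          std::transform(s.begin(), s.end(), l.begin(), ::tolower);  // UB
+          return l;
+        }
+
+        void to_lower_in_place(std::string& s) {
+          std::transform(s.begin(), s.end(), s.begin(), ::tolower);  // ok
+        }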
+
+    http://tracker.ceph.com/issues/9677 Fixes: #9677
+
+    Signed-off-by: Loic Dachary
+    (cherry picked from commit 3535b7aba3df8b54fa5117b8a9c2f52b8f0f118b)
+
+    Conflicts:
+        src/test/Makefile.am
+
+commit 5f2eec5036a2910aca1e8ce2d94444d3ed0477df
+Author: Loic Dachary
+Date: Tue Oct 7 14:05:08 2014 +0200
+
+    osd: log error if set_ioprio fails to parse class
+
+    Signed-off-by: Loic Dachary
+    (cherry picked from commit 5088e0d49332d579ba7e33c2c9baee3d5f701a3e)
+
+commit 2796d5151df4dcde324a4d09a83c9a779cece00e
+Author: Loic Dachary
+Date: Tue Oct 7 14:03:39 2014 +0200
+
+    common: set_ioprio debug message including pid
+
+    Signed-off-by: Loic Dachary
+    (cherry picked from commit 33339c7754875eb7e513345ee6b26a9b2b4d2707)
+
+commit d5ed6b0587b9999b2fd41377b0426e3b09ef8ab9
+Author: Loic Dachary
+Date: Tue Oct 7 14:02:09 2014 +0200
+
+    common: do not set ioprio if pid is not set
+
+    Signed-off-by: Loic Dachary
+    (cherry picked from commit c7e4c0bfe70bf29d3b8fe4df4e4b934853e33d26)
+
+commit e1bd1b2774f4eae5444b5f7b984193cb91a2dd98
+Merge: 726c6a1 459dca1
+Author: Sage Weil
+Date: Tue Oct 7 09:58:03 2014 -0700
+
+    Merge pull request #2632 from ceph/wip-9039-firefly
+
+    rgw: copy object data if target bucket is in a different pool
+
+commit 726c6a147a14c00cf12eb6c6561655475282419f
+Author: Sage Weil
+Date: Mon Oct 6 15:50:51 2014 -0700
+
+    debian/control: fix python-ceph -> ceph file move to allow upgrades
+
+    This is a backport of 5c6c366d2abe771c581690270c2d176ebb30c571 with the
+    version numbers changed, to compensate for the change in
+    fe3434f41cd09433975d7d0f9dbb2fae662e4a1b (backported in
+    bf1933e5c184476a354664c42fec834e9f59067c).
+
+    Tested-by: Tamil Muthamizhan
+    Signed-off-by: Sage Weil
+
+commit 884f7c40c4a28d519847d3995c8d98e5837ceaf0
+Merge: 31d57c9 c8a8e47
+Author: Sage Weil
+Date: Mon Oct 6 07:01:50 2014 -0700
+
+    Merge remote-tracking branch 'gh/wip-rpm-epoch-firefly' into firefly
+
+    Reviewed-by: Boris Ranto
+
+commit 31d57c9a28502a4a72f8aa141f7ed63ffe1e0192
+Merge: 9a3bac0 548be0b
+Author: Loic Dachary
+Date: Mon Oct 6 09:50:33 2014 +0200
+
+    Merge pull request #2643 from johnugeorge/wip-9492-crush-firefly
+
+    Crush: Backporting fixes for #9492 to firefly
+
+    Reviewed-by: Loic Dachary
+
+commit 548be0b2aea18ed3196ef8f0ab5f58a66e3a9af4
+Author: Johnu George
+Date: Mon Sep 29 10:07:44 2014 -0700
+
+    Crush: Ensuring at most num-rep osds are selected
+
+    Crush temporary buffers are allocated as per the replica size configured
+    by the user. When there are more final osds (to be selected as per the
+    rule) than the replicas, the buffer overlaps and causes a crash. Now it
+    is ensured that at most num-rep osds are selected even if more osds are
+    allowed by the indep rule. The fix for firstn rules is already merged as
+    part of bug #9492. Required test files are added.
+
+    Fixes: #9492
+
+    Signed-off-by: Johnu George johnugeo@cisco.com
+    (cherry picked from commit 234b066ba04976783d15ff2abc3e81b6cc06fb10)
+
+commit e30c570ce691a994898b4a933c57e7ae014cdc30
+Author: Johnu George
+Date: Wed Sep 24 09:32:50 2014 -0700
+
+    Crush: Ensuring at most num-rep osds are selected
+
+    Crush temporary buffers are allocated as per the replica size configured
+    by the user. When there are more final osds (to be selected as per the
+    rule) than the replicas, the buffer overlaps and causes a crash. Now it
+    is ensured that at most num-rep osds are selected even if more osds are
+    allowed by the rule.
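+
+    The clamp reduced to its essence (plain arrays rather than the CRUSH
+    internals):
+
+        #include <algorithm>
+
+        // Never emit more results than the caller allocated buffers for,
+        // even if the rule could have yielded more candidates.
+        int select_up_to(const int* candidates, int available,
+                         int* out, int num_rep) {
+          int n = std::min(available, num_rep);
+          std::copy(candidates, candidates + n, out);
+          return n;
+        }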
+ + Fixes: #9492 + + Signed-off-by: Johnu George + (cherry picked from commit 6b4d1aa99718e3b367496326c1e64551330fabc0) + +commit 9a3bac0c1a7a42cdf7ba846c9ad0a3ae0f15b4bb +Merge: 0b978fb 5a5e7e7 +Author: Sage Weil +Date: Fri Oct 3 11:58:41 2014 -0700 + + Merge pull request #2634 from dachary/wip-9653-ceph-disk-bootstrap-osd-firefly + + ceph-disk: bootstrap-osd keyring ignores --statedir (firefly) + +commit c8a8e4763a55dec44836bc679254ee7dcc448567 +Author: Sage Weil +Date: Wed Oct 1 12:33:38 2014 -0700 + + ceph.spec: fix typo + + Signed-off-by: Sage Weil + (cherry picked from commit da9ae5c92ecb4059e0ec6be5ce03af46430e76a5) + +commit b890c1e4706d7cfef7ed24c9df65b439b4f7ff1d +Author: Sage Weil +Date: Wed Oct 1 06:02:02 2014 -0700 + + ceph.spec.: add epoch + + This is done in fedora packaging. Do it here too so that you can move + between upstream packages (from ceph.com) and fedora and other derivatives + will builds. + + Backport: firefly, dumpling + Signed-off-by: Sage Weil + (cherry picked from commit 83888362089346e473d6fd6e1d366b826d7bd739) + + Conflicts: + + ceph.spec.in + +commit d01db8080d97bfae34dcee3d153bf10e6f5327dd +Author: Sage Weil +Date: Tue May 20 13:41:35 2014 -0700 + + ceph.spec.in: remove BuildRoot + + Deprecated + + Fixes: #8143 + Signed-off-by: Sage Weil + (cherry picked from commit 401319a1527dd9cb5398916105d31e7ec065763d) + +commit 3c2b5c440863df548afc2bd8aa5440f15a44ac02 +Author: Dan Mick +Date: Tue Aug 12 16:31:22 2014 -0700 + + ceph.spec.in: tests for rhel or centos need to not include _version + + rhel_version and centos_version are apparently the OpenSUSE Build + names; the native macros are just "rhel" and "centos" (and contain + a version number, should it be necessary). + + Signed-off-by: Dan Mick + (cherry picked from commit 7474f720c2418cf3d52b755f2b60c524e413570a) + +commit c82c29d6174022be45929fe9ba8a84993eef974a +Author: Dan Mick +Date: Tue Aug 12 14:46:52 2014 -0700 + + ceph.spec.in: Add a small comment on the empty %files section + + as suggested by Dan Mick. + + Signed-off-by: Erik Logtenberg + (cherry picked from commit e37b262c7928934530c5bb09fe56f83eb61f4244) + +commit a4f748aa906fc65b14f65515721bc3a815c18fb8 +Author: Dan Mick +Date: Tue Aug 12 14:39:18 2014 -0700 + + ceph.spec.in: Obsolete all older versions. + + Now this changeset can be used on all current ceph releases that already + have the package split. + + Signed-off-by: Erik Logtenberg + (cherry picked from commit 875a99e25f0ad2cb47149a3b5a28b4771a09125c) + +commit 74c7f3caec1cc7a5da89ef33de36a8b59249cfcd +Author: Dan Mick +Date: Tue Aug 12 14:09:43 2014 -0700 + + ceph.spec.in: No version on ceph-libs Obsoletes. + + If we are installing with the new package structure we don't ever want the + new package to co-exist with the old one; this includes the mistakenly- + released v0.81 on Fedora, which should be removed in favor of this + version. + + Signed-off-by: Sandon Van Ness + Reviewed-by: Dan Mick + (cherry picked from commit 8f95daf66b5fdb2a8141988480f984c1249599c5) + +commit 561261b6efb7ea442686bb8fce387c4de2482067 +Author: Sandon Van Ness +Date: Fri Aug 8 18:01:30 2014 -0700 + + ceph.spec.in: Obselete ceph-libcephfs (not libcephfs) + + I am guessing that because it was a sub-package libcephfs was mistakenly + used instead of ceph-libcephfs. 
+ + Signed-off-by: Sandon Van Ness + (cherry picked from commit 75985024bd30ca6fbe4c61aa7f7cbe5306c9a988) + +commit 107bfd9ee7dbf360561187b9e0946964d40b9b1c +Author: Erik Logtenberg +Date: Fri Aug 1 14:20:18 2014 +0200 + + ceph.spec.in: We need those nice recent changes for rhel7 in Fedora too. + + Signed-off-by: Erik Logtenberg + (cherry picked from commit 00877ae502ac52613bcd5c5c834d72787d668dca) + +commit 7946c5e5de8d6cd25d20beee15f3489113e51539 +Author: Dan Mick +Date: Wed Aug 27 12:56:43 2014 -0700 + + Move fedora patch file (used by ceph.spec.in) to rpm/ subdir + + Signed-off-by: Dan Mick + (cherry picked from commit 06b92cee621cbe33a6f17e8c64169db4453a5160) + +commit cb2ae9afa611175226efb5544f7d2aa705d55ece +Author: Erik Logtenberg +Date: Fri Aug 1 00:13:50 2014 +0200 + + ceph.spec.in, init-ceph.in: Don't autostart ceph service on Fedora. + + This patch is taken from the current Fedora package and makes the upstream + ceph.spec compliant with Fedora policy. The goal is to be fully compliant + upstream so that we can replace current Fedora package with upstream + package to fix many bugs in Fedora. + + Addition from Dan Mick : + Do this for RHEL and Centos as well, since they surely will benefit + from the same policy. Note: this requires changes to + autobuild-ceph and ceph-build scripts, which currently copy + only the dist tarball to the rpmbuild/SOURCES dir. + + Signed-off-by: Erik Logtenberg + Signed-off-by: Dan Mick : + (cherry picked from commit 461523b06cdf93e32f1d8b354ac3799e73162d33) + +commit 2b11376f1ee8925ab16065ebda912b11d3d7be59 +Author: Erik Logtenberg +Date: Thu Jul 31 23:54:03 2014 +0200 + + ceph.spec.in: Add obsoletes for libcephfs + + This fixes a bug for Fedora: + https://bugzilla.redhat.com/show_bug.cgi?id=1116614 + + Signed-off-by: Erik Logtenberg + (cherry picked from commit e9da2d8f2142771f206ef67f19e7f194855275d0) + +commit eefc62e3b85f402a7e1ae31c272c8a432d979379 +Author: Erik Logtenberg +Date: Thu Jul 31 23:49:56 2014 +0200 + + ceph.spec.in: add ceph-libs-compat + + Added a ceph-libs-compat package in accordance with Fedora packaging + guidelines [1], to handle the recent package split more gracefully. + In Fedora this is necessary because there are already other packages + depending on ceph-libs, that need to be adjusted to depend on the new + split packages instead. In the mean time, ceph-libs-compat prevents + breakage. + + [1] http://fedoraproject.org/wiki/Upgrade_paths_%E2%80%94_renaming_or_splitting_packages + + Signed-off-by: Erik Logtenberg + (cherry picked from commit 6c264f2204cbd54d90b02101e40ac9aa5aa72d7c) + + Conflicts: + + ceph.spec.in + +commit 0b978fb15a1307644aba3119419bb7386f98ee04 +Author: Sage Weil +Date: Sun Aug 10 14:41:19 2014 -0700 + + mon/Paxos: add perfcounters for most paxos operations + + I'm focusing primarily on the ones that result in IO here. + + Signed-off-by: Sage Weil + (cherry picked from commit b09b8563d35dda23faed43afef2a983e93a879c5) + +commit 74aa7afc719e517dbed300f802c1bc2dafe43ee0 +Author: Sage Weil +Date: Sun Aug 10 14:00:11 2014 -0700 + + mon/MonitorDBStore: add get_{keys,bytes}() accounting to Transaction + + Signed-off-by: Sage Weil + (cherry picked from commit fd421b26748e872ddf8e0f068dda2106853edff1) + +commit 5a5e7e7bcedbccbe4ae8aab159af6d8615eb3887 +Author: Loic Dachary +Date: Fri Oct 3 14:08:57 2014 +0200 + + ceph-disk: bootstrap-osd keyring ignores --statedir + + The STATEDIR variable is used to initialize the bootstrap-osd keyring + before it gets a chance to be overriden by --statedir. 
Replace it with
+    {statedir} so that it can be substituted after all options have been
+    parsed.
+
+    http://tracker.ceph.com/issues/9653 Fixes: #9653
+
+    Signed-off-by: Loic Dachary
+    (cherry picked from commit fa0bd06b4657e5b84e237b76033ac3d3478b6a1f)
+
+commit 459dca1613a14cfad8d3afd7e3c783d825573a42
+Author: Yehuda Sadeh
+Date: Tue Aug 12 13:36:11 2014 -0700
+
+    rgw: copy object data if target bucket is in a different pool
+
+    Fixes: #9039
+    Backport: firefly
+
+    The new manifest does not provide a way to put the head and the tail in
+    separate pools. In any case, if an object is copied between buckets in
+    different pools, we may really just want the object to be copied, rather
+    than reference counted.
+
+    Signed-off-by: Yehuda Sadeh
+    (cherry picked from commit 5d3a7e595f47455896304bf358e5251915d0f16f)
+
+commit 711a7e6f81983ff2091caa0f232af914a04a041c
+Author: Jason Dillaman
+Date: Mon Sep 15 00:53:50 2014 -0400
+
+    rbd: ObjectCacher reads can hang when reading sparse files
+
+    The pending read list was not properly flushed when empty objects
+    were read from a sparse file.
+
+    Signed-off-by: Jason Dillaman
+    (cherry picked from commit cdb7675a21c9107e3596c90c2b1598def3c6899f)
+
+commit b7784dc1baa47560a733fe9dcd2acec51bc93165
+Author: Jason Dillaman
+Date: Sat Sep 6 22:59:40 2014 -0400
+
+    Enforce cache size on read requests
+
+    In-flight cache reads were not previously counted against
+    new cache read requests, which could result in very large
+    cache usage. This effect is most noticeable when writing
+    small chunks to a cloned image since each write requires
+    a full object read from the parent.
+
+    Signed-off-by: Jason Dillaman
+    (cherry picked from commit 4fc9fffc494abedac0a9b1ce44706343f18466f1)
+
+commit ac4fca065a12f3b00a47bf5ec37983696255174b
+Author: Alexandre Marangone
+Date: Fri Sep 5 10:36:24 2014 -0700
+
+    rgw: add .log to default log path
+
+    Fixes: #9353
+    Signed-off-by: Alexandre Marangone
+    (cherry picked from commit 46732420897a2619059050044f4980a4737df43e)
+
+commit f03ae7b00e5694f1670493396a1cee195fcc6b35
+Merge: 78c3ef9 ee02cfd
+Author: Yehuda Sadeh
+Date: Thu Oct 2 15:28:40 2014 -0700
+
+    Merge pull request #2565 from ceph/wip-rgw-firefly-backports
+
+    Wip rgw firefly backports
+
+commit 78c3ef90604ca117255cefe232771a7564fed8b1
+Author: Sage Weil
+Date: Thu Sep 25 13:16:52 2014 -0700
+
+    osdc/Objecter: only post_rx_buffer if no op timeout
+
+    If we post an rx buffer and there is a timeout, the revocation can happen
+    while the reader has consumed the buffers but before it has decoded and
+    constructed the message. In particular, we calculate a crc32c over the
+    data portion of the message after we've taken the buffers and dropped the
+    lock.
+
+    Instead of fixing this race (for example, by reverifying rx_buffers under
+    the lock while calculating the crc.. bleh), just skip the rx buffer
+    optimization entirely when a timeout is present.
+
+    Note that this doesn't cover the op_cancel() paths, but none of those users
+    provide static buffers to read into.
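+
+    The shape of the workaround, sketched with hypothetical names (the real
+    Objecter logic is considerably more involved):
+
+        #include <chrono>
+
+        struct OpStub {
+          std::chrono::seconds timeout{0};
+          bool wants_static_rx_buffer = false;
+        };
+
+        bool should_post_rx_buffer(const OpStub& op) {
+          // A timed-out op can have its rx buffer revoked while the reader
+          // is mid-decode, so forgo the zero-copy path entirely in that case.
+          if (op.timeout.count() > 0)
+            return false;
+          return op.wants_static_rx_buffer;
+        }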
+ + Fixes: #9582 + Backport: firefly, dumpling + Signed-off-by: Sage Weil + + backport of 126d0b30e990519b8f845f99ba893fdcd56de447 + +commit a261b4952056aab7b067453930342960bbe55089 +Author: Sage Weil +Date: Mon Sep 29 14:28:32 2014 -0700 + + debian: move ceph_rest_api.py into ceph + + Signed-off-by: Sage Weil + (cherry picked from commit fe3434f41cd09433975d7d0f9dbb2fae662e4a1b) + +commit eb0f6e347969b40c0655d3165a6c4531c6b595a3 +Author: Sage Weil +Date: Mon Sep 29 14:24:01 2014 -0700 + + ceph.spec.in: move ceph_rest_api.py into ceph + + Signed-off-by: Sage Weil + (cherry picked from commit 8cda623e0ba34a48a70e9ea988d619b15605c4fd) + +commit c4188e31f7bc8f3c337e637cd99c41d5ee4b6787 +Author: Sage Weil +Date: Mon Sep 29 13:44:03 2014 -0700 + + ceph.spec: fix python-flask dependency + + This is needed by ceph-rest-api, which is in ceph.rpm; it's not related to + python-ceph (except that ceph-rest-api happens to require that too). + + Backport: firefly + Signed-off-by: Sage Weil + (cherry picked from commit b2416240b88b2e067dfc79a2723335f1584562d0) + +commit bf1933e5c184476a354664c42fec834e9f59067c +Author: Sage Weil +Date: Mon Sep 29 13:40:18 2014 -0700 + + debian: python-flask is needed by ceph, not python-ceph + + It's used by ceph-rest-api which is in the 'ceph' (server) package. + + Backport: firefly + Signed-off-by: Sage Weil + (cherry picked from commit e42424e777e4f7d8b03650482253734c1fa8709d) + + Conflicts: + + debian/control + +commit 94a7fbaa11c51db294dce0dc1df728f69aef5bf8 +Author: Danny Al-Gaaf +Date: Fri Sep 19 12:25:07 2014 +0200 + + rgw_main.cc: add missing virtual destructor for RGWRequest + + CID 1160858 (#1 of 1): Non-virtual destructor (VIRTUAL_DTOR) + nonvirtual_dtor: Class RGWLoadGenRequest has a destructor + and a pointer to it is upcast to class RGWRequest which doesn't + have a virtual destructor. + + Signed-off-by: Danny Al-Gaaf + (cherry picked from commit b82ceda7775ff85943d9143b73789eb37b09bfa9) + +commit 9fee8de25ab5c155cd6a3d32a71e45630a5ded15 +Author: Greg Farnum +Date: Mon Sep 29 16:10:36 2014 -0700 + + Locker: accept ctime updates from clients without dirty write caps + + The ctime changes any time the inode does. That can happen even without + the file itself having changed, so we'd better accept the update whenever + the auth caps have dirtied, without worrying about the file caps! + + Fixes: #9514 + Backport: firefly + + Signed-off-by: Greg Farnum + Reviewed-by: Sage Weil + Reviewed-by: John Spray + (cherry picked from commit 0ea20a668cf859881c49b33d1b6db4e636eda18a) + +commit 461ece5e9fb1d4994a6214a3b6bdae136773629d +Author: Sage Weil +Date: Wed Oct 1 18:01:51 2014 -0700 + + doc/release-notes: fix attributions for 8702 fix + + Oops! 
+
+    Signed-off-by: Sage Weil
+    (cherry picked from commit 188370a94353e29fcb8981699022803e23f3fedd)
+
+commit 917529a78e5046f621df5c48fe5d50d2f7e56560
+Author: Sage Weil
+Date: Wed Oct 1 17:48:12 2014 -0700
+
+    doc/release-notes: v0.80.6
+
+    Signed-off-by: Sage Weil
+    (cherry picked from commit c0dc3a56974a469b61523b67cc032cc5726a3a5f)
+
+    Conflicts:
+
+        doc/release-notes.rst
+
+commit 060a5b1422fcdfb8e84636579a2f0c2f1ec14300
+Author: Adam Crume
+Date: Thu Sep 18 16:57:27 2014 -0700
+
+    common: Add cctid meta variable
+
+    Fixes: #6228
+    Signed-off-by: Adam Crume
+    (cherry picked from commit bb45621cb117131707a85154292a3b3cdd1c662a)
+
+commit a1aa06b7fb30e509193e1b1bb7355b5f21aedc82
+Author: Samuel Just
+Date: Wed Oct 1 14:30:59 2014 -0700
+
+    ReplicatedPG: dump snap_trimq on pg query
+
+    Signed-off-by: Samuel Just
+
+commit 34f38b68d89baf1dcbb4571d4f4d3076dc354538
+Author: Samuel Just
+Date: Mon Sep 29 16:26:54 2014 -0700
+
+    ReplicatedPG: do not queue the snap trimmer constantly
+
+    Previously, we continuously requeued the snap trimmer while in
+    TrimmingObjects. This is not a good idea now that we try to
+    limit the number of snap trimming repops in flight and requeue
+    the snap trimmer directly as those repops complete.
+
+    Fixes: #9113
+    Backport: giant, dumpling, firefly
+    Signed-off-by: Samuel Just
+
+commit b29bf00f68cf133151c98db06e9498b3e8be22ed
+Author: Samuel Just
+Date: Wed Sep 24 13:55:47 2014 -0700
+
+    ReplicatedPG: clean out completed trimmed objects as we go
+
+    Also, explicitly maintain a max number of concurrently trimming
+    objects.
+
+    Fixes: #9113
+    Backport: dumpling, firefly, giant
+    Signed-off-by: Samuel Just
+
+commit ee02cfd23facb3404fc377f643b213c2f498474d
+Author: Yehuda Sadeh
+Date: Thu Sep 18 20:53:10 2014 -0700
+
+    rgw: calculate hash after writing data
+
+    Since data is written asynchronously, we should do the hash calculation
+    while it's pending.
+
+    Signed-off-by: Yehuda Sadeh
+    (cherry picked from commit 5bb94ede19a50543a02a8019ed6c9680b3852d4e)
+
+commit 216730221575d88a72b06ed3d71c9a54cffc5719
+Author: Yehuda Sadeh
+Date: Thu Sep 18 20:51:02 2014 -0700
+
+    crypto: don't hash zero sized buffer
+
+    libnss returns an error and we assert in that case.
+
+    Signed-off-by: Yehuda Sadeh
+    (cherry picked from commit 7b137246b49a9f0b4d8b8d5cebfa78cc1ebd14e7)
+
+commit dab7a4f6ad054b53cedca76ee329a6395918b1ab
+Author: Yehuda Sadeh
+Date: Fri Sep 12 14:07:44 2014 -0700
+
+    rgw: push hash calculation deeper
+
+    This might have been the culprit for #9307. Before, we were calculating
+    the hash after the call to processor->handle_data(); however, that
+    method might have spliced the bufferlist, so we can't be sure that the
+    pointer that we were holding originally is still valid. Instead, push
+    the hash calculation down. Added a new explicit complete_hash() call to
+    the processor, since when we're at complete() it's too late (we need to
+    have the hash at that point already).
+
+    Signed-off-by: Yehuda Sadeh
+    (cherry picked from commit d41c3e858c6f215792c67b8c2a42312cae07ece9)
+
+    Conflicts:
+        src/rgw/rgw_rados.h
+
+commit bd0a91343ce70d71acced753688a502b7e8b552e
+Author: Yehuda Sadeh
+Date: Thu Aug 21 16:30:10 2014 -0700
+
+    rgw: separate civetweb log from rgw log
+
+    The civetweb log is now independent of the rgw log.
+ + Signed-off-by: Yehuda Sadeh + (cherry picked from commit 850242cad749e33e1e6bc008baa75c8ea7eda0c1) + + Conflicts: + src/civetweb + src/rgw/rgw_main.cc + +commit a777562b780e7ab312f881c38b1db26983a1ac47 +Author: Yehuda Sadeh +Date: Tue Sep 23 13:40:39 2014 -0700 + + civetweb: update submodule + + Update submodule to include multiple fixes. + + Signed-off-by: Yehuda Sadeh + +commit 1bdcc079d79d8211b44e2a46511cd2240f71744b +Author: Yehuda Sadeh +Date: Fri Aug 22 15:12:16 2014 -0700 + + rgw: convert header field underscores into dashes + + Fixes: 9206 + Backport: firefly + + Certain web servers filter out underscores in the header field name. + Convert them into dashes. + + Signed-off-by: Yehuda Sadeh + (cherry picked from commit 11acb7097ce21c6218dd48d0c21e0e04a361eb9a) + +commit b8fa2ed60b6cce51701df972dbb6f5e02e0d84ba +Author: Yehuda Sadeh +Date: Mon Aug 25 10:38:42 2014 -0700 + + rgw: fix test to identify whether object has tail + + Fixes: #9226 + Reported-by: Sylvain Munaut + Backport: firefly + + We need to identify whether an object is just composed of a head, or + also has a tail. Test for pre-firefly objects ("explicit objs") was + broken as it was just looking at the number of explicit objs in the + manifest. However, this is insufficient, as we might have empty head, + and in this case it wouldn't appear, so we need to check whether the + sole object is actually pointing at the head. + + Signed-off-by: Yehuda Sadeh + (cherry picked from commit 751b3e26532932a42ca34f9c062a0a3e29a58cff) + +commit 6fee71154d838868807fd9824d829c8250d9d2eb +Author: Yehuda Sadeh +Date: Wed Aug 27 17:44:18 2014 -0700 + + rgw: don't try to authenticate a CORS preflight request + + Fixes: #8718 + Backport: firefly + + CORS preflight requests don't need to be authenticated. Treat them as + coming from anonymous user. + + Reported-by: Robert Hubbard + Signed-off-by: Yehuda Sadeh + (cherry picked from commit 848fcf7871e07fc689bdcd18943ace36b2f4906e) + +commit c75a79cbac30cd14d37d89217113824d98693572 +Author: Yehuda Sadeh +Date: Thu Aug 14 13:35:12 2014 -0700 + + rgw: fix compilation + + RGWRadosPutObj couldn't refer to the ceph context. + + Reviewed-by: Sage Weil + Signed-off-by: Yehuda Sadeh + (cherry picked from commit 6a555434ee3edaf742ee7e5910bcba8dd0de46dd) + +commit b0d08aab837808f18708a4f8ced0503c0fce2fec +Author: Yehuda Sadeh +Date: Tue Aug 12 11:17:47 2014 -0700 + + rgw: call throttle_data() even if renew_state() failed + + Otherwise we're going to leak the aio callback handle. + + Signed-off-by: Yehuda Sadeh + (cherry picked from commit 7998c322179dd69a1250937321c3c2bb023e0e57) + +commit a953b313f1e2f884be6ee2ce356780f4f70849dd +Author: Yehuda Sadeh +Date: Wed Jul 30 21:32:48 2014 -0700 + + rgw: disable civetweb url decoding + + Fixes: #8621 + + We want to have the raw request uri, as we do the decoding ourselves. + + Signed-off-by: Yehuda Sadeh + (cherry picked from commit ffac52b316e7022796d44ae58804d9c20b9c3df9) + +commit ba5357714a19b8af989fef1c75ef775837c6a9d6 +Author: Yehuda Sadeh +Date: Tue Aug 12 14:23:46 2014 -0700 + + rgw: copy_obj_data() uses atomic processor + + Fixes: #9089 + + copy_obj_data was not using the current object write infrastructure, + which means that the end objects weren't striped. 
+ + Signed-off-by: Yehuda Sadeh + (cherry picked from commit 800eff24824c0083b8e2441fc34e0bdca5da36dc) + +commit d73dbc3a39117eddaaabb2c25d9238cd7c51711b +Author: Yehuda Sadeh +Date: Thu Aug 21 21:53:38 2014 -0700 + + rgw: clear bufferlist if write_data() successful + + Fixes: #9201 + Backport: firefly + + We sometimes need to call RGWPutObjProcessor::handle_data() again, + so that we send the pending data. However, we failed to clear the buffer + that was already sent, thus it was resent. This triggers when using non + default pool alignments. + + Signed-off-by: Yehuda Sadeh + (cherry picked from commit 9181114d6f6062c55ee4b351fc3495345e545c36) + +commit 4be53d5eebbc294878ba38050d841359b6c8e19e +Author: Samuel Just +Date: Tue Sep 23 12:16:55 2014 -0700 + + PG: check full ratio again post-reservation + + Otherwise, we might queue 30 pgs for backfill at 0.80 fullness + and then never check again filling the osd after pg 11. + + Fixes: #9574 + Backport: dumpling, firefly, giant + Signed-off-by: Samuel Just diff --git a/doc/changelog/v0.80.9.txt b/doc/changelog/v0.80.9.txt new file mode 100644 index 0000000000000..aa274d1378075 --- /dev/null +++ b/doc/changelog/v0.80.9.txt @@ -0,0 +1,1148 @@ +commit b5a67f0e1d15385bc0d60a6da6e7fc810bde6047 (tag: refs/tags/v0.80.9, refs/remotes/gh/firefly) +Author: Jenkins +Date: Mon Mar 9 10:42:08 2015 -0700 + + 0.80.9 + +commit 37901afd1556257151c029395caa1143e84860f2 +Merge: 00e5947 3dac68a +Author: Loic Dachary +Date: Tue Mar 3 15:58:53 2015 +0100 + + Merge pull request #3852 from dachary/wip-10965-rgw-firefly + + rgw: enable ipv6 in civetweb + + Reviewed-by: Loic Dachary + +commit 00e5947a28f7810a28329d2f4901aed09a289421 +Merge: 293222a b13f483 +Author: Loic Dachary +Date: Tue Mar 3 15:58:39 2015 +0100 + + Merge pull request #3853 from dachary/wip-10907-rgw-firefly + + rgw: pass civetweb configurables to civetweb + + Reviewed-by: Loic Dachary + +commit 293222a284c072a13950831205c106fec7a400df +Merge: 6512b06 d57b38f +Author: Loic Dachary +Date: Tue Mar 3 15:58:25 2015 +0100 + + Merge pull request #3851 from dachary/wip-10978-rgw-firefly + + rgw: don't overwrite bucket / object owner when setting acls + + Reviewed-by: Loic Dachary + +commit d57b38f85b683dfb365c3cb98362d486594f9eb3 +Author: Yehuda Sadeh +Date: Fri Feb 27 15:32:50 2015 -0800 + + rgw: don't overwrite bucket / object owner when setting acls + + Fixes: #10978 + Backport: hammer, firefly + + Signed-off-by: Yehuda Sadeh + (cherry picked from commit eb13f2d4b60c031f16139f7cc4237c012644dd78) + + Conflicts: + src/rgw/rgw_op.cc : trivial s/.empty()/== NULL/ + +commit b13f483e51b37a768c7f4313b6933bf648950c7d +Author: Yehuda Sadeh +Date: Tue Feb 17 15:05:40 2015 -0800 + + rgw: pass civetweb configurables to civetweb + + Fixes: #10907 + Backport: firefly + + Pass any configurables in the rgw frontends config line to civetweb. + + Signed-off-by: Yehuda Sadeh + (cherry picked from commit 986d7554426764a149621ba733c5c075b94e0431) + +commit 3dac68a17a909b212a36b0a3c0ae2c47d323deee +Author: Yehuda Sadeh +Date: Fri Feb 27 08:14:27 2015 -0800 + + civetweb: update submodule + + Signed-off-by: Yehuda Sadeh + (cherry picked from commit f28fe744285117a1715eac0d08911cdb37285103) + +commit 14aca3af18119a76c1cdfa6d71e6085d360e45e2 +Author: Yehuda Sadeh +Date: Fri Feb 27 08:14:41 2015 -0800 + + rgw: update makefile to enable civetweb config + + Fixes: #10965 + Backport: hammer, firefly + + Civetweb compilation now includes conf header to enable ipv6. 
+ + Signed-off-by: Yehuda Sadeh + (cherry picked from commit caa90225bad9fe5e9c275e6189b3396b4d396e3f) + + Conflicts: + src/rgw/Makefile.am + radosgw_CFLAGS = -Icivetweb/include + was not yet changed into + radosgw_CFLAGS = -I$(srcdir)/civetweb/include + in firefly + +commit 6512b06fc8a80e3befbe15b543f8850991f74e8a +Merge: 7b748c6 555cc42 +Author: Loic Dachary +Date: Sat Feb 28 16:47:55 2015 +0100 + + Merge pull request #3820 from tchaikov/firefly-pg-leak-10421 + + osd: fix PG leak in SnapTrimWQ._clear() + + Reviewed-by: Loic Dachary + +commit 7b748c62764311572342593820ce3f28f74fe9ca +Merge: d971c95 da95149 +Author: Loic Dachary +Date: Sat Feb 28 16:46:42 2015 +0100 + + Merge pull request #3772 from ceph/wip-10883-firefly + + osd: Fix FileJournal wrap to get header out first + + Reviewed-by: Loic Dachary + +commit d971c95c3d8b48494d05b68f40c4c9a4cc6f87a8 +Merge: 0820041 e539971 +Author: Loic Dachary +Date: Sat Feb 28 16:45:48 2015 +0100 + + Merge pull request #3521 from dzafman/wip-10676 + + Backport doc fixes that appear to apply to firefly + + Reviewed-by: Loic Dachary + +commit 555cc42fc826fd801f0d45187429079d4072d129 +Author: Kefu Chai +Date: Tue Feb 10 16:29:45 2015 +0800 + + osd: fix PG leak in SnapTrimWQ._clear() + + Fixes: #10421 + Signed-off-by: Kefu Chai + (cherry picked from commit 01e154d592d6cdbf3f859cf1b4357e803536a6b4) + +commit 0820041e6515c82c41b81b9e6825e2dd5fcb8165 +Merge: 6565774 702dbc0 +Author: Gregory Farnum +Date: Thu Feb 26 15:59:24 2015 -0800 + + Merge pull request #3730 from ceph/wip-firefly-flock + + backport ceph-fuse file locking patches to Firefly + + Reviewed-by: Yan, Zheng + +commit 6565774d0356efc6225ad561bb13e7cf11da7b1e +Merge: 7ab02ee 08c2fda +Author: Josh Durgin +Date: Thu Feb 26 14:18:10 2015 -0800 + + Merge pull request #3406 from ceph/wip-10299-firefly + + librbd: complete all pending aio ops prior to closing image + + Reviewed-by: Josh Durgin + +commit 08c2fda12cf46937a09a59bb032379c3c5321292 +Author: Jason Dillaman +Date: Mon Dec 15 10:53:53 2014 -0500 + + librbd: complete all pending aio ops prior to closing image + + It was possible for an image to be closed while aio operations + were still outstanding. Now all aio operations are tracked and + completed before the image is closed. 
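+
+    One way to picture the tracking, as a condition-variable drain
+    (simplified; not the librbd implementation):
+
+        #include <condition_variable>
+        #include <mutex>
+
+        class PendingAio {
+          std::mutex m;
+          std::condition_variable cv;
+          int in_flight = 0;
+        public:
+          void start()  { std::lock_guard<std::mutex> l(m); ++in_flight; }
+          void finish() {
+            std::lock_guard<std::mutex> l(m);
+            if (--in_flight == 0) cv.notify_all();
+          }
+          // Called on close: block until every tracked aio has completed.
+          void drain() {
+            std::unique_lock<std::mutex> l(m);
+            cv.wait(l, [this] { return in_flight == 0; });
+          }
+        };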
+
+    Fixes: #10299
+    Backport: giant, firefly, dumpling
+    Signed-off-by: Jason Dillaman
+
+commit 7ab02ee5afb3d017b94d58c3dfc7731f7a3866d9
+Merge: 0c0a552 c23e42e
+Author: Josh Durgin
+Date: Thu Feb 26 14:12:29 2015 -0800
+
+    Merge pull request #3404 from ceph/wip-10270-firefly
+
+    librbd: gracefully handle deleted/renamed pools
+
+    Reviewed-by: Josh Durgin
+
+commit 0c0a5520b1b883bcdd5b865b217ba61e471e3ca2
+Merge: 9ef7743 836ab86
+Author: Josh Durgin
+Date: Thu Feb 26 13:31:37 2015 -0800
+
+    Merge pull request #3410 from ceph/wip-9854-firefly
+
+    osdc: Constrain max number of in-flight read requests
+
+    Reviewed-by: Josh Durgin
+
+commit da951497b7e5ec227aa6a5e459b0d04d8b88ef13 (refs/remotes/gh/wip-10883-firefly)
+Author: David Zafman
+Date: Wed Feb 18 16:21:12 2015 -0800
+
+    osd: Fix FileJournal wrap to get header out first
+
+    Correct and restore assert that was removed
+
+    Caused by f46b1b473fce0322a672b16c7739e569a45054b6
+    Fixes: #10883
+    Backport: dumpling, firefly, giant
+
+    Signed-off-by: David Zafman
+    (cherry picked from commit 970bb4901f93575709421b5b25c3eff213de61b8)
+
+commit 702dbc0a247c149d53b52d1929f9880bc99d0522
+Author: Yan, Zheng
+Date: Wed Oct 15 12:00:58 2014 +0800
+
+    qa/workunits/fs/misc: Add a workunit for file lock interruption
+
+    Signed-off-by: Yan, Zheng
+    (cherry picked from commit ac92c455a9aa19e4288acdf0c9a746e03a640efb)
+
+commit d2523b82c5c7b29293d70a66ba95493a1564a840
+Author: Yan, Zheng
+Date: Wed Oct 15 12:03:46 2014 +0800
+
+    mds: fix neighbor lock check
+
+    Signed-off-by: Yan, Zheng
+    (cherry picked from commit b0e6e85aa08ea74cd209aad04f3f0bf991761e12)
+
+commit f9b6b66b05ddadef043d81676728bf40730ea16c
+Author: Yan, Zheng
+Date: Mon Oct 13 11:34:18 2014 +0800
+
+    client: use finisher to abort MDS request
+
+    When a request is interrupted, libfuse first locks an internal mutex,
+    then calls the interrupt callback. libfuse needs to lock the same mutex
+    when unregistering the interrupt callback. We unregister the interrupt
+    callback while client_lock is locked, so we can't acquire the client_lock
+    in the interrupt callback.
+
+    Signed-off-by: Yan, Zheng
+    (cherry picked from commit 09699454e729592d426aeff5b578697e850af12e)
+
+    Conflicts:
+        src/client/Client.cc
+        src/client/Client.h
+
+    Signed-off-by: Greg Farnum
+
+commit ea355e9ca5203b77e6f74ca4a3e39ce23cc86f67
+Author: Yan, Zheng
+Date: Thu Oct 9 13:16:18 2014 +0800
+
+    client: use atomic variable to track reference of MetaRequest
+
+    This allows us to increase the reference count of a MetaRequest while
+    not holding the client_lock.
+
+    Signed-off-by: Yan, Zheng
+    (cherry picked from commit e464a7765230c504b3e3b88bcb1106f67b7c3eb9)
+
+commit ccbdf514717cb0e48d67b57197d6e538faeea415
+Author: Yan, Zheng
+Date: Mon Oct 13 10:44:46 2014 +0800
+
+    client: allow interrupting blocked file lock operation
+
+    This commit introduces two new types of setfilelock request. Unlike
+    the setfilelock (UNLOCK) request, these two new types of setfilelock
+    request do not drop locks that have already been acquired; they only
+    interrupt blocked setfilelock requests.
+
+    Signed-off-by: Yan, Zheng
+    (cherry picked from commit 4134c149d3759dd6a3aaa1a353b77bbfe8e9491b)
+
+commit 875e2fcb060554941d94714ad48ebbc0cbbf8077
+Author: Yan, Zheng
+Date: Thu Oct 9 09:42:08 2014 +0800
+
+    client: register callback for fuse interrupt
+
+    libfuse allows a program to register a callback for interrupts. When a
+    file system operation is interrupted, the fuse kernel driver sends an
+    interrupt request to libfuse.
libfuse calls the interrupt callback when receiving + interrupt request. + + Signed-off-by: Yan, Zheng + (cherry picked from commit 289e8b4a7efa1ae6427115af9bbe541c9e1f0e90) + +commit c96aabbd54b24a0273af21a450cd6f517fe4ada0 +Author: Yan, Zheng +Date: Sat Oct 4 09:14:44 2014 +0800 + + client: add helper function that updates lock state + + Signed-off-by: Yan, Zheng + (cherry picked from commit 6a2303a6b6d97f2a6d1422e42d3d88991857618f) + + Conflicts: + src/client/Client.h + + Signed-off-by: Greg Farnum + +commit ebbd3ca66722cdc61b56d243baf8e63f7b8c1c1b +Author: Yan, Zheng +Date: Thu Oct 2 20:21:36 2014 +0800 + + fuse: enable fuse_multithreaded by default + + GETFILELOCK MDS request may block for a long time, so we need to + use multithread event loop, + + Signed-off-by: Yan, Zheng + (cherry picked from commit b17b43a8660ed0db29fbecf44798265e47712f85) + +commit 924e6f2b6b03456a8b18d8b158b8f325051f0519 +Author: Yan, Zheng +Date: Thu Oct 2 19:07:41 2014 +0800 + + client: posix file lock support + + Signed-off-by: Yan, Zheng + (cherry picked from commit a1b2c8ff955b30807ac53ce6bdc97cf61a7262ca) + + Conflicts: + src/client/Client.cc + src/client/Client.h + src/client/Inode.h + + Signed-off-by: Greg Farnum + +commit 82994946bdcb95867a61e0acf443e30b13925a34 +Author: Yan, Zheng +Date: Thu Oct 2 18:02:50 2014 +0800 + + common: link mds/flock.o to libcommon + + later commit will use this code to track file locks held by cephfs + client. + + Signed-off-by: Yan, Zheng + (cherry picked from commit e075c27c3554380c59dce0cc17ef0944eb415025) + + Conflicts: + src/mds/Makefile.am + + Signed-off-by: Greg Farnum + +commit 9ef77430f3d46789b0ba1a2afa42729627734500 +Merge: 1205867 b668566 +Author: Loic Dachary +Date: Fri Feb 13 18:34:11 2015 +0100 + + Merge pull request #3684 from ceph/wip-crush-straw-firefly + + osd: backport straw bucket fixes and all other recent crush goodness to firefly + + Reviewed-by: Loic Dachary + +commit b668566fd8148414b8074f096b85b22c42ed3af9 (refs/remotes/gh/wip-crush-straw-firefly) +Author: Sage Weil +Date: Fri Jan 16 09:02:28 2015 -0800 + + crush/builder: fix warnings + + crush/builder.c: In function 'crush_remove_list_bucket_item': + crush/builder.c:977:13: warning: comparison between signed and unsigned integer expressions [-Wsign-compare] + if (weight < bucket->h.weight) + ^ + crush/builder.c: In function 'crush_remove_tree_bucket_item': + crush/builder.c:1031:14: warning: comparison between signed and unsigned integer expressions [-Wsign-compare] + if (weight < bucket->h.weight) + ^ + + Signed-off-by: Sage Weil + (cherry picked from commit 14eb1a73c71d81b7f193fce27c59cb3babf3e74a) + +commit 247afa4efe7d11b5ef06e094680b50ea30d7d5e3 +Author: Loic Dachary +Date: Wed Oct 15 17:02:58 2014 -0700 + + crush: improve constness of CrushWrapper methods + + A number of CrushWrapper get methods or predicates were not const + because they need to maintain transparently the rmaps. Make the rmaps + mutable and update the constness of the methods to match what the caller + would expect. + + Signed-off-by: Loic Dachary + (cherry picked from commit 236895eea65f8706baa5fdef96fb00ad5b82218c) + +commit c59279a25b6c53e0ab3988b0f00ae3cce94f33d7 +Author: Xiaoxi Chen +Date: Fri Sep 5 10:56:36 2014 +0800 + + Change CrushWrapper::crush to private + + Currently in CrushWrapper, the member "struct crush_map *crush" is a public member, + so people can break the encapsulation and manipulate directly to the crush structure. 
+ + This is not a good practice for encapsulation and will lead to inconsistencies if code + mixes use of the CrushWrapper API and the crush C API. A simple example: + 1. some code uses crush_add_rule (C API) to add a rule, which will not set the have_rmap flag to false in CrushWrapper + 2. other code using CrushWrapper that tries to look up the newly added rule by name will get -ENOENT. + + This patch moves CrushWrapper::crush to private, together with the three reverse maps (type_rmap, name_rmap, rule_name_rmap), + and also changes code accessing CrushWrapper::crush to make it compile. + + Signed-off-by: Xiaoxi Chen + (cherry picked from commit d734600f9251b52f525faa35441e2b5dd660161b) + +commit 9fc2fd50aec94233528870aaa371347238a6ea75 +Author: Sage Weil +Date: Fri Jul 11 06:58:57 2014 -0700 + + crush: include CRUSH_V3, v2/v3 rules checks in dump_tunables() + + Backport: firefly + Signed-off-by: Sage Weil + (cherry picked from commit cf94cf3531a349bbd1fc6ee56c3fc260110a252a) + +commit f6009a614a3689cc9a96f55b35e70362be4ec64b +Author: Sage Weil +Date: Fri Dec 5 15:58:03 2014 -0800 + + mon: 'osd crush reweight-all' + + This corresponds to the crushtool --reweight command. + + Signed-off-by: Sage Weil + (cherry picked from commit 89b2feea8d53b9dc15ab5ae7f5920ad19c8bba18) + +commit d5d7495432dd023ed8c9ba2ac08222f06afee48f +Author: Sage Weil +Date: Fri Dec 5 15:55:24 2014 -0800 + + crush: set straw_calc_version=1 for default+optimal; do not touch for presets + + When using the presets for compatibility (i.e., based on version), do not + touch the straw behavior, as it does not affect mapping or compatibility. + However, make a point of setting it by default and for optimal. + + For most users, this means that they will not see any change unless they + explicitly enable the new behavior, or switch to default or optimal + tunables. The idea is that if they touched it, they shouldn't be + too surprised by the subsequent data movement. + + Signed-off-by: Sage Weil + (cherry picked from commit dd7b58f3b1aa1febfc6dc227937df93ee6e284eb) + + (Note: differs from original in that the hammer tunable profile is not + present) + +commit 5b7b7599123a9d7a837ded52946bd553b09b1ac7 +Author: Sage Weil +Date: Wed Dec 3 22:30:00 2014 -0800 + + crush/builder: a note about the original crush_calc_straw() + + Signed-off-by: Sage Weil + (cherry picked from commit adf5c6de0aca24a53d3c7b4e7eeb0a5dce9db0f1) + +commit 3ab835b059fd74a525cc2a8ebe8b6a1453e0cc87 +Author: Sage Weil +Date: Tue Dec 2 16:43:16 2014 -0800 + + mon: add 'osd crush {get,set}-tunable [value]' commands + + For now, just add the straw_calc_version tunable. + + Signed-off-by: Sage Weil + (cherry picked from commit 9000068ae45a8b89315c152b7d5509ac873f2957) + + Conflicts: + src/mon/OSDMonitor.cc + +commit 884414c5788bac9a269f01b26cbc0c55850c34f6 +Author: Sage Weil +Date: Tue Dec 2 16:33:11 2014 -0800 + + crush: fix crush_calc_straw() scalers when there are duplicate weights + + The straw bucket was originally tested with uniform weights and with a + few more complicated patterns, like a stair step (1,2,3,4,5,6,7,8,9). And + it worked! + + However, it does not behave with a pattern like + 1, 2, 2, 3, 3, 4, 4 + + Strangely, it does behave with + 1, 1, 2, 2, 3, 3, 4, 4 + + and more usefully it does behave with + 1, 2, 2.001, 3, 3.001, 4, 4.001 + + That is, the logic that explicitly copes with weights that are duplicates + is broken. + + The fix is to simply remove the special handling for duplicate weights -- + it isn't necessary and doesn't work correctly anyway. 
+ + Add a test that compares the mapping result of [1, 2, 2, 3, 3, ...] with + [1, 2, 2.001, 3, 3.001, ...] and verifies that the difference is small. + With the fix, we get .00012, whereas the original implementation gets + .015. + + Note that this changes the straw bucket scalar *precalculated* values that + are encoded with the map, and only when the admin opts into the new behavior. + + Backport: giant, firefly + Signed-off-by: Sage Weil + (cherry picked from commit 43d5c7caa7ce478477bde1bbd4f0649b5159cdcf) + +commit bf677093302f475a23de541471ddde6a7c7153b6 +Author: Sage Weil +Date: Tue Dec 2 14:50:21 2014 -0800 + + crush: fix distortion of straw scalers by 0-weight items + + The presence of a 0-weight item in a straw bucket should have no effect + on the placement of other items. Add a test validating that and fix + crush_calc_straw() to fix the distortion. + + Note that this affects the *precalculation* of the straw bucket inputs and + does not affect the actual mapping process given a compiled or encoded + CRUSH map, and only when straw_calc_version == 1 (i.e., the admin opted in + to the new behavior). + + Backport: giant, firefly + Signed-off-by: Sage Weil + (cherry picked from commit 85498bc8f62ca56506b33f3c5ec4fc4b111ed73d) + +commit e02574ef5d3b66e73424a16460366de2d36eded3 +Author: Sage Weil +Date: Tue Dec 2 14:49:42 2014 -0800 + + crush/builder: break out new version 1 of crush_calc_straw + + No change, yet. + + Signed-off-by: Sage Weil + (cherry picked from commit 7c1203635f2b5911f281ce3a441905df6e9bd103) + +commit c7312a47865c758e67852999803d8aa90ff809c1 +Author: Sage Weil +Date: Tue Dec 2 14:45:04 2014 -0800 + + crush: pass crush_map * to various builder methods + + In particular, we will need it for crush_calc_straw(). + + Signed-off-by: Sage Weil + (cherry picked from commit f35a3d88cb944c292e966f679ac7a8d7a1cd3093) + + Conflicts: + src/crush/CrushWrapper.cc + +commit 5137cc656548d942d0f21ba3ef28a5b7d4d21831 +Author: Rongze Zhu +Date: Fri Oct 10 19:18:00 2014 +0800 + + crush: fix incorrect use of adjust_item_weight method + + The adjust_item_weight method will adjust all buckets in which the item + appears. If osd.0 is in host=fake01 and host=fake02 and we execute + "ceph osd crush osd.0 10 host=fake01", it will adjust not only fake01's + weight but also fake02's weight. + + The patch adds an adjust_item_weightf_in_loc method and fixes the remove_item, + _remove_item_under, update_item, insert_item and detach_bucket methods. + + Signed-off-by: Rongze Zhu + (cherry picked from commit 9850227d2f0ca2f692a154de2c14a0a08e751f08) + + Conflicts: + src/crush/CrushWrapper.cc + +commit 80ec22014deb8536b9c34e8d57b286052898d05a +Author: Sage Weil +Date: Thu Nov 13 10:59:22 2014 -0800 + + crush/CrushWrapper: fix detach_bucket + + In commit 9850227d2f0ca2f692a154de2c14a0a08e751f08 we changed the call that + changed the weight of all instances of item to one that explicitly + changes it in the parent bucket, but parent_id may not be valid at the + call site. Move this into the conditional block to fix. + + Fixes: #10095 + Signed-off-by: Sage Weil + (cherry picked from commit 6f8b96a9fe4793906c74a571109a457aca7ca220) + +commit 5bc554ac5a7a4cd86b873bfc4f4313c91287b52c +Author: Sage Weil +Date: Tue Dec 2 14:10:49 2014 -0800 + + crush: default to straw_calc_version 1 + + Stick with bobtail tunables where it affects compatibility. Use v1 of + straw_calc, though, since that does not, and we want the best for new + clusters. 
+ + Signed-off-by: Sage Weil + (cherry picked from commit 9565621d6007302fdd68ba27b6aef22e487f0985) + +commit cfc718a5a1fec937cf00e6c2b55f66d4390088bb +Author: Sage Weil +Date: Fri Feb 13 08:30:35 2015 -0800 + + crush: add straw_calc_version tunable + + It doesn't do anything, yet. + + Signed-off-by: Sage Weil + (cherry picked from commit 50e2ceefee6a5dfbecbe54890139b1fa80a313c8) + + (Modified from original to not create the 'hammer' tunable profile, which + we will not backport in its entirety.) + +commit 1205867d3afe6d457483f9f51aaee79cca74e040 +Author: Josh Durgin +Date: Mon Feb 2 16:43:35 2015 +0100 + + qa: use correct binary path on rpm-based systems + + Fixes: #10715 + Signed-off-by: Josh Durgin + (cherry picked from commit 05ce2aa1bf030ea225300b48e7914577a412b38c) + +commit da4146a5e03503083e6bc5c12984c06f41a3b4be +Author: Greg Farnum +Date: Thu Feb 5 21:12:17 2015 -0800 + + fsync-tester: print info about PATH and locations of lsof lookup + + We're seeing the lsof invocation fail (as not found) in testing and nobody can + identify why. Since attempting to reproduce the issue has not worked, this + patch will gather data from a genuinely in-vitro location. + + Signed-off-by: Greg Farnum + (cherry picked from commit a85051483874ff5b8b0fb50426a3577040457596) + +commit 61b4f09848796faeacf7eb78dd6dc06513beb737 +Author: Sage Weil +Date: Tue Dec 2 14:04:34 2014 -0800 + + crush/CrushWrapper: dump chooseleaf_vary_r with other tunables + + Signed-off-by: Sage Weil + (cherry picked from commit c133a83fe7b77e2a7e7c711cb8ab943c59ff0885) + +commit e4939ed5535cc4678cf1d1ae80906290448e6590 +Author: Sage Weil +Date: Tue Dec 2 10:12:25 2014 -0800 + + crush/CrushTester: add new --show-mappings option + + This makes --show-utilization and --show-statistics usable. + + Signed-off-by: Sage Weil + (cherry picked from commit 3877f00877f53e9f86630c2d89e81ab9cff729ec) + +commit 8c48ebbf13e6bdb689b4b8ae58ac811653ad2acc +Author: Sage Weil +Date: Tue Dec 2 10:11:57 2014 -0800 + + crushtool/CrushTester: output utilization even with 1 batch + + Signed-off-by: Sage Weil + (cherry picked from commit 294b06c6424f4cb69394976add826d9725073b50) + +commit e9faab9f296af9cf26b9475afd3562c3f3b9236b +Author: Sage Weil +Date: Tue Dec 2 10:08:18 2014 -0800 + + crush: recalculate straw scalers during a reweight + + The crushtool --reweight function triggers a fresh calculation of bucket + weights so that they are always the sum of the item weights. In the + straw bucket case, the weights were updated but the corresponding straw + scalers were not being recalculated. The result is that there was no + effect on placement in adjusted buckets until the next time a bucket item's + weight was adjusted. + + Backport: giant, firefly + Signed-off-by: Sage Weil + (cherry picked from commit 35062937f174a45fb13d9c177eaa1fe4ed5ff4c2) + +commit 4c951e9dc1ca31429119de15755caf3d73f6ffce +Author: Sage Weil +Date: Tue Dec 2 08:36:41 2014 -0800 + + osdmaptool: --test-map-pgs-dump + + Signed-off-by: Sage Weil + (cherry picked from commit 7a99b489909b368bade36d8cc9722ad33d03a2e8) + +commit 3cb5d08feee4811624dd495f74912824203edf00 +Author: Sage Weil +Date: Tue Dec 2 06:53:20 2014 -0800 + + crush: add dprintk's for crush_calc_straw + + These are compiled out by default. 
+ + Signed-off-by: Sage Weil + (cherry picked from commit 946bebd94b109642f95710802e892c59eb4f0b76) + +commit ca8471d65fb2cf7d2247d823c5436faa273efda5 +Author: Rongze Zhu +Date: Tue Nov 11 00:13:42 2014 +0800 + + crush: fix tree bucket functions + + Nodes' weights in a tree bucket are incorrect when the tree + bucket is constructed, and the tree bucket doesn't store item ids in its items array, so the tree + bucket will not work correctly. The patch fixes the above bugs and adds a + simple test for tree buckets. + + Signed-off-by: Rongze Zhu + (cherry picked from commit 13425488882d360fa740613dfcfd0d098c1b7616) + +commit af502f25b04cd0758b753941ecf0b1b59d33ea9e +Author: Sage Weil +Date: Tue Nov 11 11:16:58 2014 -0800 + + crush/builder: replace printf with an empty dprintk macro + + This mirrors mapper.c. + + Signed-off-by: Sage Weil + (cherry picked from commit e444b221733360cdea4f4bbdbbbfbf2b6ee30ff5) + +commit 26966c4aa9cd79cb53db052553a5fc5653f2591b +Author: Greg Farnum +Date: Thu Nov 6 17:48:01 2014 -0800 + + qa: use sudo even more when rsyncing /usr + + Signed-off-by: Greg Farnum + (cherry picked from commit 3aa7797741f9cff06053a2f31550fe6929039692) + +commit 6af48421c0ba6195e9a1607053c42eeb62b14ccb +Author: Greg Farnum +Date: Tue Oct 21 10:55:06 2014 -0700 + + qa: use sudo when rsyncing /usr so we can read everything + + Signed-off-by: Greg Farnum + (cherry picked from commit fa07c04231db2d130de54647957ffab4a7a53733) + +commit 861a18b1f7dd5bdcbb85bc3fa15d1719bb4b2813 +Author: Yehuda Sadeh +Date: Thu Jan 15 16:31:22 2015 -0800 + + rgw: fix partial GET in swift + + Fixes: #10553 + backport: firefly, giant + + Don't set the ret code to reflect partial download, just set the + response status when needed. + + Signed-off-by: Yehuda Sadeh + (cherry picked from commit 7e1553cedff90fa0fefded65cde87ad068eb5f0c) + +commit e539971e2d528b4de6009ea44565f037acb2be66 (refs/remotes/gh/wip-10676) +Author: Nilamdyuti Goswami +Date: Thu Dec 18 17:13:27 2014 +0530 + + doc: Adds updated man page for ceph under man/ + + Signed-off-by: Nilamdyuti Goswami + (cherry picked from commit 8de9a0f437822c770600c19a9f61977745b7e530) + +commit 15596ffe388147b7984457041a38cbb9f472556c +Author: Nilamdyuti Goswami +Date: Sat Dec 13 02:27:45 2014 +0530 + + doc: Adds man page for ceph under man/. + + Signed-off-by: Nilamdyuti Goswami + (cherry picked from commit ffd6c7e49686f8f92ddb400ffdec62520708e64b) + +commit 893e5cd1f4fd5ea77d65f0e78cc8cff88eb19c1a +Author: Nilamdyuti Goswami +Date: Sat Dec 13 02:24:41 2014 +0530 + + doc: Adds man page for ceph. + + Signed-off-by: Nilamdyuti Goswami + (cherry picked from commit 76da87a64ca6b3cc0ceeaf63e19a9f440d6f4161) + +commit 8d29a4a231666830914903b95599d80da7b97def +Author: Sage Weil +Date: Mon Dec 15 17:04:32 2014 -0800 + + osd: handle no-op write with snapshot case + + If we have a transaction that does something to the object but it !exists + both before and after, we will continue through the write path. If the + snapdir object already exists, and we try to create it again, we will + leak a snapdir obc and lock and later crash on an assert when the obc + is destroyed: + + 0> 2014-12-06 01:49:51.750163 7f08d6ade700 -1 osd/osd_types.h: In function 'ObjectContext::~ObjectContext()' thread 7f08d6ade700 time 2014-12-06 01:49:51.605411 + osd/osd_types.h: 2944: FAILED assert(rwstate.empty()) + + The fix is to not recreate the snapdir if it already exists. 
+ + Fixes: #10262 + Signed-off-by: Sage Weil + (cherry picked from commit 02fae9fc54c10b5a932102bac43f32199d4cb612) + +commit 8ba48d10c252b28cde5b4da1286421db12b57cc2 +Author: Sage Weil +Date: Mon Jan 19 18:28:20 2015 -0800 + + ceph_test_rados_api_misc: do not assert rbd feature match + + This test fails on upgrades when we (or the server) have new + features. Make it less fragile. + + Fixes: #10576 + Signed-off-by: Sage Weil + (cherry picked from commit 9147c62989871cea8b3a85b02c53017825efb55b) + +commit 836ab86d89ae15ed5b228ff656bc81c7cc3495aa +Author: Jason Dillaman +Date: Mon Oct 27 14:47:19 2014 -0400 + + osdc: Constrain max number of in-flight read requests + + Constrain the number of in-flight RADOS read requests to the + cache size. This reduces the chance of the cache memory + ballooning during certain scenarios like copy-up which can + invoke many concurrent read requests. + + Fixes: #9854 + Backport: giant, firefly, dumpling + Signed-off-by: Jason Dillaman + (cherry picked from commit 068d68850d09dfcaccc5a3ce85c80b2f6d808ea9) + +commit 2964efaf327cf798f892a6722eb4e24f2ffa0fde +Author: Jason Dillaman +Date: Mon Jan 19 12:46:00 2015 -0500 + + Revert "Enforce cache size on read requests" + + This reverts commit b7784dc1baa47560a733fe9dcd2acec51bc93165. + +commit e4c5b153735aba3f84554a50243f779f36568373 +Author: Jason Dillaman +Date: Mon Jan 19 12:45:25 2015 -0500 + + Revert "rbd: ObjectCacher reads can hang when reading sparse files" + + This reverts commit 711a7e6f81983ff2091caa0f232af914a04a041c. + +commit c23e42e78ea3ba44706951a728e2ccb74cac7b33 +Author: Jason Dillaman +Date: Mon Jan 19 10:28:56 2015 -0500 + + librbd: gracefully handle deleted/renamed pools + + snap_unprotect and list_children both attempt to scan all + pools. If a pool is deleted or renamed during the scan, + the methods would previously return -ENOENT. Both methods + have been modified to more gracefully handle this condition. + + Fixes: #10270 + Backport: giant, firefly + Signed-off-by: Jason Dillaman + (cherry picked from commit 436923c68b77c900b7774fbef918c0d6e1614a36) + +commit 24c13d87039d4f61df0bcabdb8862e0e94fe575d +Author: Yehuda Sadeh +Date: Fri Dec 12 05:24:01 2014 -0800 + + rgw: change multipart upload id magic + + Fixes: #10271 + Backport: firefly, giant + + Some clients can't sign requests correctly with the original magic + prefix. + + Reported-by: Georgios Dimitrakakis + Signed-off-by: Yehuda Sadeh + (cherry picked from commit 5fc7a0be67a03ed63fcc8408f8d71a31a1841076) + +commit 617002d3ff469ef409a83e35d4f4fd6a0b5b1278 +Author: Yehuda Sadeh +Date: Thu Dec 11 09:07:10 2014 -0800 + + rgw: url decode http query params correctly + + Fixes: #10271 + Backport: firefly + + This got broken by the fix for #8702. Since we now only url_decode if + we're in query, we need to specify that we're in query when decoding + these args. + + Reported-by: Georgios Dimitrakakis + Signed-off-by: Yehuda Sadeh + (cherry picked from commit 21e07eb6abacb085f81b65acd706b46af29ffc03) + +commit d7ccf71d962cec1571f53c9392f9b58350569062 +Author: Josh Durgin +Date: Wed Jan 14 15:01:38 2015 -0800 + + qa: ignore duplicates in rados ls + + These can happen with split or with state changes due to reordering + results within the hash range requested. It's easy enough to filter + them out at this stage. 
+ + Backport: giant, firefly + Signed-off-by: Josh Durgin + (cherry picked from commit e7cc6117adf653a4915fb7a75fac68f8fa0239ec) + +commit aef69572588a0dfad58df94cb0d0980d0590d8e4 +Merge: 69eaad7 534624b +Author: Yehuda Sadeh +Date: Thu Jan 15 16:40:48 2015 -0800 + + Merge branch 'wip-firefly-rgw-backports' into firefly + +commit 534624b7e9decc880e88496355a6fbbe008ede5f +Author: Sage Weil +Date: Tue Oct 21 17:59:30 2014 -0700 + + init-radosgw.sysv: set ulimit -n before starting daemon + + If we do the ulimit inside the daemon command we will have already + dropped privs and will fail. + + Fixes: #9587 + Backport: giant, firefly + Signed-off-by: Sage Weil + (cherry picked from commit 9803cedf54a7baff45ccd0e0f65d2bc220958a46) + +commit fd49cbc535d0d7fa64ebfa458386b47c8ec8616e +Author: Yehuda Sadeh +Date: Fri Dec 12 17:07:30 2014 -0800 + + rgw: use s->bucket_attrs instead of trying to read obj attrs + + Fixes: #10307 + Backport: firefly, giant + + This is needed, since we can't really read the bucket attrs by trying to + read the bucket entry point attrs. We already have the bucket attrs + anyway, use these. + + Signed-off-by: Yehuda Sadeh + (cherry picked from commit 5cf193c8686196d5235889e68cb5ea8f1fc8e556) + +commit 79bfffb55a0b3a302368e34417d62f74b82dc224 +Author: Yehuda Sadeh +Date: Wed Nov 5 13:40:55 2014 -0800 + + rgw: remove swift user manifest (DLO) hash calculation + + Fixes: #9973 + Backport: firefly, giant + + Previously we were iterating through the parts, creating a hash of the + parts' etags (as S3 does for multipart uploads). However, swift just + calculates the etag for the empty manifest object. + + Signed-off-by: Yehuda Sadeh + (cherry picked from commit ef6d3ad964d34bc526dc4435486bd5c8cdc3b230) + + Conflicts: + src/rgw/rgw_op.cc + +commit ac799f0834783590cbb6eb91784c8e0753cb1e03 +Author: Lei Dong +Date: Mon Oct 27 10:29:48 2014 +0800 + + fix can not disable max_size quota + + Currently if we enable quota and set max_size = -1, it doesn’t + mean max_size is unlimited as expected. Instead, it means an object + of any size is not allowed to be uploaded because of “QuotaExceeded”. + The root cause is that the function rgw_rounded_kb, which converts max_size + to max_size_kb, returns 0 for -1 because it takes an unsigned int + but we pass an int to it. A simple fix is to check max_size before + it’s rounded to max_size_kb. + + Test case: + 1 enable and set quota: + radosgw-admin quota enable --uid={user_id} --quota-scope=user + radosgw-admin quota set --quota-scope=user --uid={user_id}\ + --max-objects=100 --max-size=-1 + 2 upload any object with non-zero length + it will return 403 with “QuotaExceeded” and return 200 if you apply the fix. + + Fixes: #9907 + Backport: giant, firefly + Signed-off-by: Dong Lei leidong@yahoo-inc.com + (cherry picked from commit abd3fd3ef9ee9999b99811937af60b7a5e673e35) + +commit 30963fdc98f5650a68b5737729920d43459b5899 +Author: Yehuda Sadeh +Date: Tue Sep 23 12:43:55 2014 -0700 + + rgw: rados->set_attrs() updates bucket index + + Fixes: #5595 + Backport: dumpling, firefly + We need to update the bucket index when updating object attrs, otherwise + we're missing meta changes that need to be registered. It also + solves the issue of the bucket index not knowing about object acl changes, + although this one still requires some more work. 
+ + Signed-off-by: Yehuda Sadeh + (cherry picked from commit f833f12a200ecc2c4f74ddb443d6fa61b7ad14db) + +commit 31f5e332c616dbb7f7338af3ab37ac65ff66f733 +Author: Yehuda Sadeh +Date: Tue Nov 4 22:05:03 2014 -0800 + + rgw: RGWRados::get_obj() returns wrong len if len == 0 + + Fixes: #9877 + We only updated if len was > 0, should update it if r >= 0. This was the + culprit for issue #9877. + Backport: giant, firefly + + Signed-off-by: Yehuda Sadeh + (cherry picked from commit fe7bf06366adaf787816d1e68f5e3f68e8c91134) + +commit 8682d1b15e097ececd927dfbd096dbb46403afca +Author: Yehuda Sadeh +Date: Wed Nov 5 13:28:02 2014 -0800 + + rgw: send back ETag on S3 object copy + + Fixes: #9479 + Backport: firefly, giant + We didn't send the etag back correctly. Original code assumed the etag + resided in the attrs, but attrs only contained request attrs. + + Signed-off-by: Yehuda Sadeh + (cherry picked from commit b1bfc3a7e0c9088f01f8ff770ae14f569fbc570d) + + Conflicts: + src/rgw/rgw_rados.cc + +commit 2d61fc76e97e5e28cf24cc6e341d49a8fe47059c +Author: Yehuda Sadeh +Date: Wed Nov 5 12:35:42 2014 -0800 + + rgw: S3 object copy content type fix + + Fixes: #9478 + Backport: firefly, giant + Content type for S3 object copy response should be set to + application/xml. + + Signed-off-by: Yehuda Sadeh + (cherry picked from commit 31963459a0a869c4d32f55baa629f36df33eaa90) + +commit 011a8c3bd621337e5a1746f18882be20d3854b14 +Author: Yehuda Sadeh +Date: Fri Dec 12 11:20:26 2014 -0800 + + rgw: http headers need to end with \r\n + + Fixes: #9254 + Backport: firefly, giant + + Reported-by: Benedikt Fraunhofer + Signed-off-by: Yehuda Sadeh + (cherry picked from commit 7409ab3df18fb312dd6c9f79084f889c523afdce) + + Conflicts: + src/rgw/rgw_civetweb.cc + src/rgw/rgw_fcgi.cc + +commit 71d119f2952716aa1f75817e1daaf5fb67ecde94 +Author: Yehuda Sadeh +Date: Wed Jan 7 13:56:14 2015 -0800 + + rgw: index swift keys appropriately + + Fixes: #10471 + Backport: firefly, giant + + We need to index the swift keys by the full uid:subuser when decoding + the json representation, to keep it in line with how we store it when + creating it through other mechanism. + + Reported-by: hemant burman + Signed-off-by: Yehuda Sadeh + (cherry picked from commit 478629bd2f3f32afbe6e93eaebb8a8fa01af356f) + +commit 7b89a827a063acf78a848a34c1c3298db213950f +Author: Yehuda Sadeh +Date: Thu Nov 20 10:36:05 2014 -0800 + + rgw-admin: create subuser if needed when creating user + + Fixes: #10103 + Backport: firefly, giant + This turned up after fixing #9973. Earlier we also didn't create the + subuser in this case, but we didn't really read the subuser info when it + was authenticating. Now we do that as required, so we end up failing the + authentication. This only applies to cases where a subuser was created + using 'user create', and not the 'subuser create' command. + + Reviewed-by: Sage Weil + Signed-off-by: Yehuda Sadeh + (cherry picked from commit 9ba17a321db06d3d76c9295e411c76842194b25c) + +commit 9ee29b5355e9ffeac76707e8d4070bfff5dc99d5 +Author: Yehuda Sadeh +Date: Wed Nov 5 14:38:46 2014 -0800 + + rgw: update swift subuser perm masks when authenticating + + Fixes: #9918 + Backport: firefly, giant + It seems that we weren't setting the swift perm mask correctly. 
+ + Signed-off-by: Yehuda Sadeh + (cherry picked from commit 5d9f36f757a7272c24d2c9adc31db1ed5e712992) + +commit d298fdda2aa177e1a4b43ca94a2292435a50dea4 +Author: Yehuda Sadeh +Date: Thu Oct 23 17:39:42 2014 -0700 + + rgw: send http status reason explicitly in fastcgi + + There are issues in certain versions of apache 2.4, where the reason is + not sent back. Instead, just provide the reason explicitly. + + Backport: firefly, giant + + Signed-off-by: Yehuda Sadeh + (cherry picked from commit a9dd4af401328e8f9071dee52470a0685ceb296b) diff --git a/doc/changelog/v0.87.1.txt b/doc/changelog/v0.87.1.txt new file mode 100644 index 0000000000000..cf0eecac66115 --- /dev/null +++ b/doc/changelog/v0.87.1.txt @@ -0,0 +1,2151 @@ +commit 283c2e7cfa2457799f534744d7d549f83ea1335e (tag: refs/tags/v0.87.1, refs/remotes/gh/giant) +Author: Jenkins +Date: Mon Feb 23 12:02:04 2015 -0800 + + 0.87.1 + +commit 4178e32dd085adeead84fb168ab8a8a121256259 +Merge: ccb0914 734e9af +Author: Loic Dachary +Date: Tue Feb 17 01:09:54 2015 +0100 + + Merge pull request #3731 from liewegas/wip-10834-giant + + osd: tolerate sessionless con in fast dispatch path + + Reviewed-by: Loic Dachary + +commit 734e9af5df4ae419ded108f5036bee068a9bc2b2 +Author: Sage Weil +Date: Mon Dec 1 18:15:59 2014 -0800 + + osd: tolerate sessionless con in fast dispatch path + + We can now get a session cleared from a Connection at any time. Change + the assert to an if in ms_fast_dispatch to cope. It's pretty rare, but it + can happen, especially with delay injection. In particular, a racing + thread can call mark_down() on us. + + Fixes: #10209 + Backport: giant + Signed-off-by: Sage Weil + (cherry picked from commit 01df2227125abf94571b4b0c7bccca57098ed2dc) + +commit ccb0914f76da23acdd7374233cd1939ab80ef3c8 +Author: Josh Durgin +Date: Mon Feb 2 16:43:35 2015 +0100 + + qa: use correct binary path on rpm-based systems + + Fixes: #10715 + Signed-off-by: Josh Durgin + (cherry picked from commit 05ce2aa1bf030ea225300b48e7914577a412b38c) + +commit 78c71b9200da5e7d832ec58765478404d31ae6b5 +Merge: 222aa22 91515e7 +Author: Loic Dachary +Date: Wed Feb 11 00:11:57 2015 +0100 + + Merge pull request #3407 from ceph/wip-9854-giant + + osdc: Constrain max number of in-flight read requests + +commit 222aa22ebc0ccb1b04156e0c9d05f4e4733ec290 +Merge: b9ff170 a5cb39c +Author: Loic Dachary +Date: Tue Feb 10 22:01:21 2015 +0100 + + Merge pull request #3568 from dachary/wip-10471-rgw-giant + + rgw: use s->bucket_attrs instead of trying to read obj attrs + +commit b9ff1708ad85ca5aeb10b4202bcbe197251e3bd8 +Merge: 34103b6 b1e4882 +Author: Loic Dachary +Date: Tue Feb 10 21:59:40 2015 +0100 + + Merge pull request #3263 from dachary/wip-jerasure-giant + + erasure-code: update links to jerasure upstream (giant) + +commit 34103b6355881820aa10b354c2427654bf229e8f +Merge: 94889cf d125743 +Author: Loic Dachary +Date: Tue Feb 10 21:59:17 2015 +0100 + + Merge pull request #3191 from ceph/giant-10277 + + Giant 10277 + +commit 94889cf6bef5a542e51bf8434dbe7c68f64604ce +Merge: d7b10d8 d28c8e0 +Author: Loic Dachary +Date: Tue Feb 10 21:58:52 2015 +0100 + + Merge pull request #3186 from ceph/wip-giant-mon-backports + + mon: backports for #9987 against giant + +commit d7b10d897e17bc3fa690c8484ad2d6f233896237 +Merge: 11f7d06 16c6d0d +Author: Loic Dachary +Date: Tue Feb 10 21:58:29 2015 +0100 + + Merge pull request #3185 from dachary/wip-10325-cauchy-giant + + erasure-code: relax cauchy w restrictions (giant) + +commit 11f7d064e5d93bc0ed8896750344c6cf6b37aeab +Merge: 975be75 636b98f +Author: 
Loic Dachary +Date: Tue Feb 10 21:58:05 2015 +0100 + + Merge pull request #3178 from dachary/wip-9998-crush-underfloat-giant + + crush: fix weight underfloat issue (giant) + +commit 975be75f4bcea88b232ea76087b49e288d7c29f7 +Merge: 51fe79d d759e71 +Author: Loic Dachary +Date: Tue Feb 10 21:57:50 2015 +0100 + + Merge pull request #3579 from dachary/wip-9877-rgw-giant + + rgw: RGWRados::get_obj() returns wrong len if len == 0 + +commit 51fe79d9e63c7df4da547a0ba7a12aa9c6cd7ab2 +Merge: fca9ead 319f9c9 +Author: Loic Dachary +Date: Tue Feb 10 21:57:18 2015 +0100 + + Merge pull request #3168 from ceph/wip-8797-giant + + Wip 8797 giant + +commit fca9eadaf3fcef77e269d4936d4eea86ab6c3faf +Merge: 317532b 9886620 +Author: Loic Dachary +Date: Tue Feb 10 21:56:53 2015 +0100 + + Merge pull request #3582 from dachary/wip-10062-rgw-giant + + rgw: s3 keystone auth fixes + +commit 317532b70383762f473a910e043c889574eb6087 +Merge: 3e8f3e3 debc0c5 +Author: Loic Dachary +Date: Tue Feb 10 21:56:37 2015 +0100 + + Merge pull request #3581 from dachary/wip-9918-rgw-giant + + rgw: update swift subuser perm masks when authenticating + +commit 3e8f3e38af76fa2cba86aedf962d3230d7979f63 +Merge: 1d77591 76f9de3 +Author: Loic Dachary +Date: Tue Feb 10 21:56:17 2015 +0100 + + Merge pull request #3580 from dachary/wip-9907-rgw-giant + + fix can not disable max_size quota + +commit 1d7759149697242192be05decf7ffafb17b24cbe +Merge: b016863 ad04a67 +Author: Loic Dachary +Date: Tue Feb 10 21:55:56 2015 +0100 + + Merge pull request #3083 from dachary/wip-10211-erasure-code-buffer-alignement-giant + + erasure-code: enforce chunk size alignment (giant) + +commit b016863ad243388e7571da9ffca3013c8f99237a +Merge: bdcc9dc d21f4e3 +Author: Loic Dachary +Date: Tue Feb 10 21:55:23 2015 +0100 + + Merge pull request #3577 from dachary/wip-9587-rgw-giant + + init-radosgw.sysv: set ulimit -n before starting daemon + +commit bdcc9dcb8586c91b432c7087e33a2b52ef467b54 +Merge: df475f9 7b5f746 +Author: Loic Dachary +Date: Tue Feb 10 21:54:58 2015 +0100 + + Merge pull request #3576 from dachary/wip-5595-9576-rgw-giant + + update object content-length doesn't work correctly + +commit df475f92a41e3bd5a022335b2c9023ad40c3b47b +Merge: db7adf8 b2f6f7f +Author: Loic Dachary +Date: Tue Feb 10 21:54:13 2015 +0100 + + Merge pull request #3575 from dachary/wip-9479-rgw-giant + + rgw: send back ETag on S3 object copy + +commit db7adf8d8ca225fea2d0277ced614e936df086c9 +Merge: 9b50db9 67ba4d3 +Author: Loic Dachary +Date: Tue Feb 10 21:53:55 2015 +0100 + + Merge pull request #3574 from dachary/wip-9478-rgw-giant + + rgw: S3 object copy content type fix + +commit 9b50db97a9552841ed143588e2f63bab56d0aecb +Merge: 583fe31 84e9b6c +Author: Loic Dachary +Date: Tue Feb 10 21:53:33 2015 +0100 + + Merge pull request #3573 from dachary/wip-9254-rgw-giant + + rgw: http headers need to end with \r\n + +commit 583fe31681c4eea8b85b413674074445e2b424a6 +Merge: 5240db5 1cb0955 +Author: Loic Dachary +Date: Tue Feb 10 21:53:15 2015 +0100 + + Merge pull request #3572 from dachary/wip-9973-rgw-giant + + rgw: remove swift user manifest (DLO) hash calculation + +commit 5240db588e9017dd8a487b7a9ee16f171fdda1ff +Merge: 62e1552 e230fab +Author: Loic Dachary +Date: Tue Feb 10 21:52:54 2015 +0100 + + Merge pull request #3571 from dachary/wip-8911-rgw-giant + + rgw: swift GET / HEAD object returns X-Timestamp field + +commit 62e15528dda20b5419e39744fa9e0c9c4cae053c +Merge: 16cd892 c24fab3 +Author: Loic Dachary +Date: Tue Feb 10 21:52:36 2015 +0100 + + Merge pull request #3570 from 
dachary/wip-10701-rgw-giant + + rgw: use strict_strtoll() for content length + +commit 16cd892aab4ffb1dc15b93a4101d9bc209591c94 +Merge: 028904c 6aef29e +Author: Loic Dachary +Date: Tue Feb 10 21:52:00 2015 +0100 + + Merge pull request #3569 from dachary/wip-10103-rgw-giant + + rgw-admin: create subuser if needed when creating user + +commit 028904cf7c36a1d5342cf29c115bc0437e9b2d74 +Merge: 520dcf8 425ee8a +Author: Loic Dachary +Date: Tue Feb 10 21:51:06 2015 +0100 + + Merge pull request #3567 from dachary/wip-10307-rgw-giant + + rgw: use s->bucket_attrs instead of trying to read obj attrs + +commit 520dcf8624eeafd694115b382616be83f9b344d3 +Merge: cae1de2 14cdb9b +Author: Loic Dachary +Date: Tue Feb 10 21:50:34 2015 +0100 + + Merge pull request #3443 from ceph/wip-10590-giant + + rbd: ensure aio_write buffer isn't invalidated during image import + +commit cae1de29922c2183eff021c6fe2b921a87b5f5b2 +Merge: b346ad3 83a0a2e +Author: Loic Dachary +Date: Tue Feb 10 21:50:14 2015 +0100 + + Merge pull request #3557 from dachary/wip-10688-boost-157-giant + + support Boost 1.57.0 + +commit b346ad37a84b7cfd0bae84528f2533a8cc4a8e3d +Merge: aacd51c 13bb880 +Author: Loic Dachary +Date: Tue Feb 10 21:49:47 2015 +0100 + + Merge pull request #2954 from sponce/giant + + Fixed trivial locking issue in the trunc method of libradosstriper - Giant branch + +commit aacd51c74c102d44982421b9bc384d12fc160e3c +Merge: 1d97c7c 081f49b +Author: Loic Dachary +Date: Tue Feb 10 21:49:20 2015 +0100 + + Merge pull request #3405 from ceph/wip-10299-giant + + librbd: complete all pending aio ops prior to closing image + +commit 1d97c7c9a3087e7bc98774d9fe2882bdc4a84531 +Merge: 53dec0e 436923c +Author: Loic Dachary +Date: Tue Feb 10 21:48:49 2015 +0100 + + Merge pull request #3403 from ceph/wip-10270-giant + + librbd: gracefully handle deleted/renamed pools + +commit 53dec0eeee60b315e88acb4ba05666857ae3e0eb +Merge: df8285c 1261bf2 +Author: Loic Dachary +Date: Tue Feb 10 21:48:20 2015 +0100 + + Merge pull request #3356 from liewegas/wip-msgr-giant + + msgr: fast dispatch backports for giant + +commit df8285c5e5b14d9a4cd42fb9be8e18fe6cdf6f83 +Author: Greg Farnum +Date: Thu Feb 5 21:12:17 2015 -0800 + + fsync-tester: print info about PATH and locations of lsof lookup + + We're seeing the lsof invocation fail (as not found) in testing and nobody can + identify why. Since attempting to reproduce the issue has not worked, this + patch will gather data from a genuinely in-vitro location. + + Signed-off-by: Greg Farnum + (cherry picked from commit a85051483874ff5b8b0fb50426a3577040457596) + +commit 91515e750bfe2453ce8ac9ec568b0e314823dd82 +Author: Jason Dillaman +Date: Mon Oct 27 14:47:19 2014 -0400 + + osdc: Constrain max number of in-flight read requests + + Constrain the number of in-flight RADOS read requests to the + cache size. This reduces the chance of the cache memory + ballooning during certain scenarios like copy-up which can + invoke many concurrent read requests. + + Fixes: #9854 + Backport: giant, firefly, dumpling + Signed-off-by: Jason Dillaman + +commit 98866208c64348ca885335d95a1c737071a17004 +Author: Abhishek Lekshmanan +Date: Mon Dec 22 19:57:19 2014 +0530 + + rgw: check keystone auth also for s3 post requests + + This patch adds keystone auth for s3 post requests, once a user fails in + cephx authentication, they are checked for keystone if configured. 
+ + Fixes #10062 + Signed-off-by: Abhishek Lekshmanan + + (cherry picked from commit 8b3dfc9472022ea45ad24e02e0aa21dfdad798f8) + +commit 4e4372b8e551bb1b974f08dc69f5b27bdd22bb4b +Author: Abhishek Lekshmanan +Date: Mon Nov 17 17:37:00 2014 +0530 + + rgw: check for timestamp for s3 keystone auth + + This commit ensures that we check that the timestamp of an s3 request is within + the acceptable grace time of radosgw. + Addresses some failures in #10062 + Fixes: #10062 + Signed-off-by: Abhishek Lekshmanan + + (cherry picked from commit 4b35ae067fef9f97b886afe112d662c61c564365) + +commit debc0c593fb7401d07a34f7916380092ad7285f9 +Author: Yehuda Sadeh +Date: Wed Nov 5 14:38:46 2014 -0800 + + rgw: update swift subuser perm masks when authenticating + + Fixes: #9918 + Backport: firefly, giant + It seems that we weren't setting the swift perm mask correctly. + + Signed-off-by: Yehuda Sadeh + (cherry picked from commit 5d9f36f757a7272c24d2c9adc31db1ed5e712992) + +commit 76f9de330deaa4fafb86d3f6f2754f0d419306b1 +Author: Lei Dong +Date: Mon Oct 27 10:29:48 2014 +0800 + + fix can not disable max_size quota + + Currently if we enable quota and set max_size = -1, it doesn’t + mean max_size is unlimited as expected. Instead, it means an object + of any size is not allowed to be uploaded because of “QuotaExceeded”. + The root cause is that the function rgw_rounded_kb, which converts max_size + to max_size_kb, returns 0 for -1 because it takes an unsigned int + but we pass an int to it. A simple fix is to check max_size before + it’s rounded to max_size_kb. + + Test case: + 1 enable and set quota: + radosgw-admin quota enable --uid={user_id} --quota-scope=user + radosgw-admin quota set --quota-scope=user --uid={user_id}\ + --max-objects=100 --max-size=-1 + 2 upload any object with non-zero length + it will return 403 with “QuotaExceeded” and return 200 if you apply the fix. + + Fixes: #9907 + Backport: giant, firefly + Signed-off-by: Dong Lei leidong@yahoo-inc.com + (cherry picked from commit abd3fd3ef9ee9999b99811937af60b7a5e673e35) + +commit d759e71c8167ea29c8fda9483039a3e491083da5 +Author: Yehuda Sadeh +Date: Tue Nov 4 22:05:03 2014 -0800 + + rgw: RGWRados::get_obj() returns wrong len if len == 0 + + Fixes: #9877 + We only updated if len was > 0, should update it if r >= 0. This was the + culprit for issue #9877. + Backport: giant, firefly + + Signed-off-by: Yehuda Sadeh + (cherry picked from commit fe7bf06366adaf787816d1e68f5e3f68e8c91134) + +commit d21f4e326eb4821cc9bd38a1b62a0210272277d4 +Author: Sage Weil +Date: Tue Oct 21 17:59:30 2014 -0700 + + init-radosgw.sysv: set ulimit -n before starting daemon + + If we do the ulimit inside the daemon command we will have already + dropped privs and will fail. + + Fixes: #9587 + Backport: giant, firefly + Signed-off-by: Sage Weil + (cherry picked from commit 9803cedf54a7baff45ccd0e0f65d2bc220958a46) + +commit 7b5f746d2f97c7139f9c31962c107a074bfd1863 +Author: Yehuda Sadeh +Date: Tue Sep 23 12:42:10 2014 -0700 + + rgw: PutObjMetadata, clear bufferlist before appending into it + + Fixes: #9576 + Backport: firefly, dumpling + + We need to completely rewrite the bufferlist, not append into it. 
+ + Signed-off-by: Yehuda Sadeh + (cherry picked from commit 44cfd88dc65d30f4295743c5834768bb13f7b805) + +commit e24f27b7b2e2aeb84b14788e8bf2757ecdf8f0c0 +Author: Yehuda Sadeh +Date: Tue Sep 23 12:43:55 2014 -0700 + + rgw: rados->set_attrs() updates bucket index + + Fixes: #5595 + Backport: dumpling, firefly + We need to update the bucket index when updating object attrs, otherwise + we're missing meta changes that need to be registered. It also + solves issue of bucket index not knowing about object acl changes, + although this one still requires some more work. + + Signed-off-by: Yehuda Sadeh + (cherry picked from commit f833f12a200ecc2c4f74ddb443d6fa61b7ad14db) + +commit b2f6f7f6205682aeb09c0785e373ddf5d89c6d04 +Author: Yehuda Sadeh +Date: Wed Nov 5 13:28:02 2014 -0800 + + rgw: send back ETag on S3 object copy + + Fixes: #9479 + Backport: firefly, giant + We didn't send the etag back correctly. Original code assumed the etag + resided in the attrs, but attrs only contained request attrs. + + Signed-off-by: Yehuda Sadeh + (cherry picked from commit b1bfc3a7e0c9088f01f8ff770ae14f569fbc570d) + +commit 67ba4d3444f0f64dae6286be28276ba85376ecf6 +Author: Yehuda Sadeh +Date: Wed Nov 5 12:35:42 2014 -0800 + + rgw: S3 object copy content type fix + + Fixes: #9478 + Backport: firefly, giant + Content type for S3 object copy response should be set to + application/xml. + + Signed-off-by: Yehuda Sadeh + (cherry picked from commit 31963459a0a869c4d32f55baa629f36df33eaa90) + +commit 84e9b6c32c6fdc38e2c64f3360c185332e691bf4 +Author: Yehuda Sadeh +Date: Fri Dec 12 11:20:26 2014 -0800 + + rgw: http headers need to end with \r\n + + Fixes: #9254 + Backport: firefly, giant + + Reported-by: Benedikt Fraunhofer + Signed-off-by: Yehuda Sadeh + (cherry picked from commit 7409ab3df18fb312dd6c9f79084f889c523afdce) + +commit 1cb09555d3fdb568296797cd83eb5557552f056c +Author: Yehuda Sadeh +Date: Wed Nov 5 13:40:55 2014 -0800 + + rgw: remove swift user manifest (DLO) hash calculation + + Fixes: #9973 + Backport: firefly, giant + + Previously we were iterating through the parts, creating hash of the + parts etags (as S3 does for multipart uploads). However, swift just + calculates the etag for the empty manifest object. + + Signed-off-by: Yehuda Sadeh + (cherry picked from commit ef6d3ad964d34bc526dc4435486bd5c8cdc3b230) + +commit e230fabf29c4660594d19027af49810e57b82e35 +Author: Yehuda Sadeh +Date: Tue Sep 30 14:15:47 2014 -0700 + + rgw: swift GET / HEAD object returns X-Timestamp field + + Fixes: #8911 + Backport: giant, firefly, dumpling + Swift clients expect X-Timestamp header, dump it. + + Signed-off-by: Yehuda Sadeh + (cherry picked from commit 5b41d80b7fb9ed96c26801fc42c044191bb18d84) + +commit c24fab3065ba6d81435981b609f2b69c3d98d21d +Author: Yehuda Sadeh +Date: Fri Jan 30 10:51:52 2015 -0800 + + rgw: use strict_strtoll() for content length + + instead of strict_strtol(). + + Backport: giant, firefly + Fixes: #10701 + + Reported-by: Axel Dunkel + Signed-off-by: Yehuda Sadeh + (cherry picked from commit 1c25dbafb45caf1b261cfcec15b868a2ba6b5fef) + +commit 6aef29e31e9c7c7ccf8e95d573700c08218b2b45 +Author: Yehuda Sadeh +Date: Thu Nov 20 10:36:05 2014 -0800 + + rgw-admin: create subuser if needed when creating user + + Fixes: #10103 + Backport: firefly, giant + This turned up after fixing #9973. Earlier we also didn't create the + subuser in this case, but we didn't really read the subuser info when it + was authenticating. Now we do that as required, so we end up failing the + authentication. 
This only applies to cases where a subuser was created + using 'user create', and not the 'subuser create' command. + + Reviewed-by: Sage Weil + Signed-off-by: Yehuda Sadeh + (cherry picked from commit 9ba17a321db06d3d76c9295e411c76842194b25c) + +commit a5cb39cbb6aee869b92ac20975b5c80a01210b63 +Author: Yehuda Sadeh +Date: Fri Dec 12 17:07:30 2014 -0800 + + rgw: use s->bucket_attrs instead of trying to read obj attrs + + Fixes: #10307 + Backport: firefly, giant + + This is needed, since we can't really read the bucket attrs by trying to + read the bucket entry point attrs. We already have the bucket attrs + anyway, use these. + + Signed-off-by: Yehuda Sadeh + (cherry picked from commit 5cf193c8686196d5235889e68cb5ea8f1fc8e556) + +commit 425ee8a07bb8ce12eee124b3c374031f644aa32b +Author: Yehuda Sadeh +Date: Fri Dec 12 17:07:30 2014 -0800 + + rgw: use s->bucket_attrs instead of trying to read obj attrs + + Fixes: #10307 + Backport: firefly, giant + + This is needed, since we can't really read the bucket attrs by trying to + read the bucket entry point attrs. We already have the bucket attrs + anyway, use these. + + Signed-off-by: Yehuda Sadeh + (cherry picked from commit 5cf193c8686196d5235889e68cb5ea8f1fc8e556) + +commit 83a0a2e5b52b18a25009faaf09fa4f48af3c52ee +Author: William A. Kennington III +Date: Sat Sep 20 22:52:31 2014 -0700 + + osd: Cleanup boost optionals + + Signed-off-by: William A. Kennington III + (cherry picked from commit a53ead14c113047567177630b4906136a2109b65) + +commit eb30631ec3d081fd1bc2cdbd4812a334de9e1282 +Author: Petr Machata +Date: Thu Jan 29 10:15:02 2015 -0700 + + support Boost 1.57.0 + + Sometime after 1.55, boost introduced a forward declaration of + operator<< in optional.hpp. In 1.55 and earlier, when << was used + without the _io having been included, what got dumped was an implicit + bool conversion. + + http://tracker.ceph.com/issues/10688 Refs: #10688 + Signed-off-by: Ken Dreyer + (cherry picked from commit 85717394c33137eb703a7b88608ec9cf3287f67a) + + Conflicts: + src/include/encoding.h + trivial conflict + +commit 1ccd73a16e1829b5519ec5b83b2554af173ad052 +Author: Yehuda Sadeh +Date: Thu Oct 23 17:39:42 2014 -0700 + + rgw: send http status reason explicitly in fastcgi + + There are issues in certain versions of apache 2.4, where the reason is + not sent back. Instead, just provide the reason explicitly. + + Backport: firefly, giant + + Signed-off-by: Yehuda Sadeh + (cherry picked from commit a9dd4af401328e8f9071dee52470a0685ceb296b) + +commit 3bf42af2e932a473b19cb54637e8543a666a4a28 +Author: Yehuda Sadeh +Date: Thu Jan 15 16:31:22 2015 -0800 + + rgw: fix partial GET in swift + + Fixes: #10553 + backport: firefly, giant + + Don't set the ret code to reflect partial download, just set the + response status when needed. + + Signed-off-by: Yehuda Sadeh + (cherry picked from commit 7e1553cedff90fa0fefded65cde87ad068eb5f0c) + +commit aa038684dce1964d5d23802d23f2bd772458ea11 +Author: Sage Weil +Date: Mon Dec 15 17:04:32 2014 -0800 + + osd: handle no-op write with snapshot case + + If we have a transaction that does something to the object but it !exists + both before and after, we will continue through the write path. 
If the + snapdir object already exists, and we try to create it again, we will + leak a snapdir obc and lock and later crash on an assert when the obc + is destroyed: + + 0> 2014-12-06 01:49:51.750163 7f08d6ade700 -1 osd/osd_types.h: In function 'ObjectContext::~ObjectContext()' thread 7f08d6ade700 time 2014-12-06 01:49:51.605411 + osd/osd_types.h: 2944: FAILED assert(rwstate.empty()) + + The fix is to not recreate the snapdir if it already exists. + + Fixes: #10262 + Signed-off-by: Sage Weil + (cherry picked from commit 02fae9fc54c10b5a932102bac43f32199d4cb612) + +commit e045ad4a39076547209ac1dc298df5ebffb76669 +Merge: a463b92 9f865fa +Author: Gregory Farnum +Date: Tue Jan 27 09:40:16 2015 -0800 + + Merge pull request #3502 from ceph/wip-10382-giant + + [giant backport] mds: handle heartbeat_reset during shutdown + + Reviewed-by: Greg Farnum + +commit 9f865fae095a1fe8a26acb50667f1d774d6020b6 +Author: John Spray +Date: Wed Jan 14 10:35:53 2015 +0000 + + mds: handle heartbeat_reset during shutdown + + Because any thread might grab mds_lock and call heartbeat_reset + immediately after a call to suicide() completes, this needs + to be handled as a special case where we tolerate MDS::hb having + already been destroyed. + + Fixes: #10382 + Signed-off-by: John Spray + +commit a463b92e475cd1f4cdb963e402033ebc9d37dbdc +Author: Sage Weil +Date: Mon Jan 19 18:28:20 2015 -0800 + + ceph_test_rados_api_misc: do not assert rbd feature match + + This test fails on upgrades when we (or the server) have new + features. Make it less fragile. + + Fixes: #10576 + Signed-off-by: Sage Weil + (cherry picked from commit 9147c62989871cea8b3a85b02c53017825efb55b) + +commit 14cdb9bb6d27f2017a3a8e6c1f274b9f40fb7456 +Author: Jason Dillaman +Date: Wed Jan 21 14:55:02 2015 -0500 + + rbd: ensure aio_write buffer isn't invalidated during image import + + The buffer provided to aio_write shouldn't be invalidated until + after aio_write has indicated that the operation has completed. + + Fixes: #10590 + Backport: giant + Signed-off-by: Jason Dillaman + Reviewed-by: Josh Durgin + (cherry picked from commit 4d3b49e9d62bc1040356ca3ebe7f90c181734eb6) + +commit 081f49b47ca8d7583211f546ab5699b14f773bfc +Author: Jason Dillaman +Date: Mon Dec 15 10:53:53 2014 -0500 + + librbd: complete all pending aio ops prior to closing image + + It was possible for an image to be closed while aio operations + were still outstanding. Now all aio operations are tracked and + completed before the image is closed. + + Fixes: #10299 + Backport: giant, firefly, dumpling + Signed-off-by: Jason Dillaman + +commit 436923c68b77c900b7774fbef918c0d6e1614a36 +Author: Jason Dillaman +Date: Mon Jan 19 10:28:56 2015 -0500 + + librbd: gracefully handle deleted/renamed pools + + snap_unprotect and list_children both attempt to scan all + pools. If a pool is deleted or renamed during the scan, + the methods would previously return -ENOENT. Both methods + have been modified to more gracefully handle this condition. + + Fixes: #10270 + Backport: giant, firefly + Signed-off-by: Jason Dillaman + +commit 4c8a5cedcb7942e1e01ab4cedfbf03e4c56cc1e4 +Author: Yehuda Sadeh +Date: Fri Dec 12 05:24:01 2014 -0800 + + rgw: change multipart upload id magic + + Fixes: #10271 + Backport: firefly, giant + + Some clients can't sign requests correctly with the original magic + prefix. 
+ + Reported-by: Georgios Dimitrakakis + Signed-off-by: Yehuda Sadeh + (cherry picked from commit 5fc7a0be67a03ed63fcc8408f8d71a31a1841076) + +commit b10c0d5110547586b2edac53c267391d3d42f974 +Author: Yehuda Sadeh +Date: Thu Dec 11 09:07:10 2014 -0800 + + rgw: url decode http query params correctly + + Fixes: #10271 + Backport: firefly + + This got broken by the fix for #8702. Since we now only url_decode if + we're in query, we need to specify that we're in query when decoding + these args. + + Reported-by: Georgios Dimitrakakis + Signed-off-by: Yehuda Sadeh + (cherry picked from commit 21e07eb6abacb085f81b65acd706b46af29ffc03) + +commit 65bf3b08b572b9b25ad064fb784742e5d6456f06 +Author: Josh Durgin +Date: Wed Jan 14 15:01:38 2015 -0800 + + qa: ignore duplicates in rados ls + + These can happen with split or with state changes due to reordering + results within the hash range requested. It's easy enough to filter + them out at this stage. + + Backport: giant, firefly + Signed-off-by: Josh Durgin + (cherry picked from commit e7cc6117adf653a4915fb7a75fac68f8fa0239ec) + +commit 1261bf24624f871672002ab0915e23f1c95b0aa5 +Author: Sage Weil +Date: Tue Oct 14 12:42:40 2014 -0700 + + Revert "Objecter: disable fast dispatch of CEPH_MSG_OSD_OPREPLY messages" + + This reverts commit 3f23709c474292f9239f77a6cce26309fc86ce29. + + We have changed mark_down() behavior so that it no longer blocks on + fast dispatch. This makes the objecter reply handler safe again. + + Fixes: #9598 + Signed-off-by: Sage Weil + (cherry picked from commit c9f9e72e558521cb90f90538bc27f995f82d76c2) + +commit 300d4c6ff7e998dba0c67f6dde746dc23d681397 +Author: Sage Weil +Date: Tue Oct 14 12:41:48 2014 -0700 + + msg/simple: do not stop_and_wait on mark_down + + We originally blocked in mark_down for fast dispatch threads + to complete to avoid various races in the code. Most of these + were in the OSD itself, where we were not prepared to get + messages on connections that had no attached session. Since + then, the OSD checks have been cleaned up to handle this. + There were other races we were worried about too, but the + details have been lost in the depths of time. + + Instead, take the other route: make mark_down never block on + dispatch. This lets us remove the special case that + was added in order to cope with fast dispatch calling + mark_down on itself. + + Now, the only stop_and_wait() user is the shutdown sequence. + + Signed-off-by: Sage Weil + (cherry picked from commit 00907e032011b9d2acd16ea588555cf379830814) + +commit c3335c7aa6c1e6c3f1879c0cd3cd2f13091221be +Author: Sage Weil +Date: Fri Oct 31 16:25:09 2014 -0700 + + msg/Pipe: inject delay in stop_and_wait + + Inject a delay in stop_and_wait. This will mostly affect the connection + race Pipe takeover code which currently calls stop_and_wait while holding + the msgr->lock. This should make it easier for a racing fast_dispatch + method to get stuck on a call that (indirectly) needs the msgr lock. + See #9921. 
+ + Signed-off-by: Sage Weil + (cherry picked from commit 2fe5c4c305218fdb1771857e4e0ef7c98a8d0fb6) + +commit 1dbe8f5a6f7bf2b7c86d24f27d569d71e0076ee9 +Author: Greg Farnum +Date: Tue Oct 28 16:45:43 2014 -0700 + + SimpleMessenger: Pipe: do not block on takeover while holding global lock + + We previously were able to cause deadlocks: + 1) Existing pipe is fast_dispatching + 2) Replacement incoming pipe is accepted + *) blocks on stop_and_wait() of existing Pipe + 3) External things are blocked on SimpleMessenger::lock() while + blocking completion of the fast dispatch. + + To resolve this, if we detect that an existing Pipe we want to take over is + in the process of fast dispatching, we unlock our locks and wait on it to + finish. Then we go back to the lookup step and retry. + + The effect of this should be safe: + 1) We are not making any changes to the existing Pipe in new ways + 2) We have not registered the new Pipe anywhere + 3) We have not sent back any replies based on Messenger state to + the remote endpoint. + + Backport: giant + Fixes: #9921 + Signed-off-by: Greg Farnum + (cherry picked from commit 2d6980570af2226fdee0edfcfe5a8e7f60fae615) + +commit 16c023d8fa5575d4dd138aeee4d4fd9b8f32c0f6 +Author: Sage Weil +Date: Thu Jan 8 13:34:52 2015 -0800 + + osd: requeue PG when we skip handling a peering event + + If we don't handle the event, we need to put the PG back into the peering + queue or else the event won't get processed until the next event is + queued, at which point we'll be processing events with a delay. + + The queue_null is not necessary (and is a waste of effort) because the + event is still in pg->peering_queue and the PG is queued. + + Note that this only triggers when we exceed osd_map_max_advance, usually + when there is a lot of peering and recovery activity going on. A + workaround is to increase that value, but if you exceed osd_map_cache_size + you expose yourself to cache thrashing by the peering work queue, which + can cause serious problems with heavily degraded clusters and bit lots of + people on dumpling. + + Backport: giant, firefly + Fixes: #10431 + Signed-off-by: Sage Weil + (cherry picked from commit 492ccc900c3358f36b6b14a207beec071eb06707) + +commit 16c6d0d589d53aad7bb2cd0e104300fb920d5caf +Author: Loic Dachary +Date: Tue Dec 16 13:31:30 2014 +0100 + + erasure-code: relax cauchy w restrictions + + A restriction that the w parameter of the cauchy technique is limited to + 8, 16 or 32 was added incorrectly while refactoring parameter parsing in + the jerasure plugin and must be relaxed. + + http://tracker.ceph.com/issues/10325 Fixes: #10325 + + Signed-off-by: Loic Dachary + (cherry picked from commit bb80437f247345502203ad87a7e7bbb5b5602b9a) + +commit 636b98faa6b1c9fd6de1b8653d1d282577b54684 +Author: Sage Weil +Date: Sun Nov 23 18:50:51 2014 -0800 + + crush/CrushWrapper: fix create_or_move_item when name exists but item does not + + We were using item_exists(), which simply checks if we have a name defined + for the item. Instead, use _search_item_exists(), which looks for an + instance of the item somewhere in the hierarchy. This matches what + get_item_weightf() is doing, which ensures we get a non-negative weight + that converts properly to floating point. 
+ + Backport: giant, firefly + Fixes: #9998 + Reported-by: Pawel Sadowski + Signed-off-by: Sage Weil + (cherry picked from commit 9902383c690dca9ed5ba667800413daa8332157e) + +commit ced2472664fab06d03de03d7b23325f9319163b7 +Author: Sage Weil +Date: Fri Nov 21 17:47:56 2014 -0800 + + crush/builder: prevent bucket weight underflow on item removal + + It is possible to set a bucket weight that is not the sum of the item + weights if you manually modify/build the CRUSH map. Protect against any + underflow on the bucket weight when removing items. + + Signed-off-by: Sage Weil + (cherry picked from commit 8c87e9502142d5b4a282b94f929ae776a49be1dc) + +commit adf8798dabb679110c6815af5d73ab6ff20a1af8 +Author: Sage Weil +Date: Fri Nov 21 17:37:03 2014 -0800 + + crush/CrushWrapper: fix _search_item_exists + + Reported-by: Pawel Sadowski + Signed-off-by: Sage Weil + (cherry picked from commit eeadd60714d908a3a033aeb7fd542c511e63122b) + +commit 6caa4fa42c6eaa76b3b9caf37e4ee09844f017a7 +Author: Warren Usui +Date: Thu Dec 18 20:00:28 2014 -0800 + + If trusty, use older version of qemu + + Fixes #10319 + Signed-off-by: Warren Usui + (cherry-picked from 46a1a4cb670d30397979cd89808a2e420cef2c11) + +commit 44c944e96440bd338d22533779e0650b99115a16 +Merge: abdbbd6 910ec62 +Author: Sage Weil +Date: Mon Dec 29 10:55:22 2014 -0800 + + Merge pull request #3266 from ceph/giant-10415 + + libcephfs/test.cc: close fd before umount + +commit b1e48820785a1d3153fc926ad21355b3927b44e9 +Author: Loic Dachary +Date: Sun Dec 28 10:29:54 2014 +0100 + + erasure-code: update links to jerasure upstream + + It moved from bitbucket to jerasure.org + + Signed-off-by: Loic Dachary + (cherry picked from commit 8e86f901939f16cc9c8ad7a4108ac4bcf3916d2c) + +commit 910ec624156d26a1830078161f47328a950a4eee +Author: Yan, Zheng +Date: Tue Dec 23 10:22:00 2014 +0800 + + libcephfs/test.cc: close fd before umount + + Fixes: #10415 + Signed-off-by: Yan, Zheng + (cherry picked from commit d3fb563cee4c4cf08ff4ee01782e52a100462429) + +commit abdbbd6e846727385cf0a1412393bc9759dc0244 +Author: Warren Usui +Date: Tue Dec 16 22:01:26 2014 -0800 + + Remove sepia dependency (use fqdn) + + Fixes: #10255 + Signed-off-by: Warren Usui + (cherry picked from commit 19dafe164833705225e168a686696fb4e170aba7) + +commit d1257436fdf79bad5fe0719a6be71e2abb2d2462 (refs/remotes/gh/giant-10277) +Author: Yan, Zheng +Date: Wed Dec 17 15:59:44 2014 +0800 + + client: use remount to trim kernel dcache + + when remounting a file system, linux kernel trims all unused dentry + in the file system. + + Fixes: #10277 + Signed-off-by: Yan, Zheng + +commit 9de9901cacd2ed2c8c5f65a938fb6a996efab4cd +Author: Yan, Zheng +Date: Wed Dec 17 15:46:49 2014 +0800 + + client: cleanup client callback registration + + Signed-off-by: Yan, Zheng + +commit d28c8e0fb924fbf36e4e15e19554ad30da3ff8f2 (refs/remotes/gh/wip-giant-mon-backports) +Author: Sage Weil +Date: Sun Nov 2 08:50:59 2014 -0800 + + mon/PGMap and PGMonitor: update last_epoch_clean cache from new osd keys + + We were only invalidating the cached value from apply_incremental, which + is no longer called on modern clusters. + + Fix this by storing the update epoch in the key as well (it is not part + of osd_stat_t). + + Backport: giant, firefly, dumpling(?) 
+ Fixes: #9987 + Signed-off-by: Sage Weil + (cherry picked from commit 093c5f0cabeb552b90d944da2c50de48fcf6f564) + +commit 7646f239476609c96b6baf94dfd5f727fff49502 +Author: Sage Weil +Date: Sun Nov 2 08:49:48 2014 -0800 + + mon/PGMap: invalidate cached min_last_epoch_clean from new-style pg keys + + We were only invalidating the cache from the legacy apply_incremental(), + which is no longer called on modern clusters. + + Fixes: #9987 + Signed-off-by: Sage Weil + (cherry picked from commit 3fb731b722c50672a5a9de0c86a621f5f50f2d06) + +commit 6ec14b07940ff64d6a121e21a730f691a1a71546 +Merge: 758d9cf 7bbf80f +Author: Gregory Farnum +Date: Thu Dec 11 17:03:07 2014 -0800 + + Merge pull request #3159 from ceph/wip-10229-giant + + osdc/Filer: use finisher to execute C_Probe and C_PurgeRange [giant backport] + + Reviewed-by: Greg Farnum + +commit 7bbf80ff7388f104cf318dd5ac61ca7d35274694 +Author: Yan, Zheng +Date: Thu Dec 4 12:18:47 2014 +0800 + + osdc/Filer: use finisher to execute C_Probe and C_PurgeRange + + Currently contexts C_Probe/C_PurgeRange are executed while holding + OSDSession::completion_lock. C_Probe and C_PurgeRange may call + Objecter::stat() and Objecter::remove() respectively, which acquire + Objecter::rwlock. This can cause deadlock because there is an intermediate + dependency between Objecter::rwlock and OSDSession::completion_lock: + + Objecter::rwlock -> OSDSession::lock -> OSDSession::completion_lock + + The fix is to execute C_Probe/C_PurgeRange in the finisher thread. + + Fixes: #10229 + Signed-off-by: Yan, Zheng + (cherry picked from commit d3ee89ace660161df7796affbf9a70f3d0dedce1) + +commit 758d9cf30bfc7736cc297ba3b047756f7eb8183e +Merge: a8e5638 994dcbb +Author: Gregory Farnum +Date: Thu Dec 11 10:47:38 2014 -0800 + + Merge pull request #3151 from ceph/wip-10288-giant + + mon: fix `fs ls` on peons [giant backport] + + Reviewed-by: Greg Farnum + +commit 994dcbbef4bea532aea4143c3ac1372ca14d2aea +Author: John Spray +Date: Thu Dec 11 14:00:57 2014 +0000 + + mon: fix `fs ls` on peons + + This was incorrectly using pending_mdsmap instead + of mdsmap. We didn't notice in testing because of + single-mon configurations. + + Fixes: #10288 + Backport: giant + + Signed-off-by: John Spray + (cherry picked from commit 5559e6aea9e9374ecdac0351777dfd6f5f5d1e67) + +commit 319f9c9352bfd1b95bd685500922e6cee2199b34 (refs/remotes/gh/wip-8797-giant) +Author: Dan Mick +Date: Wed Dec 10 13:19:53 2014 -0800 + + Call Rados.shutdown() explicitly before exit + + This is mostly a demonstration of good behavior, as the resources will + be reclaimed on exit anyway. + + Signed-off-by: Dan Mick + (cherry picked from commit b038e8fbf9103cc42a4cde734b3ee601af6019ea) + +commit ed8c9af3376aeb6f245cbab694fdbc0ce95634a8 +Author: Dan Mick +Date: Wed Dec 10 13:19:16 2014 -0800 + + rados.py: remove Rados.__del__(); it just causes problems + + Recent versions of Python contain a change to thread shutdown that + causes ceph to hang on exit; see http://bugs.python.org/issue21963. + As it turns out, this is relatively easy to avoid by not spawning + threads on exit, as Rados.__del__() will certainly do by calling + shutdown(); I suspect, but haven't proven, that the problem is + that shutdown() tries to start() a threading.Thread() that never + makes it all the way back to signal start(). + + Also add a PendingReleaseNote and extra doc comments to clarify. 
+ + Fixes: #8797 + Signed-off-by: Dan Mick + (cherry picked from commit 5ba9b8f21f8010c59dd84a0ef2acfec99e4b048f) + + Conflicts: + PendingReleaseNotes + +commit a8e56380f08cd5940def4cc47cadba699a8ba45d +Merge: 247a6fa e7faed5 +Author: Samuel Just +Date: Mon Dec 8 13:19:20 2014 -0800 + + Merge pull request #3010 from dachary/wip-10018-primary-erasure-code-hinfo-giant + + osd: deep scrub must not abort if hinfo is missing (giant) + + Reviewed-by: Samuel Just + +commit 247a6fac54854e92a7df0e651e248a262d3efa05 +Merge: 3372060 309fd5f +Author: Gregory Farnum +Date: Mon Dec 8 12:36:48 2014 -0800 + + Merge pull request #3110 from ceph/giant-10263 + + mds: store backtrace for straydir + + Reviewed-by: Greg Farnum + +commit 309fd5f56ef5ea76ffd525fdde6e6fbbc9ef6ef1 +Author: Yan, Zheng +Date: Fri Nov 7 11:38:37 2014 +0800 + + mds: store backtrace for straydir + + Backport: giant, firefly, emperor, dumpling + Signed-off-by: Yan, Zheng + (cherry picked from commit 0d89db5d3e5ae5d552d4058a88a4e186748ab1d2) + +commit 3372060894a1da0adef6d36380a131902ca05c5f +Merge: 1f00420 bff6747 +Author: Sage Weil +Date: Sat Dec 6 11:06:20 2014 -0800 + + Merge pull request #3088 from dachary/wip-10063-hobject-shard-giant + + common: do not omit shard when ghobject NO_GEN is set (giant) + +commit 1f004209434570337a3f90d7f89741f80dcc7075 +Merge: 3b65226 1ec557c +Author: Sage Weil +Date: Fri Dec 5 17:33:12 2014 -0800 + + Merge pull request #3095 from dachary/wip-9785-dmcrypt-keys-permissions-giant + + ceph-disk: dmcrypt file permissions (giant) + +commit 3b65226df806958f6a2f24df6099ee3a86d2a71f +Merge: 691f011 36c7484 +Author: Sage Weil +Date: Fri Dec 5 17:30:31 2014 -0800 + + Merge pull request #3006 from dachary/wip-9420-erasure-code-non-regression-giant + + erasure-code: store and compare encoded contents (giant) + +commit 1ec557c0eab94cb898ad3f5448482bd7afc53e09 +Author: Loic Dachary +Date: Thu Dec 4 22:21:32 2014 +0100 + + ceph-disk: dmcrypt file permissions + + The directory in which key files are stored for dmcrypt must be 700 and + the file 600. + + http://tracker.ceph.com/issues/9785 Fixes: #9785 + + Signed-off-by: Loic Dachary + (cherry picked from commit 58682d1776ab1fd4daddd887d921ca9cc312bf50) + +commit 691f0118ecd051d5f3f61fc696280e3c482de3de +Merge: 81295c5 dabf6f5 +Author: Sage Weil +Date: Fri Dec 5 09:03:54 2014 -0800 + + Merge pull request #3085 from dachary/wip-10125-radosgw-init-giant + + rgw: run radosgw as apache with systemd (giant) + +commit bff67475c775914237604ed3374c8ccfe74d0ffd +Author: Loic Dachary +Date: Fri Nov 14 01:16:10 2014 +0100 + + common: do not omit shard when ghobject NO_GEN is set + + Do not silence the display of shard_id when generation is NO_GEN. + Erasure coded objects JSON representation used by ceph_objectstore_tool + need the shard_id to find the file containing the chunk. + + Minimal testing is added to ceph_objectstore_tool.py + + http://tracker.ceph.com/issues/10063 Fixes: #10063 + + Signed-off-by: Loic Dachary + (cherry picked from commit dcf09aed121f566221f539106d10283a09f15cf5) + +commit dabf6f5f43b53a588bd9fa0cc5aa617ae8128735 +Author: Loic Dachary +Date: Tue Dec 2 18:10:48 2014 +0100 + + rgw: run radosgw as apache with systemd + + Same as sysv. 
+
+ http://tracker.ceph.com/issues/10125 Fixes: #10125
+
+ Signed-off-by: Loic Dachary
+ (cherry picked from commit 7b621f4abf63456272dec3449aa108c89504a7a5)
+
+ Conflicts:
+ src/init-radosgw.sysv
+
+commit 81295c5ad2befced2e308c1cfb4e036cd5a825a9
+Merge: 8046359 3ff94ed
+Author: Josh Durgin
+Date: Thu Dec 4 11:32:01 2014 -0800
+
+ Merge pull request #3077 from ceph/wip-10030-giant
+
+ librbd: don't close an already closed parent image upon failure
+
+ Reviewed-by: Josh Durgin
+
+commit ad04a677cefc1f0a02fbff0c68409fda6874fdc7
+Author: Loic Dachary
+Date: Tue Dec 2 00:59:08 2014 +0100
+
+ common: add bufferlist::rebuild_aligned_size_and_memory
+
+ The function bufferlist::rebuild_aligned checks memory and size
+ alignment with the same variable. It is however useful to separate
+ memory alignment constraints from size alignment constraints. For
+ instance rebuild_aligned could be called to allocate an erasure coded
+ buffer where each 2048-byte chunk needs to start on a memory address
+ aligned on 32 bytes.
+
+ Signed-off-by: Loic Dachary
+ (cherry picked from commit 9ade88e8dacc9b96c042bb668f4452447139a544)
+
+commit cc469b238f42ce989d0efa49154b95612e3d4111
+Author: Loic Dachary
+Date: Tue Dec 2 01:07:34 2014 +0100
+
+ erasure-code: enforce chunk size alignment
+
+ Let's say the ErasureCode::encode function is given a 4096-byte
+ bufferlist made of a 1249-byte bufferptr followed by a 2847-byte
+ bufferptr, both properly starting on a SIMD_ALIGN address. When
+ bufferlist::substr_of extracts the second 2048-byte chunk, that chunk
+ starts 799 bytes past the beginning of the 2847-byte bufferptr, is
+ therefore not SIMD_ALIGN'ed, and has to be reallocated.
+
+ ErasureCode::encode must enforce a size alignment based on the chunk
+ size in addition to the memory alignment required by SIMD operations,
+ using the bufferlist::rebuild_aligned_size_and_memory function instead
+ of bufferlist::rebuild_aligned.
+
+ http://tracker.ceph.com/issues/10211 Fixes: #10211
+
+ Signed-off-by: Loic Dachary
+ (cherry picked from commit 4e955f41297283798236c505c3d21bdcabb5caa0)
+
+commit 5205d4dacf7ebe2e42d2294bc30cb27f226c8d22
+Author: Loic Dachary
+Date: Tue Dec 2 02:04:14 2014 +0100
+
+ common: allow size alignment that is not a power of two
+
+ Do not assume the alignment is a power of two in the is_n_align_sized()
+ predicate. When used in the context of erasure coding it is common
+ for chunk sizes not to be powers of two.
+
+ Signed-off-by: Loic Dachary
+ (cherry picked from commit 73ad2d63d479b09037d50246106bbd4075fbce80)
+
+commit 80463596919d28f58010d16ad017b3c5ae6e558c
+Merge: 26e8cf1 3dc6298
+Author: Sage Weil
+Date: Wed Dec 3 23:02:43 2014 -0800
+
+ Merge pull request #3062 from ceph/wip-10123-giant
+
+ librbd: protect list_children from invalid child pool IoCtxs
+
+ Reviewed-by: Sage Weil
+
+commit 26e8cf174b8e76b4282ce9d9c1af6ff12f5565a9
+Merge: aac7946 7cd8c3f
+Author: Gregory Farnum
+Date: Wed Dec 3 06:44:56 2014 -0800
+
+ Merge pull request #3055 from ceph/wip-10135-giant
+
+ mon: OSDMonitor: allow adding tiers to FS pools
+
+ Reviewed-by: Greg Farnum
+
+commit 7cd8c3f8a5afa9481b6f6a78d5fb8c04784ef4ca (refs/remotes/gh/wip-10135-giant)
+Author: John Spray
+Date: Tue Nov 25 16:54:42 2014 +0000
+
+ mon: OSDMonitor: allow adding tiers to FS pools
+
+ This was an overly strict check. In fact it is perfectly
+ fine to set an overlay on a pool that is already in use
+ as a filesystem data or metadata pool.
+ + Fixes: #10135 + + Signed-off-by: John Spray + (cherry picked from commit 17b5fc9a40440e76dd1fa64f7fc19577ae3b58ce) + +commit 3ff94ed73ff27af2c8ea215ab693d815e285a27f +Author: Jason Dillaman +Date: Thu Nov 6 05:01:38 2014 -0500 + + librbd: don't close an already closed parent image upon failure + + If librbd is not able to open a child's parent image, it will + incorrectly close the parent image twice, resulting in a crash. + + Fixes: #10030 + Backport: firefly, giant + Signed-off-by: Jason Dillaman + (cherry picked from commit 61ebfebd59b61ffdc203dfeca01ee1a02315133e) + +commit aac794616580ed0bb00608c5867199b12d4e1d94 +Merge: 65f6814 c8b46d6 +Author: John Spray +Date: Tue Dec 2 11:35:59 2014 +0000 + + Merge pull request #2990 from ceph/wip-10151-giant + + mon: fix MDS health status from peons + + Reviewed-by: Greg Farnum + +commit 3dc629822adeee961d78208b46b9bd7ef1200890 +Author: Jason Dillaman +Date: Mon Nov 17 21:49:26 2014 -0500 + + librbd: protect list_children from invalid child pool IoCtxs + + While listing child images, don't ignore error codes returned + from librados when creating an IoCtx. This will prevent seg + faults from occurring when an invalid IoCtx is used. + + Fixes: #10123 + Backport: giant, firefly, dumpling + Signed-off-by: Jason Dillaman + (cherry picked from commit 0d350b6817d7905908a4e432cd359ca1d36bab50) + +commit 65f6814847fe8644f5d77a9021fbf13043b76dbe +Merge: 28e2708 9158326 +Author: Gregory Farnum +Date: Mon Dec 1 17:59:19 2014 -0800 + + Merge pull request #3047 from ceph/wip-10011-giant + + osdc: fix Journaler write error handling [giant backport] + + Reviewed-by: Greg Farnum + +commit 9158326eeb69312283a6e8174352f36ea30d0cbf +Author: John Spray +Date: Thu Nov 6 11:46:29 2014 +0000 + + osdc: fix Journaler write error handling + + Since we started wrapping the write error + handler in a finisher, multiple calls to + handle_write_error would hit the assert() + on the second call before the actual + handler had been called (at the other end + of the finisher) from the first call. + + The symptom was that the MDS was intermittently + failing to respawn on blacklist, seen in #10011. + + Signed-off-by: John Spray + (cherry picked from commit 762eda88a18ba707bd5410f38e21e95c4a6b3a46) + +commit 28e27080e25f95851039a0cc4e1c1d06b2cd597d +Merge: 37ffccb cb1d681 +Author: Sage Weil +Date: Tue Nov 25 21:18:59 2014 -0800 + + Merge pull request #3005 from dachary/wip-9665-ceph-disk-partprobe-giant + + ceph disk zap must call partprobe + +commit e7faed5d903cf7681d77a6af53cf8137eeb2fc69 +Author: Loic Dachary +Date: Thu Nov 6 17:11:20 2014 +0100 + + osd: deep scrub must not abort if hinfo is missing + + Instead it should set read_error. + + http://tracker.ceph.com/issues/10018 Fixes: #10018 + + Signed-off-by: Loic Dachary + (cherry picked from commit 9d84d2e8309d26e39ca849a75166d2d7f2dec9ea) + +commit 36c7484c18fd072ba2f7b176403414dd32fbe92b +Author: Loic Dachary +Date: Thu Sep 25 14:46:07 2014 +0200 + + erasure-code: erasure_code_benchmark exhaustive erasure exploration + + Add the --erasure-generation exhaustive flag to try all combinations of + erasures, not just one at random. 
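+
+ To make "exhaustive" concrete, here is a minimal sketch (hypothetical
+ names, not the benchmark's actual code) of trying every combination of
+ two erasures out of n chunks:
+
+ #include <vector>
+
+ // visit every way of erasing two chunks out of n
+ void for_each_two_erasures(int n) {
+   for (int i = 0; i < n; ++i) {
+     for (int j = i + 1; j < n; ++j) {
+       std::vector<int> erasures = {i, j};
+       // decode with chunks i and j erased and verify the recovered
+       // content matches the original
+     }
+   }
+ }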
+
+ Signed-off-by: Loic Dachary
+ (cherry picked from commit 2d7adb23bc52e7c0753f4571fecd8eefa209ef02)
+
+ Conflicts:
+ src/test/erasure-code/ceph_erasure_code_benchmark.h
+
+commit 32daa9b0f4d39f8a49512b18d5c19437aca5fec6
+Author: Loic Dachary
+Date: Mon Sep 29 11:17:13 2014 +0200
+
+ erasure-code: add erasure_code_benchmark --verbose
+
+ Signed-off-by: Loic Dachary
+ (cherry picked from commit 3ff2816b3eecfb7277295583387549dac5429628)
+
+ Conflicts:
+ src/test/erasure-code/ceph_erasure_code_benchmark.cc
+ src/test/erasure-code/ceph_erasure_code_benchmark.h
+
+commit da9a7f07787d7f8c20b0c3e7a53fcaf95ed7ca20
+Author: Loic Dachary
+Date: Tue Sep 23 14:37:57 2014 +0200
+
+ erasure_code: implement ceph_erasure_code to assert the existence of a plugin
+
+ This is handy when scripting in the context of teuthology, for
+ instance to only conditionally run tests for the isa plugin.
+
+ Signed-off-by: Loic Dachary
+ (cherry picked from commit efe121d9f2028c312eef2650d32ccf0cbc828edb)
+
+commit c855f3958fb8c10bd824075c1739f40799f6d74b
+Author: Loic Dachary
+Date: Tue Sep 23 14:36:08 2014 +0200
+
+ erasure-code: ceph_erasure_code does not need to avoid dlclose
+
+ The only reason for not dlclosing plugins at exit is for callgrind but
+ ceph_erasure_code has no workload that would require callgrind.
+
+ Signed-off-by: Loic Dachary
+ (cherry picked from commit 49613cb2aab6e73e3ea50fa164735b55e80121cd)
+
+commit ba8ceb1f067e0f9f6419358435ed0008b61fa438
+Author: Loic Dachary
+Date: Tue Sep 23 11:38:09 2014 +0200
+
+ erasure-code: add corpus verification to make check
+
+ Signed-off-by: Loic Dachary
+ (cherry picked from commit 6fdbdff2ad1b55d4a37dcb95cfbb06c4454cdaf2)
+
+commit ca4c2702139cc7fd8f2e3fa2ee5cda4094ecad79
+Author: Loic Dachary
+Date: Sat Sep 13 12:58:27 2014 +0200
+
+ erasure-code: Makefile.am cosmetics
+
+ Cluster benchmark-related lines together.
+
+ Signed-off-by: Loic Dachary
+ (cherry picked from commit 10c88c8f27080a8e25f128b7065cee5c2f68e91b)
+
+commit 208a5ee1c9975adaa8b09b1bf541aff0d8551c63
+Author: Loic Dachary
+Date: Sat Sep 13 12:55:26 2014 +0200
+
+ erasure-code: s/alignement/alignment/ typos in jerasure
+
+ The jerasure-per-chunk-alignment parameter was misspelled and, while
+ usable, that would lead to confusion.
+
+ Signed-off-by: Loic Dachary
+ (cherry picked from commit 2c84d0b1db57d918840e669a17bbd8c5ddca9747)
+
+commit 1def82d530965bd0441e4f7f6aa032666984f17d
+Author: Loic Dachary
+Date: Sat Sep 13 13:36:09 2014 +0200
+
+ erasure-code: workunit to check for encoding regression
+
+ Clone the archive of encoded objects and decode all archived objects, up
+ to and including the current ceph version.
+
+ http://tracker.ceph.com/issues/9420 Refs: #9420
+
+ Signed-off-by: Loic Dachary
+ (cherry picked from commit 7638b15f23976c3265cf766e16cf93af1a7e0091)
+
+commit 1b7fc7e49e6edf0d0f7d1d6d9f9447c42067d8b8
+Author: Loic Dachary
+Date: Sat Sep 13 10:16:31 2014 +0200
+
+ erasure-code: store and compare encoded contents
+
+ Introduce ceph_erasure_code_non_regression to check and compare how an
+ erasure code plugin encodes and decodes content with a given set of
+ parameters. For instance:
+
+ ./ceph_erasure_code_non_regression \
+     --plugin jerasure \
+     --parameter technique=reed_sol_van \
+     --parameter k=2 \
+     --parameter m=2 \
+     --stripe-width 3181 \
+     --create \
+     --check
+
+ Will create an encoded object (--create) and store it into a directory
+ along with the chunks, one chunk per file. The directory name is derived
+ from the parameters.
The content of the object is a random pattern of 31 + bytes repeated to fill the object size specified with --stripe-width. + + The check function (--check) reads the object back from the file, + encodes it and compares the result with the content of the chunks read + from the files. It also attempts recover from one or two erasures. + + Chunks encoded by a given version of Ceph are expected to be encoded + exactly in the same way by all Ceph versions going forward. + + http://tracker.ceph.com/issues/9420 Refs: #9420 + + Signed-off-by: Loic Dachary + (cherry picked from commit f5901303dbf50e9d08f2f1e510a1936a20037909) + +commit cb1d68113477cf9c2028a65372d2d4a3e6a8bdc1 +Author: Loic Dachary +Date: Thu Oct 9 18:52:17 2014 +0200 + + ceph-disk: run partprobe after zap + + Not running partprobe after zapping a device can lead to the following: + + * ceph-disk prepare /dev/loop2 + * links are created in /dev/disk/by-partuuid + * ceph-disk zap /dev/loop2 + * links are not removed from /dev/disk/by-partuuid + * ceph-disk prepare /dev/loop2 + * some links are not created in /dev/disk/by-partuuid + + This is assuming there is a bug in the way udev events are handled by + the operating system. + + http://tracker.ceph.com/issues/9665 Fixes: #9665 + + Signed-off-by: Loic Dachary + (cherry picked from commit fed3b06c47a5ef22cb3514c7647544120086d1e7) + +commit d9c04b880d8bf867aa454132117119be5bd550ad +Author: Loic Dachary +Date: Fri Oct 10 10:26:31 2014 +0200 + + ceph-disk: use update_partition in prepare_dev and main_prepare + + In the case of prepare_dev the partx alternative was missing and is not + added because update_partition does it. + + http://tracker.ceph.com/issues/9721 Fixes: #9721 + + Signed-off-by: Loic Dachary + (cherry picked from commit 23e71b1ee816c0ec8bd65891998657c46e364fbe) + +commit 5c9cd3c2a292ae540fd0a487defaf4d712e41e62 +Author: Loic Dachary +Date: Fri Oct 10 10:23:34 2014 +0200 + + ceph-disk: encapsulate partprobe / partx calls + + Add the update_partition function to reduce code duplication. + The action is made an argument although it always is -a because it will + be -d when deleting a partition. 
+
+ Use the update_partition function in prepare_journal_dev
+
+ Signed-off-by: Loic Dachary
+ (cherry picked from commit 922a15ea6865ef915bbdec2597433da6792c1cb2)
+
+commit c8b46d68c71f66d4abbda1230741cc4c7284193b
+Author: John Spray
+Date: Mon Nov 24 11:00:25 2014 +0000
+
+ mon: fix MDS health status from peons
+
+ The health data was there, but we were attempting
+ to enumerate MDS GIDs from pending_mdsmap (empty on
+ peons) instead of mdsmap (populated from paxos updates)
+
+ Fixes: #10151
+ Backport: giant
+
+ Signed-off-by: John Spray
+ (cherry picked from commit 0c33930e3a90f3873b7c7b18ff70dec2894fce29)
+
+ Conflicts:
+ src/mon/MDSMonitor.cc
+
+commit 37ffccbd57e7e441e0eb1499e5c173aa9c375d35
+Merge: b13a56a 65c5657
+Author: Josh Durgin
+Date: Thu Nov 20 13:13:33 2014 -0800
+
+ Merge pull request #2975 from ceph/wip-9936-giant
+
+ rbd: Fix rbd export when the image size is more than 2G
+
+ Reviewed-by: Josh Durgin
+
+commit b13a56afe99c091600392a2fc15befa9438d59c9
+Merge: 828c1a2 46bd344
+Author: Loic Dachary
+Date: Wed Nov 19 02:40:47 2014 +0100
+
+ Merge pull request #2963 from ceph/wip-10114-giant
+
+ Wip 10114 giant
+
+ Reviewed-by: Loic Dachary
+
+commit 828c1a2bcd81a49264f8a81ca7f1070169037820
+Merge: ccfd241 6cb9a24
+Author: David Zafman
+Date: Tue Nov 18 15:48:16 2014 -0800
+
+ Merge pull request #2958 from ceph/wip-10128-giant
+
+ ceph_objectstore_tool: When exporting to stdout, don't cout messages
+
+ Reviewed-by: Loic Dachary
+
+commit 46bd3441b00c22ba78f04617fd77f0231ccc698d
+Author: Dan Mick
+Date: Tue Nov 18 15:21:30 2014 -0800
+
+ erasure-code isa-l: remove duplicated lines (fix warning)
+
+ 06a245a added a section def to assembly files; I added it twice to
+ this file. There's no damage, but it causes a compiler warning (on
+ machines with yasm installed)
+
+ Signed-off-by: Dan Mick
+ (cherry picked from commit 10f6ef185a9d09e396e94036ec90bfe8a0738ce9)
+
+commit 1bba3887038aade137a808d751561cc02002f4bf
+Author: Dan Mick
+Date: Fri Nov 14 17:59:57 2014 -0800
+
+ Add annotation to all assembly files to turn off stack-execute bit
+
+ See discussion in http://tracker.ceph.com/issues/10114
+
+ Building with these changes allows output from readelf like this:
+
+ $ readelf -lW src/.libs/librados.so.2 | grep GNU_STACK
+ GNU_STACK 0x000000 0x0000000000000000 0x0000000000000000 0x000000
+ 0x000000 RW 0x8
+
+ (note the absence of 'X' in 'RW')
+
+ Fixes: #10114
+ Signed-off-by: Dan Mick
+ (cherry picked from commit 06a245a9845c0c126fb3106b41b2fd2bc4bc4df3)
+
+commit 6cb9a2499cac2645e2cc6903ab29dfd95aac26c7
+Author: David Zafman
+Date: Mon Nov 17 23:02:50 2014 -0800
+
+ ceph_objectstore_tool: When exporting to stdout, don't cout messages
+
+ Fixes: #10128
+ Caused by a2bd2aa7
+
+ Signed-off-by: David Zafman
+ (cherry picked from commit 0d5262ac2f69ed3996af76a72894b1722a27b37d)
+
+commit 13bb880b2a04ab354506eb183d2907b9054bf937
+Author: Sebastien Ponce
+Date: Tue Nov 18 10:30:36 2014 +0100
+
+ Fixed a locking issue in the trunc method of libradosstriper leading to potential race conditions - Fixes: #10129
+
+ Signed-off-by: Sebastien Ponce
+ (cherry picked from commit 8613984373de946e1815cc84d50bbd4437a3f7a7)
+
+commit 65c565701eb6851f4ed4d2dbc1c7136dfaad6bcb
+Author: Vicente Cheng
+Date: Wed Oct 29 12:21:11 2014 +0800
+
+ rbd: Fix rbd export when the image size is more than 2G
+
+ When exporting an image whose size is more than 2G, the previous
+ version of finish() could not seek to the correct offset in the image
+ and returned an error.
+
+ This was caused by using an incorrect variable type.
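+
+ A sketch of the failure mode (illustrative only, not the actual rbd
+ code): the seek result is a 64-bit offset, and storing it in an int
+ truncates offsets at 2G, turning valid positions into negative
+ "errors".
+
+ #include <unistd.h>
+
+ off_t seek_to(int fd, off_t offset) {
+   off_t pos = lseek(fd, offset, SEEK_SET);  // keep the wide type
+   // int pos = lseek(fd, offset, SEEK_SET); // truncates beyond 2 GB
+   return pos;
+ }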
+
+ Use the correct variable type to fix it: I use another variable of
+ type uint64_t to validate the seek offset, and keep the previous int r
+ for returning errors.
+
+ uint64_t is better suited than int for handling lseek64().
+
+ Signed-off-by: Vicente Cheng
+ (cherry picked from commit 4b87a81c86db06f6fe2bee440c65fc05cd4c23ce)
+
+commit ccfd2414c68afda55bf4cefa2441ea6d53d87cc6
+Author: Sage Weil
+Date: Wed Nov 12 17:11:10 2014 -0800
+
+ osd/OSD: use OSDMap helper to determine if we are the correct op target
+
+ Use the new helper. This fixes our behavior for EC pools, where
+ targeting a different shard is not correct, while for replicated pools
+ it may be. In the EC case, it leaves the op hanging indefinitely in the
+ OpTracker because the pgid exists but as a different shard.
+
+ Fixes: #9835
+ Signed-off-by: Sage Weil
+ (cherry picked from commit 9e05ba086a36ae9a04b347153b685c2b8adac2c3)
+
+commit 963947718a954f63f351ce4034bf97380421ab7c
+Author: Sage Weil
+Date: Wed Nov 12 17:04:35 2014 -0800
+
+ osd/OSDMap: add osd_is_valid_op_target()
+
+ Helper to check whether an osd is a given op target for a pg. This
+ assumes that for EC we always send ops to the primary, while for
+ replicated we may target any replica.
+
+ Signed-off-by: Sage Weil
+ (cherry picked from commit 89c02637914ac7332e9dbdbfefc2049b2b6c127d)
+
+commit 0cb32c157c6c11b26607521a20c6f320c5170516
+Author: Josh Durgin
+Date: Tue Nov 11 18:16:02 2014 -0800
+
+ qa: allow small allocation diffs for exported rbds
+
+ The local filesystem may behave slightly differently. This isn't
+ foolproof, but seems to be reliable enough on rhel7 rootfs, where
+ exact comparison was failing.
+
+ Fixes: #10002
+ Signed-off-by: Josh Durgin
+ (cherry picked from commit e94d3c11edb9c9cbcf108463fdff8404df79be33)
+
+commit fe705c8fdef2371d3f5b11eb73f87a0cf6ef0f9e
+Author: Adam Crume
+Date: Thu Sep 18 16:57:27 2014 -0700
+
+ common: Add cctid meta variable
+
+ Fixes: #6228
+ Signed-off-by: Adam Crume
+ (cherry picked from commit bb45621cb117131707a85154292a3b3cdd1c662a)
+
+commit 5fc659a0d52094a4c595ca8b33b407ecdefc180a
+Merge: b27f5db a6c02a1
+Author: Sage Weil
+Date: Tue Nov 11 08:28:19 2014 -0800
+
+ Merge pull request #2804 from ceph/wip-9301-giant
+
+ mon: backport paxos off-by-one bug (9301) to giant
+
+commit b27f5dba8677ca48c9819980e3c90b76f5f04267
+Merge: 97e423f fc5354d
+Author: Gregory Farnum
+Date: Mon Nov 10 22:41:19 2014 -0800
+
+ Merge pull request #2887 from ceph/wip-9977-backport
+
+ tools: skip up to expire_pos in journal-tool
+
+ Reviewed-by: Greg Farnum
+
+commit 97e423f52155e2902bf265bac0b1b9ed137f8aa0
+Author: Yan, Zheng
+Date: Thu Sep 11 09:36:44 2014 +0800
+
+ client: trim unused inodes before reconnecting to recovering MDS
+
+ So the recovering MDS does not need to fetch these unused inodes during
+ cache rejoin. This may reduce MDS recovery time.
+
+ Signed-off-by: Yan, Zheng
+ (cherry picked from commit 2bd7ceeff53ad0f49d5825b6e7f378683616dffb)
+
+ Reviewed-by: Greg Farnum
+
+commit 387efc5fe1fb148ec135a6d8585a3b8f8d97dbf8
+Author: John Spray
+Date: Mon Oct 27 12:02:17 2014 +0000
+
+ client: allow xattr caps in inject_release_failure
+
+ Because some test environments generate spurious
+ rmxattr operations, allow the client to release
+ 'X' caps. Allows xattr operations to proceed
+ while still preventing the client from releasing other caps.
+
+ Fixes: #9800
+ Signed-off-by: John Spray
+ (cherry picked from commit 5691c68a0a44eb2cdf0afb3f39a540f5d42a5c0c)
+
+ Reviewed-by: Greg Farnum
+
+commit fc5354dec55248724f8f6b795e3a96882c33b490 (refs/remotes/gh/wip-9977-backport)
+Author: John Spray
+Date: Mon Nov 3 19:19:45 2014 +0000
+
+ tools: skip up to expire_pos in journal-tool
+
+ Previously this only worked for journals starting from an
+ object boundary (i.e. freshly created filesystems)
+
+ Fixes: #9977
+ Signed-off-by: John Spray
+ (cherry picked from commit 65c33503c83ff8d88781c5c3ae81d88d84c8b3e4)
+
+ Conflicts:
+ src/tools/cephfs/JournalScanner.cc
+
+commit 9680613141b3eef62f35a6728e654efa5f6ba8e8
+Merge: fd4363d a5984ba
+Author: Gregory Farnum
+Date: Fri Nov 7 16:26:54 2014 -0800
+
+ Merge pull request #2876 from ceph/giant-readdir-fix
+
+ Giant readdir fix
+
+commit fd4363d1bd49f73e1b3c22516686c7b7e1745b57
+Merge: f66bf31 7166ff8
+Author: Gregory Farnum
+Date: Fri Nov 7 14:10:40 2014 -0800
+
+ Merge pull request #2879 from ceph/wip-10025-giant
+
+ #10025/giant -- tools: fix MDS journal import
+
+ Reviewed-by: Greg Farnum
+
+commit 7166ff83f6343d31d52a58363e2767434554505c
+Author: John Spray
+Date: Fri Nov 7 11:34:43 2014 +0000
+
+ tools: fix MDS journal import
+
+ Previously it only worked on fresh filesystems which
+ hadn't been trimmed yet, and resulted in an invalid
+ trimmed_pos when expire_pos wasn't on an object
+ boundary.
+
+ Fixes: #10025
+
+ Signed-off-by: John Spray
+ (cherry picked from commit fb29e71f9a97c12354045ad2e128156e503be696)
+
+commit a5984ba34cb684dae623df22e338f350c8765ba5
+Author: Yan, Zheng
+Date: Mon Oct 27 13:57:16 2014 -0700
+
+ client: fix I_COMPLETE_ORDERED checking
+
+ The current code marks a directory inode as complete and ordered when
+ readdir finishes, but it does not check if the directory was modified
+ in the middle of readdir. This is wrong: the directory inode should not
+ be marked as ordered if it was modified during readdir.
+
+ The fix is to introduce a new counter in the inode data structure; we
+ increase the counter each time the directory is modified. When readdir
+ finishes, we check the counter to decide if the directory should be
+ marked as ordered.
+
+ Fixes: #9894
+ Signed-off-by: Yan, Zheng
+ (cherry picked from commit a4caed8a53d011b214ab516090676641f7c4699d)
+
+commit b5ff4e99c87958211e4b7716b59084fc3417ec17
+Author: Yan, Zheng
+Date: Tue Sep 9 17:34:46 2014 +0800
+
+ client: preserve ordering of readdir result in cache
+
+ Preserve the ordering of the readdir result in a list, so that the
+ result of cached readdir is consistent with uncached readdir.
+
+ As a side effect, this commit also removes the code that removes stale
+ dentries. This is OK because stale dentries do not have a valid lease;
+ they will be filtered out by the shared gen check in
+ Client::_readdir_cache_cb()
+
+ Signed-off-by: Yan, Zheng
+ (cherry picked from commit 346c06c1647658768e927a47768a0bc74de17b53)
+
+commit 0671c1127015311d9894f15e2493805f93432910
+Author: Yan, Zheng
+Date: Tue Sep 9 14:06:06 2014 +0800
+
+ client: introduce a new flag indicating if dentries in directory are sorted
+
+ When creating a file, Client::insert_dentry_inode() sets the dentry's
+ offset based on the directory's max offset. The offset does not reflect
+ the real position of the dentry in the directory, and a later readdir
+ reply from the MDS may change the dentry's position/offset. This
+ inconsistency can cause missing or duplicate entries in the readdir
+ result if readdir is partly satisfied by dcache_readdir().
+
+ The fix is to introduce a new flag indicating whether the dentries in a
+ directory are sorted. We use _readdir_cache_cb() to handle readdir only
+ when the flag is set, and clear the flag after creating/deleting/renaming
+ a file.
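+
+ A minimal sketch of the counter idea described in the readdir fixes
+ above (hypothetical names, not the actual Client/Inode code):
+
+ #include <cstdint>
+
+ struct DirInode {
+   bool ordered = false;
+   uint64_t change_count = 0;  // bumped on create/unlink/rename
+ };
+
+ void readdir_start(const DirInode& dir, uint64_t& snapshot) {
+   snapshot = dir.change_count;
+ }
+
+ void readdir_finish(DirInode& dir, uint64_t snapshot) {
+   // only mark as ordered if no modification happened during readdir
+   if (dir.change_count == snapshot)
+     dir.ordered = true;
+ }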
+
+ Fixes: #9178
+ Signed-off-by: Yan, Zheng
+ (cherry picked from commit 600af25493947871c38214aa370e2544a7fea399)
+
+commit f66bf31b6743246fb1c88238cf18101238dee3a4
+Author: Greg Farnum
+Date: Thu Nov 6 17:48:01 2014 -0800
+
+ qa: use sudo even more when rsyncing /usr
+
+ Signed-off-by: Greg Farnum
+ (cherry picked from commit 3aa7797741f9cff06053a2f31550fe6929039692)
+
+commit f7ec448d6579f965eec184416a97b47ae27ab47a
+Merge: f410d76 f111bc8
+Author: Loic Dachary
+Date: Wed Nov 5 08:51:18 2014 +0100
+
+ Merge pull request #2858 from ceph/wip-9909
+
+ tools: rados put /dev/null should write() and not create()
+
+ Reviewed-by: Loic Dachary
+
+commit f111bc8eac8a521b13340f4a75418d839725e010
+Author: Loic Dachary
+Date: Thu Oct 2 09:23:55 2014 +0200
+
+ tools: rados put /dev/null should write() and not create()
+
+ In the rados.cc special case that handles putting an empty object, use
+ write_full() instead of create().
+
+ A special case was introduced in 6843a0b81f10125842c90bc63eccc4fd873b58f2
+ to create() an object if the rados put file is empty. Prior to this fix
+ an attempt to rados put an empty file was a noop. The problem with this
+ fix is that it is not idempotent: rados put of an empty file twice would
+ fail the second time, while rados put of a one-byte file would succeed
+ as expected.
+
+ Signed-off-by: Loic Dachary
+ (cherry picked from commit 50e80407f3c2f74d77ba876d01e7313c3544ea4d)
+
+commit f410d764d2e6795389cb320b4436cff3607927bd
+Author: Yehuda Sadeh
+Date: Thu Oct 9 10:20:27 2014 -0700
+
+ rgw: set length for keystone token validation request
+
+ Fixes: #7796
+ Backport: giant, firefly
+ We need to set the content length on this request, as the server might
+ not handle a chunked request (even though we don't send anything).
+
+ Tested-by: Mark Kirkwood
+ Signed-off-by: Yehuda Sadeh
+ (cherry picked from commit 3dd4ccad7fe97fc16a3ee4130549b48600bc485c)
+
+commit dba7defc623474ad17263c9fccfec60fe7a439f0
+Merge: 6a201f8 e0b0441
+Author: Sage Weil
+Date: Fri Oct 31 08:35:42 2014 -0700
+
+ Merge pull request #2846 from dachary/wip-9752-past-intervals-giant
+
+ osd: past_interval display bug on acting
+
+commit e0b04414b92018277a0d3b9d82e72ea7529f4ef5
+Author: Loic Dachary
+Date: Fri Oct 31 00:49:21 2014 +0100
+
+ osd: past_interval display bug on acting
+
+ The acting array was incorrectly including the primary and up_primary.
+
+ http://tracker.ceph.com/issues/9752 Fixes: #9752
+
+ Signed-off-by: Loic Dachary
+ (cherry picked from commit c5f8d6eded52da451fdd1d807bd4700221e4c41c)
+
+commit 6a201f89b1aa6c2197383c29919cdeb4a8353d1b
+Merge: ebe1637 905aba2
+Author: Yan, Zheng
+Date: Thu Oct 30 17:01:12 2014 -0700
+
+ Merge pull request #2841 from ceph/giant-9869
+
+ Backport "client: cast m->get_client_tid() to compare to 16-bit Inode::flushing_cap_tid"
+
+commit 905aba2f3d847933f98124f3ea8d1d76d644edb4
+Author: Greg Farnum
+Date: Wed Oct 22 17:16:31 2014 -0700
+
+ client: cast m->get_client_tid() to compare to 16-bit Inode::flushing_cap_tid
+
+ m->get_client_tid() is 64 bits (as it should be), but
+ Inode::flushing_cap_tid is only 16 bits. 16 bits should be plenty to
+ let the cap flush updates pipeline appropriately, but we need to cast
+ in the proper direction when comparing these differently-sized
+ versions. So downcast the 64-bit one to 16 bits.
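+
+ The pitfall, sketched with plain integer types (not the actual client
+ code): comparing the two values directly promotes the 16-bit side to
+ 64 bits, so the comparison stops matching once the tid exceeds 65535;
+ downcasting the 64-bit side restores the intended match.
+
+ #include <cstdint>
+
+ bool tid_matches(uint64_t msg_tid, uint16_t flushing_cap_tid) {
+   // wrong: msg_tid == flushing_cap_tid (promotes, compares 64 bits)
+   return static_cast<uint16_t>(msg_tid) == flushing_cap_tid;
+ }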
+
+ Fixes: #9869
+ Backport: giant, firefly, dumpling
+
+ Signed-off-by: Greg Farnum
+ (cherry picked from commit a5184cf46a6e867287e24aeb731634828467cd98)
+
+commit ebe16373e966917ca8cb03ebeac974bdff7b7685
+Merge: c51c8f9 b704f0d
+Author: Sage Weil
+Date: Thu Oct 30 10:05:22 2014 -0700
+
+ Merge pull request #2838 from ceph/wip-9945-giant
+
+ messages: fix COMPAT_VERSION on MClientSession
+
+ Reviewed-by: Sage Weil
+
+commit b704f0dd888aacb10c32cdb63cdbf9f06296fc18
+Author: John Spray
+Date: Thu Oct 30 16:43:21 2014 +0000
+
+ messages: fix COMPAT_VERSION on MClientSession
+
+ This was incorrectly incremented to 2 by omission
+ of an explicit COMPAT_VERSION value.
+
+ Fixes: #9945
+
+ Signed-off-by: John Spray
+ (cherry picked from commit 1eb9bcb1d36014293efc687b4331be8c4d208d8e)
+
+commit a6c02a18d6249ea62cf7a74710c8d0192b6eecaa
+Author: Sage Weil
+Date: Thu Sep 18 14:23:36 2014 -0700
+
+ mon: re-bootstrap if we get probed by a mon that is way ahead
+
+ During bootstrap we verify that our paxos commits overlap with the other
+ mons we will form a quorum with. If they do not, we do a sync.
+
+ However, it is possible we pass those checks, then fail to join a quorum
+ before the quorum moves ahead in time such that we no longer overlap.
+ Currently nothing kicks us back into a probing state to discover we need
+ to sync... we will just keep trying to call or join an election instead.
+
+ Fix this by jumping back to bootstrap if we get a probe that is ahead of
+ us. Only do this from non-probe and non-sync states, as probes in those
+ states will be common; it is only the active and electing states that
+ matter (and probably just electing!).
+
+ Fixes: #9301
+ Backport: giant, firefly
+ Signed-off-by: Sage Weil
+ (cherry picked from commit c421b55e8e15ef04ca8aeb47f7d090375eaa8573)
+
+commit 92d2a38efd458f9e8f4da228ea1e94df08dc8222
+Author: Sage Weil
+Date: Thu Sep 18 14:11:24 2014 -0700
+
+ mon/Paxos: fix off-by-one in last_ vs first_committed check
+
+ peon last_committed + 1 == leader first_committed is okay. Note that the
+ other check (where I clean up whitespace) gets this correct.
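+
+ Schematically (hypothetical names, not the actual Paxos code), the
+ corrected boundary condition is:
+
+ #include <cstdint>
+
+ bool peon_too_far_behind(uint64_t peon_last, uint64_t leader_first) {
+   // peon_last + 1 == leader_first still overlaps and is okay;
+   // the off-by-one treated that case as too far behind
+   return peon_last + 1 < leader_first;
+ }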
+ + Fixes: #9301 (partly) + Signed-off-by: Sage Weil + (cherry picked from commit d81cd7f86695185dce31df76c33c9a02123f0e4a) diff --git a/doc/changelog/v0.87.2.txt b/doc/changelog/v0.87.2.txt new file mode 100644 index 0000000000000..98cf11732a583 --- /dev/null +++ b/doc/changelog/v0.87.2.txt @@ -0,0 +1,1413 @@ +commit 87a7cec9ab11c677de2ab23a7668a77d2f5b955e (tag: refs/tags/v0.87.2, refs/remotes/gh/giant) +Author: Jenkins +Date: Fri Apr 24 12:31:27 2015 -0700 + + 0.87.2 + +commit c1301e84aee0f399db85e2d37818a66147a0ce78 +Merge: 1a13e10 9e9c3c6 +Author: Loic Dachary +Date: Tue Apr 7 21:08:24 2015 +0200 + + Merge pull request #4214 from dachary/wip-10430-giant + + osd/osd_types.cc: 456: FAILED assert(m_seed < old_pg_num) + + Reviewed-by: David Zafman + +commit 1a13e1065829c59987c2f57a13eaa03de31df4ed +Merge: 1fb08d3 5f4e62f +Author: Loic Dachary +Date: Tue Apr 7 16:39:28 2015 +0200 + + Merge pull request #4258 from ceph/wip-10643-v2 + + mon: MDSMonitor: additional backports for #10643 + + Reviewed-by: Abhishek Lekshmanan + Reviewed-by: Loic Dachary + +commit 1fb08d3066b14b178a8912ffb3c9f50d2333738c +Merge: 90b37d9 7684ee2 +Author: Josh Durgin +Date: Thu Apr 2 08:55:48 2015 -0700 + + Merge pull request #4261 from ceph/wip-11303-giant + + allow -L to disable lttng. Enable it by default + + Reviewed-by: Josh Durgin + +commit 7684ee25ac21810153a44bdc4fc00b36e39eb12f +Author: Alfredo Deza +Date: Thu Apr 2 10:34:02 2015 -0400 + + allow -L to disable lttng. Enable it by default + + Signed-off-by: Alfredo Deza + +commit 5f4e62f382767ee69e5b0c701b1a01d9e4132237 +Author: Joao Eduardo Luis +Date: Fri Oct 17 19:08:20 2014 +0100 + + mon: MDSMonitor: wait for osdmon to be writable when requesting proposal + + Otherwise we may end up requesting the osdmon to propose while it is + mid-proposal. We can't simply return EAGAIN to the user either because + then we would have to expect the user to be able to successfully race + with the whole cluster in finding a window in which 'mds fs new' command + would succeed -- which is not a realistic expectation. Having the + command to osdmon()->wait_for_writable() guarantees that the command + will be added to a queue and that we will, eventually, tend to it. + + Fixes: #9794 + + Signed-off-by: Joao Eduardo Luis + (cherry picked from commit 2ae1cba595d9c56a0a4c534b34fe25250e7eb2d5) + +commit 257bd17db6470ca050403b1c8ff8daa94a4b80b5 +Author: Joao Eduardo Luis +Date: Fri Oct 17 18:59:51 2014 +0100 + + mon: MDSMonitor: have management_command() returning int instead of bool + + We can more easily differentiate between errors we get out of this + function, which makes the code a bit more versatile and readable. 
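+
+ For illustration only (a hypothetical sketch, not the actual
+ MDSMonitor code), the bool-to-int pattern this change applies:
+
+ #include <cerrno>
+
+ int management_command_sketch(bool bad_arg, bool not_ready) {
+   if (bad_arg)   return -EINVAL;  // caller reports "invalid argument"
+   if (not_ready) return -EAGAIN;  // caller can retry later
+   return 0;                       // success
+ }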
+
+ Signed-off-by: Joao Eduardo Luis
+ (cherry picked from commit 0dd473cbad4f9ea403fe60badffdc6da4dd3aa3c)
+
+commit 9e9c3c652339d85863af01cac621228f04eb4f18
+Author: David Zafman
+Date: Thu Oct 9 11:20:13 2014 -0700
+
+ osd: Get pgid ancestor from last_map when building past intervals
+
+ Fixed OSD::build_past_intervals_parallel() and PG::generate_past_intervals()
+
+ Fixes: #10430
+
+ Signed-off-by: David Zafman
+ (cherry picked from commit 0c5b66da7a9ba516340d06d9e806beb9d1040d0e)
+
+commit 90b37d9bdcc044e26f978632cd68f19ece82d19a
+Merge: 2ccbc14 9f1f355
+Author: Loic Dachary
+Date: Thu Mar 26 07:58:14 2015 +0100
+
+ Merge pull request #4175 from wonzhq/objecter-timer-2
+
+ Objecter: failed assert(tick_event==NULL) at osdc/Objecter.cc
+
+ Reviewed-by: Loic Dachary
+
+commit 9f1f35546e00e8f1ecbce0697d59b64f3537facf
+Author: Zhiqiang Wang
+Date: Wed Mar 25 16:32:44 2015 +0800
+
+ Objecter: failed assert(tick_event==NULL) at osdc/Objecter.cc
+
+ When the Objecter timer erases the tick_event from its events queue and
+ calls tick() to dispatch it, if the Objecter::rwlock is held by shutdown(),
+ it waits there to get the rwlock. However, inside the shutdown function,
+ it checks the tick_event and tries to cancel it. The cancel_event function
+ returns false since tick_event is already removed from the events queue.
+ Thus tick_event is not set to NULL in shutdown(). Later, the tick function
+ returns early and doesn't set tick_event to NULL either. This leads to the
+ assertion failure.
+
+ This is a regression introduced by an incorrect conflict resolution when
+ d790833 was backported.
+
+ Fixes: #11183
+
+ Signed-off-by: Zhiqiang Wang
+
+commit 2ccbc14d17b54ea4fd4126cb04a7b83cd64c7f1e
+Merge: 02f9cdb de4b087
+Author: Loic Dachary
+Date: Mon Mar 23 20:39:26 2015 +0100
+
+ Merge pull request #4127 from dzafman/wip-11176-giant
+
+ ceph-objectstore-tool: Output only unsupported features when incompatible
+
+ Reviewed-by: Loic Dachary
+
+commit 02f9cdbf889071ca6fe3811d9b9a92a0b630fa55
+Merge: 83bcc51 fc43d8c
+Author: Loic Dachary
+Date: Sun Mar 22 23:11:46 2015 +0100
+
+ Merge pull request #4097 from dachary/wip-10497-giant
+
+ librados: c api does not translate op flag
+
+ Reviewed-by: Abhishek Lekshmanan
+
+commit 83bcc516743e426c7a8c6f6401721bffbbec4fc0
+Merge: ebab2bd d790833
+Author: Loic Dachary
+Date: Sun Mar 22 23:11:26 2015 +0100
+
+ Merge pull request #4096 from dachary/wip-9617-giant
+
+ objecter shutdown races with msg dispatch
+
+ Reviewed-by: Abhishek Lekshmanan
+
+commit ebab2bd5f36205d666673600624aaa3e5e06c405
+Merge: e31c92d 970a797
+Author: Loic Dachary
+Date: Sun Mar 22 23:11:03 2015 +0100
+
+ Merge pull request #4095 from dachary/wip-9675-giant
+
+ splitting a pool doesn't start when rule_id != ruleset_id
+
+ Reviewed-by: Abhishek Lekshmanan
+
+commit e31c92d8dd814d276357e431ed87b93d75933f77
+Merge: f0ec5e3 7653511
+Author: Loic Dachary
+Date: Sun Mar 22 23:10:42 2015 +0100
+
+ Merge pull request #4094 from dachary/wip-9891-giant
+
+ Assertion: os/DBObjectMap.cc: 1214: FAILED assert(0)
+
+ Reviewed-by: Abhishek Lekshmanan
+
+commit f0ec5e3b3ed58af65323bcc494e589935147aa45
+Merge: dd7c15b 13b0147
+Author: Loic Dachary
+Date: Sun Mar 22 23:10:25 2015 +0100
+
+ Merge pull request #4093 from dachary/wip-9915-giant
+
+ osd: eviction logic reversed
+
+ Reviewed-by: Abhishek Lekshmanan
+
+commit dd7c15b2b24027a7cc5fa4bff21222c5a4606e60
+Merge: 33b09e1 13b8364
+Author: Loic Dachary
+Date: Sun Mar 22 23:09:28 2015 +0100
+
+ Merge pull request #4092 from dachary/wip-9985-giant
+
+
osd: incorrect atime calculation + + Reviewed-by: Abhishek Lekshmanan + +commit 33b09e18aa78252d94cbec8bf94ec97ed5bb1573 +Merge: 950123e 5550cdd +Author: Loic Dachary +Date: Sun Mar 22 23:08:41 2015 +0100 + + Merge pull request #4091 from dachary/wip-9986-giant + + objecter: map epoch skipping broken + + Reviewed-by: Abhishek Lekshmanan + +commit 950123e4daa85562b2f52e0e12e0bae07f444095 +Merge: abdc065 21f81b8 +Author: Loic Dachary +Date: Sun Mar 22 23:08:16 2015 +0100 + + Merge pull request #4090 from dachary/wip-10059-giant + + osd/ECBackend.cc: 876: FAILED assert(0) + + Reviewed-by: Abhishek Lekshmanan + +commit abdc065acd6cafa6439e9c1724cc87de02352bd7 +Merge: aee2825 1ccf583 +Author: Loic Dachary +Date: Sun Mar 22 23:07:52 2015 +0100 + + Merge pull request #4089 from dachary/wip-10080-giant + + Pipe::connect() cause osd crash when osd reconnect to its peer + + Reviewed-by: Abhishek Lekshmanan + +commit aee28250fee66bca08d91e56ce47ec46c2e9fc24 +Merge: 6582253 3e875ab +Author: Loic Dachary +Date: Sun Mar 22 23:07:20 2015 +0100 + + Merge pull request #4088 from dachary/wip-6003-giant + + journal Unable to read past sequence 406 ... + + Reviewed-by: Abhishek Lekshmanan + +commit 65822530ebfdea2feb9192c6eb6e3b8b9d60fe33 +Merge: 4b20f2d 96a5c67 +Author: Loic Dachary +Date: Sun Mar 22 23:06:51 2015 +0100 + + Merge pull request #4082 from dachary/wip-10106-giant + + rgw acl response should start with + + Reviewed-by: Abhishek Lekshmanan + +commit 4b20f2d2d1ee52deed33617f000fa342ebce2e49 +Merge: 7ff3a67 c7b02f5 +Author: Loic Dachary +Date: Sun Mar 22 23:06:23 2015 +0100 + + Merge pull request #4078 from dachary/wip-11157-giant + + doc,tests: force checkout of submodules + + Reviewed-by: Abhishek Lekshmanan + +commit 7ff3a67c44ba3dc20a663a7dc6ba28c25714f063 +Merge: 440e706 4d4eb9f +Author: Loic Dachary +Date: Sun Mar 22 23:05:20 2015 +0100 + + Merge pull request #4077 from dachary/wip-10150-giant + + osd/ReplicatedPG.cc: 10853: FAILED assert(r >= 0) (in _scan_range) + + Reviewed-by: Abhishek Lekshmanan + +commit 440e70607e7e3cd1d8ca33843c626109431caf8d +Merge: 66f639b 499d94f +Author: Loic Dachary +Date: Sun Mar 22 23:04:51 2015 +0100 + + Merge pull request #4076 from dachary/wip-10153-giant + + Rados.shutdown() dies with Illegal instruction (core dumped) + + Reviewed-by: Abhishek Lekshmanan + +commit 66f639b30ce6c74caae9397f20053761203f8e87 +Merge: 43b45df b79852f +Author: Loic Dachary +Date: Sun Mar 22 23:04:25 2015 +0100 + + Merge pull request #4074 from dachary/wip-10220-giant + + mon/Paxos.cc: 1033: FAILED assert(mon->is_leader()) + + Reviewed-by: Abhishek Lekshmanan + +commit 43b45dfacd598bab51fa06c5d0e2d0605d6e83d6 +Merge: d282cfd b318e2f +Author: Loic Dachary +Date: Sun Mar 22 23:03:35 2015 +0100 + + Merge pull request #3548 from ceph/wip-10643 + + mon: MDSMonitor: missing backports for giant + + Reviewed-by: Abhishek Lekshmanan + Reviewed-by: Loic Dachary + +commit d282cfd5ae3e65b74801cd27480ce8c0bd72d9a7 +Merge: 3f3b981 681c99f +Author: Loic Dachary +Date: Sun Mar 22 22:12:58 2015 +0100 + + Merge pull request #4053 from dachary/wip-8011-giant + + osd/ReplicatedPG.cc: 5244: FAILED assert(soid < scrubber.start || soid >= scrubber.end) + + Reviewed-by: Abhishek Lekshmanan + +commit 3f3b98123cb46080068c1c73f4be41acbe18bd0d +Merge: 6919eb1 4427358 +Author: Loic Dachary +Date: Sun Mar 22 22:12:40 2015 +0100 + + Merge pull request #4052 from dachary/wip-10844-giant + + mon: caps validation should rely on EntityName instead of entity_name_t + + Reviewed-by: Abhishek Lekshmanan + +commit 
6919eb1684e34a6395963be6cc65215a51f5ba13
+Merge: 8876585 1d4ffbe
+Author: Loic Dachary
+Date: Sun Mar 22 22:12:15 2015 +0100
+
+ Merge pull request #4050 from dachary/wip-10817-giant
+
+ WorkQueue: make timeout when calling WaitInterval configurable
+
+ Reviewed-by: Abhishek Lekshmanan
+
+commit 88765851bd56a6a526f6ab724920c8858ec4956c
+Merge: 6da3171 92c352d
+Author: Loic Dachary
+Date: Sun Mar 22 22:11:43 2015 +0100
+
+ Merge pull request #4049 from dachary/wip-10787-giant
+
+ mon: OSDMonitor::map_cache is buggy, send_incremental is not conservative
+
+ Reviewed-by: Abhishek Lekshmanan
+ Reviewed-by: Sage Weil
+
+commit 6da3171f21b8e1a56fe941a5028f2ccfdccee18a
+Merge: 5a6eefc 25fcaca
+Author: Loic Dachary
+Date: Sun Mar 22 22:09:28 2015 +0100
+
+ Merge pull request #4048 from dachary/wip-10770-giant
+
+ rgw: pending bucket index operations are not cancelled correctly
+
+ Reviewed-by: Abhishek Lekshmanan
+
+commit 5a6eefcb40c94da9a900d9893ecb6eaaf8fd0cea
+Merge: c67a7a5 2858327
+Author: Loic Dachary
+Date: Sun Mar 22 22:09:06 2015 +0100
+
+ Merge pull request #4046 from dachary/wip-10723-giant
+
+ rados python binding leaks Ioctx objects
+
+ Reviewed-by: Abhishek Lekshmanan
+
+commit c67a7a52dc96f176431125921e36e4a2b8a30f1c
+Merge: 41dcd2d d5b1b7e
+Author: Loic Dachary
+Date: Sun Mar 22 22:08:45 2015 +0100
+
+ Merge pull request #4044 from dachary/wip-10617-giant
+
+ osd: pgs for deleted pools don't finish getting removed if osd restarts
+
+ Reviewed-by: Abhishek Lekshmanan
+
+commit 41dcd2d9c307dd0b25d27bd6673943f3fdaaa28b
+Merge: 42e7413 c3d998e
+Author: Loic Dachary
+Date: Sun Mar 22 22:08:20 2015 +0100
+
+ Merge pull request #4034 from dachary/wip-10475-giant
+
+ rgw: Swift API. Support for X-Remove-Container-Meta-{key} header.
+
+ Reviewed-by: Abhishek Lekshmanan
+
+commit 42e741339913990521c4509f4e266921f8dbe007
+Merge: ffb76d1 fa8d454
+Author: Loic Dachary
+Date: Sun Mar 22 22:07:53 2015 +0100
+
+ Merge pull request #4033 from dachary/wip-10471-giant
+
+ rgw: index swift keys appropriately
+
+ Reviewed-by: Abhishek Lekshmanan
+
+commit de4b08704172ac31b511dde50e5c11d58d811ca2
+Author: David Zafman
+Date: Fri Mar 20 16:57:40 2015 -0700
+
+ ceph-objectstore-tool: Output only unsupported features when incompatible
+
+ Fixes: #11176
+ Backport: firefly, giant
+
+ Signed-off-by: David Zafman
+ (cherry picked from commit 5b23f5b5892b36fb7d06efc0d77e64a24ef6e8c9)
+
+commit ffb76d16c5d9d65ac94d21b4e8fc1fdf86441977
+Merge: fea29b1 61d6006
+Author: John Spray
+Date: Thu Mar 19 22:07:58 2015 +0000
+
+ Merge pull request #3971 from ceph/giant-11053
+
+ mds: fix assertion caused by system clock backwards
+
+ Reviewed-by: John Spray
+
+commit c3d998e4c3c9d90b50c3f6c0af7d48785616e032
+Author: Dmytro Iurchenko
+Date: Tue Feb 3 17:54:38 2015 +0200
+
+ rgw: Swift API. Support for X-Remove-Container-Meta-{key} header.
+
+ Fixes: #10475
+ Backport: hammer, firefly
+ Reported-by: Josh Durgin
+ Signed-off-by: Dmytro Iurchenko
+ (cherry picked from commit f67bfa24fd6f69c2fcc0987eba8b6b426dd78320)
+
+ Conflicts:
+ src/rgw/rgw_rest.h
+ trivial merge: prototype of an unrelated function changed
+ src/rgw/rgw_op.cc
+ s/is_object_op/!(s->object == NULL)/
+
+commit fc43d8c2c54ec9e9cb6ef4d19cca695eb2fb3aab
+Author: Matt Richards
+Date: Thu Jan 8 13:16:17 2015 -0800
+
+ librados: Translate operation flags from C APIs
+
+ The operation flags in the public C API are a distinct enum
+ and need to be translated to Ceph OSD flags, as happens in
+ the C++ API.
It seems like the C enum and the C++ enum consciously
+ use the same values, so I reused the C++ translation function.
+
+ Signed-off-by: Matthew Richards
+ (cherry picked from commit 49d114f1fff90e5c0f206725a5eb82c0ba329376)
+
+commit d790833cb84d6f6349146e4f9abdcdffb4db2ee0
+Author: Josh Durgin
+Date: Mon Sep 29 18:17:29 2014 -0700
+
+ Objecter: check the 'initialized' atomic_t safely
+
+ shutdown() resets initialized to 0, but we can still receive messages
+ after this point, so fix message handlers to skip messages in this
+ case instead of asserting.
+
+ Also read initialized while holding Objecter::rwlock to avoid races
+ where e.g. handle_osd_map() checks initialized -> 1, continues,
+ shutdown() is called, sets initialized to 0, then handle_osd_map()
+ goes about its business and calls op_submit(), which would fail the
+ assert(initialized.read()) check. Similar races existed in other
+ message handlers which change Objecter state.
+
+ The Objecter is not destroyed until after its Messenger in
+ the MDS, OSD, and librados, so this should be safe.
+
+ Fixes: #9617
+ Backport: giant
+ Signed-off-by: Josh Durgin
+ (cherry picked from commit e506f896a9217324ab7a7865989f4454562aed5f)
+
+ Conflicts:
+ src/osdc/Objecter.cc
+ context changed: Objecter::tick() did not have
+ assert(initialized.read())
+
+commit ce436a33e0f720ea4b8cf0363bcac1126be3c28b
+Author: Josh Durgin
+Date: Mon Sep 29 18:12:50 2014 -0700
+
+ Objecter: init with a constant of the correct type
+
+ Just a tiny cleanup.
+
+ Signed-off-by: Josh Durgin
+ (cherry picked from commit 1feba200aae7d9a042cda705c3de8fba2fc82331)
+
+commit 970a79753dc52d82d2abf29ffe6f88adac678eb0
+Author: Xiaoxi Chen
+Date: Wed Aug 20 15:35:44 2014 +0800
+
+ CrushWrapper: pick a ruleset same as rule_id
+
+ Originally, in the add_simple_ruleset function, the ruleset_id
+ is not reused but the rule_id is reused. So after some adds/removes
+ of rules, a newly created rule is likely to have ruleset != rule_id.
+
+ We don't want this to happen because we are trying to maintain the
+ constraint that ruleset == rule_id.
+
+ Signed-off-by: Xiaoxi Chen
+ (cherry picked from commit 78e84f34da83abf5a62ae97bb84ab70774b164a6)
+
+commit 76535116823f02f0392226e5725fbfef14c277ba
+Author: Samuel Just
+Date: Fri Feb 20 13:43:46 2015 -0800
+
+ DBObjectMap: lock header_lock on sync()
+
+ Otherwise, we can race with another thread updating state.seq
+ resulting in the old, smaller value getting persisted. If there
+ is a crash at that time, we will reuse a sequence number, resulting
+ in an inconsistent node tree and bug #9891.
+
+ Fixes: 9891
+ Backport: giant, firefly, dumpling
+ Signed-off-by: Samuel Just
+ (cherry picked from commit 2b63dd25fc1c73fa42e52e9ea4ab5a45dd9422a0)
+
+ Conflicts:
+ src/os/DBObjectMap.cc
+ because we have state.v = 1; instead of state.v = 2;
+
+commit 13b01473c5c1a116d6cd6acbbb6cbe08ee6e9433
+Author: Zhiqiang Wang
+Date: Tue Oct 28 09:37:11 2014 +0800
+
+ osd: cache tiering: fix the atime logic of the eviction
+
+ Reported-by: Xinze Chi
+ Signed-off-by: Zhiqiang Wang
+ (cherry picked from commit 622c5ac41707069ef8db92cb67c9185acf125d40)
+
+commit 13b8364dc869b2eefcb6646cff1e18c31126ce7d
+Author: Sage Weil
+Date: Fri Oct 31 19:33:59 2014 -0700
+
+ osd/ReplicatedPG: fix compile error
+
+ From 1fef4c3d541cba360738437420ebfa2447d5802e.
+
+ Signed-off-by: Sage Weil
+ (cherry picked from commit 4a9ad7dc2da6f4fa6a64235776a3f1d2799aef60)
+
+commit 55541b90db0a6d6c26c1fae6a4958fa5a320f82c
+Author: Xinze Chi
+Date: Wed Oct 29 07:11:11 2014 +0000
+
+ Get the current atime of the object in the cache pool for eviction
+
+ If there are multiple atimes in agent_state for the same object, we
+ should use the most recent one.
+
+ Signed-off-by: Xinze Chi
+ (cherry picked from commit 1fef4c3d541cba360738437420ebfa2447d5802e)
+
+commit 5550cdd876db913b152edad17a4de16bded31627
+Author: Ding Dinghua
+Date: Thu Oct 30 14:58:42 2014 +0800
+
+ osdc/Objecter: Fix a bug of dead looping in Objecter::handle_osd_map
+
+ If the current map epoch is less than the oldest epoch, the current
+ map epoch should step up to the oldest epoch.
+
+ Fixes: #9986
+ Signed-off-by: Ding Dinghua
+ (cherry picked from commit e0166a23c2cf655bfb4cf873be021a14d9b9be27)
+
+commit 7c5056f5cf77065e0e5a920f525f0a3be6b4b299
+Author: Ding Dinghua
+Date: Thu Oct 30 14:58:05 2014 +0800
+
+ osdc/Objecter: e shouldn't be zero in Objecter::handle_osd_map
+
+ Signed-off-by: Ding Dinghua
+ (cherry picked from commit 31c584c8ba022cd44fe2872d221f3026618cefab)
+
+commit 21f81b85de652aede51e88b87bdff71f2e411da3
+Author: Samuel Just
+Date: Wed Nov 19 08:20:16 2014 -0800
+
+ PG: always clear_primary_state on new interval, but only clear pg temp if not primary
+
+ Signed-off-by: Samuel Just
+ (cherry picked from commit f692bfe076b8ddb679c6d1a6ea78cc47f0876326)
+
+commit 0c3f7637d8cf2146a4268330d8c0506ad38c354d
+Author: Samuel Just
+Date: Fri Nov 14 15:44:20 2014 -0800
+
+ PG: always clear_primary_state when leaving Primary
+
+ Otherwise, entries from the log collection process might leak into the next
+ epoch, where we might end up choosing a different authoritative log. In this
+ case, it resulted in us not rolling back to log entries on one of the
+ replicas prior to trying to recover the affected object, due to the
+ peer_missing not being cleared.
+
+ Fixes: #10059
+ Backport: giant, firefly, dumpling
+ Signed-off-by: Samuel Just
+ (cherry picked from commit c87bde64dfccb5d6ee2877cc74c66fc064b1bcd7)
+
+commit 1ccf58355d0796172935938572cf68ceb31a6801
+Author: Greg Farnum
+Date: Tue Dec 2 15:17:57 2014 -0800
+
+ SimpleMessenger: allow RESETSESSION whenever we forget an endpoint
+
+ In the past (e229f8451d37913225c49481b2ce2896ca6788a2) we decided to disable
+ reset of lossless Pipes, because lossless peers resetting caused trouble and
+ they can't forget about each other. But they actually can: if mark_down()
+ is called.
+
+ I can't figure out how else we could forget about a remote endpoint, so I
+ think it's okay if we tell them we reset in order to clean up state. That's
+ desirable so that we don't get into strange situations with out-of-whack
+ counters.
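+
+ A schematic of the policy (simplified, hypothetical names; not the
+ actual Pipe code):
+
+ #include <cstdint>
+
+ enum class ConnectReply { RETRY_SESSION, RESET_SESSION };
+
+ ConnectReply handle_connect(bool have_pipe, uint64_t peer_connect_seq) {
+   if (!have_pipe && peer_connect_seq > 0) {
+     // the peer thinks a session exists but we forgot it (mark_down);
+     // telling it to reset cleans up the mismatched counters
+     return ConnectReply::RESET_SESSION;
+   }
+   return ConnectReply::RETRY_SESSION;
+ }
+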
+ + Fixes: #10080 + Backport: giant, firefly, dumpling + + Signed-off-by: Greg Farnum + (cherry picked from commit 8cd1fdd7a778eb84cb4d7161f73bc621cc394261) + +commit 3e875ab108de8d2aa3717f76a3fe48ede286abb7 +Author: Samuel Just +Date: Fri Feb 6 09:52:29 2015 -0800 + + FileJournal: fix journalq population in do_read_entry() + + Fixes: 6003 + Backport: dumpling, firefly, giant + Signed-off-by: Samuel Just + (cherry picked from commit bae1f3eaa09c4747b8bfc6fb5dc673aa6989b695) + + Conflicts: + src/os/FileJournal.cc + because reinterpret_cast was added near two hunks after firefly + +commit 96a5c67121dde0d4d4cd13793bb131414b64cc28 +Author: Yehuda Sadeh +Date: Fri Jan 30 18:42:40 2015 -0800 + + rgw: flush xml header on get acl request + + Fixes: #10106 + Backport: firefly, giant + + dump_start() updates the formatter with the appropriate prefix, however, + we never flushed the formatter. + + Signed-off-by: Yehuda Sadeh + (cherry picked from commit eb45f861343162e018968b8c56693a8c6f5b2cab) + +commit c7b02f5a7347ea8688c5214f85bb9f612925a586 +Author: Loic Dachary +Date: Thu Mar 19 00:32:39 2015 +0100 + + doc,tests: force checkout of submodules + + When updating submodules, always checkout even if the HEAD is the + desired commit hash (update --force) to avoid the following: + + * a directory gmock exists in hammer + * a submodule gmock replaces the directory gmock in master + * checkout master + submodule update : gmock/.git is created + * checkout hammer : the gmock directory still contains the .git from + master because it did not exist at the time and checkout won't + remove untracked directories + * checkout master + submodule update : git rev-parse HEAD is + at the desired commit although the content of the gmock directory + is from hammer + + http://tracker.ceph.com/issues/11157 Fixes: #11157 + + Signed-off-by: Loic Dachary + +commit 4d4eb9faf1871c4469b78a7ee75d527ce5cc67ad +Author: Samuel Just +Date: Thu Dec 11 13:05:54 2014 -0800 + + ReplicatedPG::scan_range: an object can disappear between the list and the attr get + + The first item in the range is often last_backfill, upon which writes + can be occuring. It's trimmed off on the primary side anyway. + + Fixes: 10150 + Backport: dumpling, firefly, giant + Signed-off-by: Samuel Just + (cherry picked from commit dce6f288ad541fe7f0ef8374301cd712dd3bfa39) + +commit 499d94f573e78f6545bd5a12ade6b5ba5a75bbca +Author: Federico Simoncelli +Date: Sat Nov 15 14:14:04 2014 +0000 + + common: do not unlock rwlock on destruction + + According to pthread_rwlock_unlock(3p): + + Results are undefined if the read-write lock rwlock is not held + by the calling thread. + + and: + + https://sourceware.org/bugzilla/show_bug.cgi?id=17561 + + Calling pthread_rwlock_unlock on an rwlock which is not locked + is undefined. + + calling pthread_rwlock_unlock on RWLock destruction could cause + an unknown behavior for two reasons: + + - the lock is acquired by another thread (undefined) + - the lock is not acquired (undefined) + + Moreover since glibc-2.20 calling pthread_rwlock_unlock on a + rwlock that is not locked results in a SIGILL that kills the + application. + + This patch removes the pthread_rwlock_unlock call on destruction + and replaces it with an assertion to check that the RWLock is + not in use. + + Any code that relied on the implicit release is now going to + break the assertion, e.g.: + + { + RWLock l; + l.get(for_write); + } // implicit release, wrong. 
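+
+ One way to express the destruction-time assertion (a sketch; the real
+ class wraps more state and the actual check may differ):
+
+ #include <cassert>
+ #include <pthread.h>
+
+ struct RWLockSketch {
+   pthread_rwlock_t rwlock;
+   RWLockSketch() { pthread_rwlock_init(&rwlock, nullptr); }
+   ~RWLockSketch() {
+     // trywrlock succeeding proves nobody holds the lock; destroying
+     // a held lock (or unlocking an unheld one) is undefined
+     int r = pthread_rwlock_trywrlock(&rwlock);
+     assert(r == 0);
+     pthread_rwlock_unlock(&rwlock);
+     pthread_rwlock_destroy(&rwlock);
+   }
+ };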
+
+ Signed-off-by: Federico Simoncelli
+ (cherry picked from commit cf2104d4d991361c53f6e2fea93b69de10cd654b)
+
+commit b79852f3ab0643bbb3f6b71a56b15e5a4b5fc1f5
+Author: Joao Eduardo Luis
+Date: Wed Dec 10 17:46:35 2014 +0000
+
+ mon: Paxos: reset accept timeout before submitting work to the store
+
+ Otherwise we may trigger the timeout while waiting for the work to be
+ committed to the store -- and it would only take the write taking a bit
+ longer than 10 seconds (the default accept timeout).
+
+ We do wait for the work to be properly committed to the store before
+ extending the lease though.
+
+ Fixes: #10220
+
+ Signed-off-by: Joao Eduardo Luis
+ (cherry picked from commit 18534615f184ba56b441fd1d4242eb06debdfe13)
+
+commit e997c9fed5feb3e877dfe07ffac1327b85d09ea2
+Author: Joao Eduardo Luis
+Date: Tue Dec 9 17:35:47 2014 +0000
+
+ mon: MonitorDBStore: allow randomly injecting random delays on writes
+
+ Adds two new config options:
+
+ mon_inject_transaction_delay_probability : DOUBLE (0.0-1.0, default: 0.0)
+ mon_inject_transaction_delay_max : DOUBLE (seconds, default: 10.0)
+
+ If probability is set to a value greater than 0, just before applying
+ the transaction, the store will decide whether to inject a delay,
+ randomly choosing a value between 0 and the max.
+
+ Signed-off-by: Joao Eduardo Luis
+ (cherry picked from commit beaa04e4119765d5775a6c48fd072dd95c984e3b)
+
+commit 1d4ffbe1d233de555c1ecb0a33eebe2391b29f33
+Author: Samuel Just
+Date: Mon Feb 9 17:41:19 2015 -0800
+
+ ShardedThreadPool: make wait timeout on empty queue configurable
+
+ Fixes: 10818
+ Backport: giant
+ Signed-off-by: Samuel Just
+ (cherry picked from commit 7002f934e6664daa995ca0629c0ea3bae1c6bddf)
+
+commit 292c4339c064968c2aa05eec701fbc2a8d82dab0
+Author: Samuel Just
+Date: Mon Feb 9 17:11:38 2015 -0800
+
+ WorkQueue: make wait timeout on empty queue configurable
+
+ Fixes: 10817
+ Backport: giant, firefly, dumpling
+ Signed-off-by: Samuel Just
+ (cherry picked from commit 5aa6f910843e98a05bfcabe6f29d612cf335edbf)
+
+commit fea29b1bcbd17b3d1f642398ec70dbe258bbc98f
+Author: Samuel Just
+Date: Thu Nov 20 15:15:08 2014 -0800
+
+ PGLog: include rollback_info_trimmed_to in (read|write)_log
+
+ Fixes: #10157
+ Backport: firefly, giant
+ Signed-off-by: Samuel Just
+ (cherry picked from commit 1fe8b846641486cc294fe7e1d2450132c38d2dba)
+
+commit 4427358bb556d902b01df27fd097cc3eefa561da
+Author: Joao Eduardo Luis
+Date: Wed Feb 11 23:36:01 2015 +0000
+
+ mon: MonCap: take EntityName instead when expanding profiles
+
+ entity_name_t is tightly coupled to the messenger, while EntityName is
+ tied to auth. When expanding profiles we want to tie the profile
+ expansion to the entity that was authenticated. Otherwise we may incur
+ weird behavior, such as caps validation failing because a given
+ client messenger inst does not match the auth entity it used.
+
+ e.g., running
+
+ ceph --name osd.0 config-key exists foo daemon-private/osd.X/foo
+
+ has entity_name_t 'client.12345' and EntityName 'osd.0'. Using
+ entity_name_t during profile expansion would not allow the client access
+ to daemon-private/osd.X/foo (client.12345 != osd.X).
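+
+ The distinction, sketched (hypothetical helper, not the actual MonCap
+ code): profile expansion should key off the authenticated EntityName,
+ not the messenger address.
+
+ #include <string>
+
+ struct EntityNameSketch { std::string type, id; };  // e.g. {"osd", "0"}
+
+ std::string daemon_private_prefix(const EntityNameSketch& n) {
+   // an authenticated "osd.0" keeps access to its own key space no
+   // matter which client.XXXX messenger inst it connects from
+   return "daemon-private/" + n.type + "." + n.id + "/";
+ }
+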
+ + Fixes: #10844 + Backport: firefly,giant + + Signed-off-by: Joao Eduardo Luis + (cherry picked from commit 87544f68b88fb3dd17c519de3119a9ad9ab21dfb) + +commit 8902279dd4b52516d59db712bd59e8d9372611a6 +Author: Joao Eduardo Luis +Date: Fri Nov 14 21:03:54 2014 +0000 + + mon: Monitor: stash auth entity name in session + + Backport: giant + + Signed-off-by: Joao Eduardo Luis + (cherry picked from commit ca8e1efc0be9bffcfbdce5593526d257aa498062) + +commit 681c99fe12cfa6318f8cf06f4e825805635bcc07 +Author: Samuel Just +Date: Thu Nov 20 14:27:39 2014 -0800 + + ReplicatedPG: fail a non-blocking flush if the object is being scrubbed + + Fixes: #8011 + Backport: firefly, giant + Signed-off-by: Samuel Just + (cherry picked from commit 9b26de3f3653d38dcdfc5b97874089f19d2a59d7) + +commit 1f58a0adc3b23785fac00083d721b62f6a4c44a1 +Merge: 9fee7ba fe7d4ca +Author: Sage Weil +Date: Tue Mar 17 10:52:01 2015 -0700 + + Merge pull request #4042 from dachary/wip-10546-giant + + ceph time check start round bug in monitor.cc + +commit 9fee7ba9d7343bda29bc5113f8db5cdd9c09d71a +Merge: ca635ce e7af52a +Author: Sage Weil +Date: Tue Mar 17 10:50:26 2015 -0700 + + Merge pull request #4047 from dachary/wip-10762-giant + + mon: osd gets marked down twice + +commit ca635ce3eb21135b08613b97ce0c56977de44351 +Merge: 9094185 5771f57 +Author: Sage Weil +Date: Tue Mar 17 10:49:53 2015 -0700 + + Merge pull request #4041 from dachary/wip-10512-giant + + osd: cancel_flush requeues blocked events after blocking event + +commit 90941850ceb2c080ac72c67fed991c951b13f449 +Merge: bc1b9d2 5fca232 +Author: Sage Weil +Date: Tue Mar 17 10:47:26 2015 -0700 + + Merge pull request #4031 from dachary/wip-10353-giant + + crush: set_choose_tries = 100 for erasure code rulesets + +commit bc1b9d22d29a74e6853834fb0430e8803710bbbf +Merge: bd69cfc 30a1744 +Author: Sage Weil +Date: Tue Mar 17 10:47:08 2015 -0700 + + Merge pull request #4029 from dachary/wip-9910-giant + + msg/Pipe: discard delay queue before incoming queue + + Reviewed-by: Greg Farnum + +commit bd69cfcbd58103843fbc42c4e55d43c2bafe54ff +Merge: 30f7df2 4bd5237 +Author: Sage Weil +Date: Tue Mar 17 10:44:53 2015 -0700 + + Merge pull request #4030 from dachary/wip-10351-giant + + mount.ceph: avoid spurious error message + + Reviewed-by: John Spray + +commit 30f7df2b110f73477354fbfa607b4777317dddfa +Merge: e5a50c3 8307318 +Author: Sage Weil +Date: Tue Mar 17 10:44:00 2015 -0700 + + Merge pull request #4028 from dachary/wip-10259-giant + + osd_types: op_queue_age_hist and fs_perf_stat should be in osd_stat_t::o... 
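+
+    A minimal sketch of the bug class addressed by the pull request above
+    (field names assumed, not the real osd_stat_t): when operator== omits
+    fields, callers comparing old and new stats miss changes to them.
+
+        struct osd_stat_sketch {
+          int kb_used = 0;
+          int op_queue_age_hist = 0;  // leaving this out of operator==
+                                      // would hide queue-age changes
+          friend bool operator==(const osd_stat_sketch& l,
+                                 const osd_stat_sketch& r) {
+            return l.kb_used == r.kb_used &&
+                   l.op_queue_age_hist == r.op_queue_age_hist;
+          }
+        };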
+ +commit e5a50c3ae42c8b9d16a19a13e10fc35b91149e73 +Merge: ba1d55d a5a76f8 +Author: Sage Weil +Date: Tue Mar 17 10:42:10 2015 -0700 + + Merge pull request #4027 from dachary/wip-10257-giant + + mon: PGMonitor: several stats output error fixes + +commit ba1d55de1e92a0143702e9e3fbbb1820db17d6c3 +Merge: 530fce4 640986f +Author: Sage Weil +Date: Tue Mar 17 10:41:56 2015 -0700 + + Merge pull request #3998 from dzafman/wip-10677 + + Fix ceph command manpage to match ceph -h (giant) + + Reviewed-by: Xinxin Shu + +commit 530fce4d7cf51f4a4ac1e70ebe20b2795a30010e +Merge: a2a3dad 8a5a3c7 +Author: Sage Weil +Date: Tue Mar 17 10:40:02 2015 -0700 + + Merge pull request #3921 from sponce/wip-11078-giant + + Fix libstriprados::stat, use strtoll instead of strtol + +commit a2a3dad8ec00434b99d6d538615bf8ff9eada217 +Merge: 0f57188 984df3b +Author: Sage Weil +Date: Tue Mar 17 10:36:28 2015 -0700 + + Merge pull request #3819 from tchaikov/giant-pg-leak-10421 + + osd: fix PG leak in SnapTrimWQ._clear() + +commit 0f5718822793074ffab45124afb3bf3160e92004 +Merge: 776c82c c5c6fcd +Author: Sage Weil +Date: Tue Mar 17 10:35:37 2015 -0700 + + Merge pull request #3771 from ceph/wip-10883-giant + + osd: Fix FileJournal wrap to get header out first + +commit 776c82caac36de14a5f895688f81c88a883fca3e +Merge: 938e036 37f196e +Author: Sage Weil +Date: Tue Mar 17 10:35:29 2015 -0700 + + Merge pull request #3637 from sponce/wip-10758-giant + + Backport of pull request 3633 to giant : Fixed write_full behavior in libradosstriper + +commit 92c352d047c84035478e71828e32f554d9f507fc +Author: Sage Weil +Date: Thu Feb 12 13:49:50 2015 -0800 + + mon/OSDMonitor: do not trust small values in osd epoch cache + + If the epoch cache says the osd has epoch 100 and the osd is asking for + epoch 200+, do not send it 100+. + + Fixes: #10787 + Backport: giant, firefly + Signed-off-by: Sage Weil + (cherry picked from commit a5759e9b97107488a8508f36adf9ca1aba3fae07) + +commit 25fcaca19c76765284787c61584d25efd105a0ca +Author: Yehuda Sadeh +Date: Thu Feb 5 09:33:26 2015 -0800 + + rgw: send appropriate op to cancel bucket index pending operation + + Fixes: #10770 + Backport: firefly, giant + + Reported-by: baijiaruo + Signed-off-by: Yehuda Sadeh + (cherry picked from commit dfee96e3aebcaeef18c721ab73f0460eba69f1c7) + + Conflicts: + src/rgw/rgw_rados.cc + resolved by manual s/ADD/CANCEL/ + +commit e7af52a147cc6e1a578d51193a2d5a425cee8a20 +Author: Sage Weil +Date: Thu Feb 5 03:07:50 2015 -0800 + + mon: ignore osd failures from before up_from + + If the failure was generated for an instance of the OSD prior to when + it came up, ignore it. + + This probably causes a fair bit of unnecessary flapping in the wild... + + Backport: giant, firefly + Fixes: #10762 + Reported-by: Dan van der Ster + Signed-off-by: Sage Weil + (cherry picked from commit 400ac237d35d0d1d53f240fea87e8483c0e2a7f5) + +commit 28583276254ba6121b93688a6c6d8fab6588c864 +Author: Josh Durgin +Date: Mon Feb 9 20:50:23 2015 -0800 + + rados.py: keep reference to python callbacks + + If we don't keep a reference to these, the librados aio calls will + segfault since the python-level callbacks will have been garbage + collected. Passing them to aio_create_completion() does not take a + reference to them. Keep a reference in the python Completion object + associated with the request, since they need the same lifetime. + + This fixes a regression from 60b019f69aa0e39d276c669698c92fc890599f50.
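+
+    A generic lifetime sketch of the same idea (assumed types, in C++
+    rather than the Python binding): a C-style async API keeps only a
+    raw pointer, so the wrapper object must own the callback until the
+    request completes.
+
+        #include <functional>
+
+        struct Completion {
+          // owned reference: keeps the callback alive for the whole
+          // request, mirroring how the python Completion object now
+          // holds its callbacks
+          std::function<void()> on_complete;
+
+          // what a C API would invoke with a void* argument
+          static void trampoline(void* arg) {
+            static_cast<Completion*>(arg)->on_complete();
+          }
+        };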
+ + Fixes: #10775 + Backport: dumpling, firefly, giant + Signed-off-by: Josh Durgin + (cherry picked from commit 36d37aadbbbece28d70e827511f1a473d851463d) + (cherry picked from commit 5f1245e131e33a98572408c8223deed2c7cf7b75) + +commit 1b5fb51a995d5d672a46176240d5887354c12967 +Author: Billy Olsen +Date: Mon Feb 2 16:24:59 2015 -0700 + + Fix memory leak in python rados bindings + + A circular reference was inadvertently created when using the + CFUNCTYPE binding for the asynchronous i/o callbacks. + This commit refactors the usage of the callbacks such that the + Ioctx object does not have a class reference to the callbacks. + + Fixes: #10723 + Backport: giant, firefly, dumpling + Signed-off-by: Billy Olsen + Reviewed-by: Dan Mick + Reviewed-by: Josh Durgin + (cherry picked from commit 60b019f69aa0e39d276c669698c92fc890599f50) + +commit d5b1b7e52242e082820e0cdd76dc59e9aea7f284 +Author: Sage Weil +Date: Fri Jan 23 10:47:44 2015 -0800 + + osd: do not ignore deleted pgs on startup + + These need to get instantiated so that we can complete the removal process. + + Fixes: #10617 + Signed-off-by: Sage Weil + (cherry picked from commit 879fd0c192f5d3c6afd36c2df359806ea95827b8) + +commit fe7d4ca17dacf4419be7bdc50010115cee36517b +Author: Joao Eduardo Luis +Date: Fri Jan 30 11:37:28 2015 +0000 + + mon: Monitor: fix timecheck rounds period + + Fixes: #10546 + Backports: dumpling?, firefly, giant + + Signed-off-by: Joao Eduardo Luis + (cherry picked from commit 2e749599ac6e1060cf553b521761a93fafbf65bb) + +commit 5771f57af731e61bf70f630cf85f5b94dcfd1edb +Author: Sage Weil +Date: Sun Jan 11 17:28:04 2015 -0800 + + osd: requeue blocked op before flush it was blocked on + + If we have request A (say, cache-flush) that blocks things, and then + request B that gets blocked on it, and we have an interval change, then we + need to requeue B first, then A, so that the resulting queue will keep + A before B and preserve the order. + + This was observed on this firefly run: + + ubuntu@teuthology:/a/sage-2015-01-09_21:43:43-rados-firefly-distro-basic-multi/694675 + + Backport: giant, firefly + Fixes: #10512 + Signed-off-by: Sage Weil + (cherry picked from commit 11bdfb4131ecac16d4a364d651c6cf5d1d28c702) + +commit fa8d45423e22f7b916a345024a9f70283fc3097d +Author: Yehuda Sadeh +Date: Wed Jan 7 13:56:14 2015 -0800 + + rgw: index swift keys appropriately + + Fixes: #10471 + Backport: firefly, giant + + We need to index the swift keys by the full uid:subuser when decoding + the json representation, to keep it in line with how we store it when + creating it through other mechanisms. + + Reported-by: hemant burman + Signed-off-by: Yehuda Sadeh + (cherry picked from commit 478629bd2f3f32afbe6e93eaebb8a8fa01af356f) + +commit 5fca232080c0bde50b5c65c60c133bf81311e8d3 +Author: Loic Dachary +Date: Wed Dec 17 16:06:55 2014 +0100 + + crush: set_choose_tries = 100 for erasure code rulesets + + It is common for people to try to map 9 OSDs out of a ceph cluster + with 9 OSDs total. The default tries (50) will frequently lead to bad + mappings for this use case. Changing it to 100 makes no significant CPU + performance difference, as tested manually by running crushtool on one + million mappings.
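+
+    A rough illustration of why the retry bound matters (toy code, not
+    the actual crush mapper): drawing n distinct items out of n total by
+    rehashing collides often, so a small try budget fails mappings.
+
+        #include <cstdlib>
+        #include <set>
+
+        static bool choose_n_distinct(int n, int total, int max_tries) {
+          std::set<int> picked;
+          int tries = 0;
+          while ((int)picked.size() < n && tries++ < max_tries)
+            picked.insert(std::rand() % total);  // collisions burn tries
+          return (int)picked.size() == n;
+        }
+
+        // choose_n_distinct(9, 9, 50) fails noticeably more often than
+        // choose_n_distinct(9, 9, 100), at negligible extra CPU cost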
+ + http://tracker.ceph.com/issues/10353 Fixes: #10353 + + Signed-off-by: Loic Dachary + (cherry picked from commit 2f87ac807f3cc7ac55d9677d2051645bf5396a62) + +commit 4bd523744cefb4a5e6b0d4440b9829b0e19dc012 +Author: Yan, Zheng +Date: Sat Jan 3 15:29:29 2015 +0800 + + mount.ceph: avoid spurious error message + + /etc/mtab in most modern distributions is a symbolic link to + /proc/self/mounts. + + Fixes: #10351 + Signed-off-by: Yan, Zheng + (cherry picked from commit bdd0e3c4bda97fe18487a58dd173a7dff752e1a2) + +commit 30a17441f5030ac5c1227a9672811f88e580b4b2 +Author: Sage Weil +Date: Wed Oct 29 14:45:11 2014 -0700 + + msg/Pipe: discard delay queue before incoming queue + + Shutdown the delayed delivery before the incoming queue in case the + DelayedDelivery thread is busy queuing messages. + + Fixes: #9910 + Signed-off-by: Sage Weil + Reviewed-by: Greg Farnum + (cherry picked from commit f7431cc3c25878057482007beb874c9d4473883e) + +commit 8307318cdf9487dde3231aa4a00fca6079379738 +Author: Samuel Just +Date: Fri Dec 5 15:29:52 2014 -0800 + + osd_types: op_queue_age_hist and fs_perf_stat should be in osd_stat_t::operator== + + Fixes: #10259 + Backport: giant, firefly, dumpling + Signed-off-by: Samuel Just + (cherry picked from commit 1ac17c0a662e6079c2c57edde2b4dc947f547f57) + +commit a5a76f81ae9242a54d6ef150fa0ac32b31e90c25 +Author: Joao Eduardo Luis +Date: Mon Jan 19 18:49:15 2015 +0000 + + mon: PGMonitor: skip zeroed osd stats on get_rule_avail() + + Fixes: #10257 + + Signed-off-by: Joao Eduardo Luis + (cherry picked from commit b311e7c36273efae39aa2602c1f8bd90d39e5975) + +commit 95ec68505ce4a26ea707716791cfb1e46a75ed25 +Author: Joao Eduardo Luis +Date: Fri Jan 16 18:13:05 2015 +0000 + + mon: PGMonitor: available size 0 if no osds on pool's ruleset + + get_rule_avail() may return < 0, which we were using blindly, assuming it + would always return an unsigned value. We would end up with weird + values if the ruleset had no osds. + + Signed-off-by: Joao Eduardo Luis + (cherry picked from commit 8be6a6ab2aa5a000a39c73a98b11a0ab32fffa1c) + +commit 1c480b39acc184018d976beddc55187f86dfff1a +Author: Joao Eduardo Luis +Date: Fri Jan 16 18:12:42 2015 +0000 + + mon: PGMonitor: fix division by zero on stats dump + + Signed-off-by: Joao Eduardo Luis + (cherry picked from commit 50547dc3c00b7556e26b9a44ec68640c5c3a2384) + +commit 640986fdef3543c7469e43853900b63d7b2f6f36 +Author: David Zafman +Date: Fri Mar 13 19:16:47 2015 -0700 + + doc: Fix ceph command manpage to match ceph -h (giant) + + Fixes: #10677 + + Signed-off-by: David Zafman + +commit ec4c8997a67397e3ba5f335e444586f6f1a7864c +Author: David Zafman +Date: Fri Mar 13 16:50:13 2015 -0700 + + doc: Minor fixes to ceph command manpage + + Fixes: #10676 + + Signed-off-by: David Zafman + (cherry picked from commit 7e85722fd4c89715fc2ed79697c82d65d7ebf287) + +commit 15e1c6fb3a0e2e191025f96df33feaf115c1411d +Author: David Zafman +Date: Thu Mar 12 11:39:52 2015 -0700 + + doc: Fix ceph command manpage to match ceph -h (firefly) + + Improve synopsis section + Fixes: #10676 + + Signed-off-by: David Zafman + (cherry picked from commit 9ac488c1eb0e30511079ba05aaf11c79615b3940) + + Conflicts: + + man/ceph.8 (took incoming version) + +commit 17d03e8e8b055ed50b1abb679a0dd1edf4ec3f30 +Author: Nilamdyuti Goswami +Date: Thu Dec 18 17:11:22 2014 +0530 + + doc: Changes format style in ceph to improve readability as html.
+ + Signed-off-by: Nilamdyuti Goswami + (cherry picked from commit 8b796173063ac9af8c21364521fc5ee23d901196) + +commit 61d600687ae6887a2edea0e79d582b1353558a83 +Author: Yan, Zheng +Date: Tue Mar 10 19:55:57 2015 +0800 + + mds: fix assertion caused by system clock backwards + + Fixes: #11053 + Signed-off-by: Yan, Zheng + +commit f1c9c71fd6d566687db05e1d8f15b04a9f793005 +Author: Nilamdyuti Goswami +Date: Sat Dec 13 02:24:41 2014 +0530 + + doc: Adds man page for ceph. + + Signed-off-by: Nilamdyuti Goswami + (cherry picked from commit 76da87a64ca6b3cc0ceeaf63e19a9f440d6f4161) + +commit 8a5a3c7a20517c7482dc9f5f2783a3da54bd4f23 +Author: Dongmao Zhang +Date: Fri Nov 14 18:48:58 2014 +0800 + + Fix libstriprados::stat, use strtoll instead of strtol + + The return value (long int) of strict_strtol is too small for unstriped + object. + + Signed-off-by: Dongmao Zhang + (cherry picked from commit fe6679dca479fc24806d7e57ab0108a516cd6d55) + +commit 5d423a5188c62650eaa39077eb99a84085f5f3e2 +Author: Dongmao Zhang +Date: Wed Dec 10 18:55:28 2014 +0800 + + Fix libstriprados::remove, use strtoll instead of strtol + + Signed-off-by: Dongmao Zhang + (cherry picked from commit 78a15ee4c61fdadccb1921e861748400cc651862) + +commit 938e03630e075af03780da139ae879b5b0377734 +Author: Samuel Just +Date: Mon Feb 2 13:57:00 2015 -0800 + + Objecter::_op_submit_with_budget: add timeout before call + + Objecter::_send_op depends on the ontimeout field being filled in + to avoid 10340 and 9582. + + Fixes: #10340 + Signed-off-by: Samuel Just + (cherry picked from commit cfcfafcb0f33994dbda1efe478ef3ab822ff50d4) + +commit 984df3b865f295ecf77b041a69d1d59384d80671 +Author: Kefu Chai +Date: Tue Feb 10 16:29:45 2015 +0800 + + osd: fix PG leak in SnapTrimWQ._clear() + + Fixes: #10421 + Signed-off-by: Kefu Chai + (cherry picked from commit 01e154d592d6cdbf3f859cf1b4357e803536a6b4) + +commit c5c6fcd6ad7a0317301970bc25419727ee3192fa +Author: David Zafman +Date: Wed Feb 18 16:21:12 2015 -0800 + + osd: Fix FileJournal wrap to get header out first + + Correct and restore assert that was removed + + Caused by f46b1b473fce0322a672b16c7739e569a45054b6 + Fixes: #10883 + Backport: dumpling, firefly, giant + + Signed-off-by: David Zafman + (cherry picked from commit 970bb4901f93575709421b5b25c3eff213de61b8) + +commit 37f196e80906d6b855ca0a7ce7344d1f8f50ae7f +Author: Sebastien Ponce +Date: Thu Feb 5 10:46:37 2015 +0100 + + libradosstriper: fixed write_full when ENOENT + + Fixes: #10758 + write_full was returning ENOENT when the file did not exist, while it should just have created it without complaining. + + Signed-off-by: Sebastien Ponce + (cherry picked from commit 6a91f2bb82035b3c8021a7dc7b23548ce3d61eb6) + +commit b318e2ffc3faedf6b02908429eb4ce79147d3471 (refs/remotes/gh/wip-10643) +Author: Joao Eduardo Luis +Date: Fri Oct 17 18:01:05 2014 +0100 + + mon: MDSMonitor: don't return -EINVAL if function is bool + + Returning -EINVAL from a function that returns bool, with the error code + expected in a variable 'r', can only achieve one thing: if this path is ever + touched, instead of returning an error as it was supposed to, we're + returning 'true' with 'r = 0' and, for no apparent reason, the user will + think everything went smoothly but with no new fs created.
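+
+    A minimal sketch of the bug pattern described above (hypothetical
+    handler, not the MDSMonitor code): in a function returning bool,
+    "return -EINVAL;" converts to true, so the caller sees success.
+
+        #include <cerrno>
+        #include <iostream>
+
+        static bool handle_fs_new(int &r) {
+          bool args_valid = false;   // pretend validation failed
+          if (!args_valid)
+            return -EINVAL;          // nonzero: converts to true!
+          r = 0;
+          return true;
+        }
+
+        int main() {
+          int r = 0;
+          if (handle_fs_new(r))      // taken even though we "errored"
+            std::cout << "ok, r=" << r << std::endl;
+        }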
+ + Signed-off-by: Joao Eduardo Luis + (cherry picked from commit 91b2a2b69b198567d42f45f75b7f7ea559f89150) + +commit a1a7d856f5409280b6345afbbd961bcd8ee8d046 +Author: Joao Eduardo Luis +Date: Fri Oct 17 17:43:33 2014 +0100 + + mon: MDSMonitor: check all conditions are met *before* osdmon proposal + + We should not allow ourselves to request the osdmon to propose before we + know for sure that we meet the required conditions to go through with + our own state change. Even if we still can't guarantee that our + proposal is going to be committed, we shouldn't just change the osdmon's + state just because we can. This way, at least, we make sure that our + checks hold up before doing anything with side-effects. + + Fixes: #10643 + + Signed-off-by: Joao Eduardo Luis + (cherry picked from commit 1d1ae413e9f0aafa314af903bcf6b815f6558082) + +commit 0df7cf1e58bc4d77c69354f2a7160e3d9e6d1fc1 +Author: Joao Eduardo Luis +Date: Fri Oct 17 17:37:03 2014 +0100 + + mon: MDSMonitor: return if fs exists on 'fs new' + + We were just setting return code to -EINVAL, while allowing the logic to + continue regardless. If we are to return error, then we should abort + the operation as well and let the user know it went wrong instead of + continuing as if nothing had happened. + + Signed-off-by: Joao Eduardo Luis + (cherry picked from commit 07b7f101057a80d1356bd20040180e94cd20c211) diff --git a/doc/changelog/v0.94.1.txt b/doc/changelog/v0.94.1.txt new file mode 100644 index 0000000000000..227267ddb6e4e --- /dev/null +++ b/doc/changelog/v0.94.1.txt @@ -0,0 +1,74 @@ +commit e4bfad3a3c51054df7e537a724c8d0bf9be972ff (tag: refs/tags/v0.94.1, refs/remotes/gh/hammer) +Author: Jenkins +Date: Fri Apr 10 12:04:50 2015 -0700 + + 0.94.1 + +commit 733864738fa93979727e480e403293e079bb51e9 +Merge: b5921d5 5ca771a +Author: Loic Dachary +Date: Fri Apr 10 17:51:38 2015 +0200 + + Merge pull request #4328 from dachary/wip-11364-hammer + + v4 bucket feature detection + + Reviewed-by: Sage Weil + +commit 5ca771a7d1df8e78ee503a7063068cf744d5efcc +Author: Sage Weil +Date: Fri Apr 10 08:43:45 2015 -0700 + + crush: fix has_v4_buckets() + + alg, not type! + + This bug made us incorrectly think we were using v4 features when user type + 5 was being used. That's currently 'rack' with recent crush maps, but + was other types for clusters that were created with older versions. This + is clearly problematic as it will lock out non-hammer clients incorrectly, + breaking deployments on upgrade. + + Fixes: #11364 + Backport: hammer + Signed-off-by: Sage Weil + (cherry picked from commit 38b35ab9d17eb84ac178c4cd3ebcf2ec0f66d8b6) + +commit 33e79ab7aa0b5428e8fb82a90eea17d31d363a88 +Author: Sage Weil +Date: Thu Apr 9 17:17:59 2015 -0700 + + crush: fix dump of has_v4_buckets + + Backport: hammer + Signed-off-by: Sage Weil + (cherry picked from commit d6e23413017fb8f5d7f18d74e993ceeedb82d8bc) + +commit b5921d55d16796e12d66ad2c4add7305f9ce2353 +Author: Sage Weil +Date: Thu Apr 9 14:42:34 2015 -0700 + + crush/mapper: fix divide-by-0 in straw2 + + If the item weight is 0 we don't want to divide; instead draw a minimal + value. 
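+
+    A minimal sketch of the guard (fixed-point details elided; not the
+    actual crush/mapper code):
+
+        #include <cstdint>
+
+        static int64_t straw2_draw(int64_t ln, uint32_t weight) {
+          // ln is the (negative, fixed-point) log of a uniform variate
+          if (weight == 0)
+            return INT64_MIN;  // minimal draw: a zero-weight item never wins
+          return ln / weight;  // larger weight pulls the draw toward 0
+        }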
+ + Fixes: #11357 + Reported-by: Yann Dupont + Tested-by: Yann Dupont + Signed-off-by: Sage Weil + (cherry picked from commit 64d1e900ec4f5831972ec80e8d0129604669f5bb) + +commit 93c8f436a4f84ac8088e1a1de82350dd33c68d64 +Author: Guang Yang +Date: Thu Feb 26 08:13:12 2015 +0000 + + osd: fix negative degraded objects during backfilling + + When there are delete requests during backfilling, the reported number of degraded + objects could be negative, as the primary's num_objects is the latest (locally) but + the number for replicas might not reflect the deletions. A simple fix is to ignore + the negative subtracted value. + + Signed-off-by: Guang Yang + (cherry picked from commit 14d7e36d3c978844da73d0e1c8a3a1ec863bac15) diff --git a/doc/changelog/v0.94.2.txt b/doc/changelog/v0.94.2.txt new file mode 100644 index 0000000000000..f8113af7ffeba --- /dev/null +++ b/doc/changelog/v0.94.2.txt @@ -0,0 +1,1563 @@ +commit 5fb85614ca8f354284c713a2f9c610860720bbf3 (tag: refs/tags/v0.94.2, refs/remotes/gh/hammer) +Author: Jenkins +Date: Tue Jun 9 12:32:34 2015 -0700 + + 0.94.2 + +commit d967cecf0a5d7fbf992a0195341cbd893a358264 +Merge: eb69cf7 968573b +Author: Loic Dachary +Date: Fri May 29 09:49:43 2015 +0200 + + Merge pull request #4795 from dachary/wip-11806-hammer + + ceph / ceph-dbg steal ceph-objectstore-tool from ceph-test / ceph-test-dbg + + Reviewed-by: Sage Weil + +commit 968573b8930a7c8485bf53e3a989ce2f7d0a2fff +Author: Loic Dachary +Date: Thu May 28 10:35:51 2015 +0200 + + debian: ceph-dbg steals ceph-objectstore-tool from ceph-test-dbg + + When ceph-objectstore-tool was moved from ceph-test to + ceph by 61cf5da0b51e2d9578c7b4bca85184317e30f4ca, the ceph package in + debian/control was updated accordingly, as recommended by + https://www.debian.org/doc/debian-policy/ch-relationships.html#s-replaces + + The same must be done for the ceph-dbg package because + /usr/lib/debug/usr/bin/ceph-objectstore-tool is no longer in + ceph-test-dbg. + + Although the change was merged May 6th, 2015 + 8f23382064c189b657564d58c3f9d17720e891ed, teuthology jobs were not + always failing because packages were not systematically upgraded during + the installation. The missing dependencies that were responsible for + this upgrade problem were fixed by + f898ec1e4e3472b0202280f09653a769fc62c8d3 on May 18th, 2015 and all + upgrade tests relying on ceph-*-dbg packages started to fail + systematically after this date. + + http://tracker.ceph.com/issues/11546 Fixes: #11546 + + Signed-off-by: Loic Dachary + (cherry picked from commit 6f11fbf41fab10924b1e0e41fcf27864779d4073) + +commit eb69cf758eb25e7ac71e36c754b9b959edb67cee +Merge: 63832d4 344328d +Author: Ken Dreyer +Date: Tue May 26 19:52:59 2015 -0600 + + Merge pull request #4773 from dachary/wip-11733-hammer + + Debian: ceph-test and rest-bench debug packages should require their respective binary packages + + Reviewed-by: Ken Dreyer + +commit 344328de584ac707b59ab857f1f3dd4165adfcf5 +Author: Ken Dreyer +Date: Mon May 18 10:50:58 2015 -0600 + + debian: set rest-bench-dbg ceph-test-dbg dependencies + + Debian's debug packages ought to depend on their respective binary + packages. This was the case for many of our ceph packages, but it was + not the case for ceph-test-dbg or rest-bench-dbg. + + Add the dependencies on the relevant binary packages, pinned to + "= ${binary:Version}" per convention.
+ + http://tracker.ceph.com/issues/11673 Fixes: #11673 + + Signed-off-by: Ken Dreyer + (cherry picked from commit f898ec1e4e3472b0202280f09653a769fc62c8d3) + +commit 63832d4039889b6b704b88b86eaba4aadcfceb2e +Merge: 195884e 293affe +Author: Loic Dachary +Date: Mon May 18 14:26:16 2015 +0200 + + Merge pull request #4696 from ceph/wip-11622-hammer + + Wip 11622 hammer + + Reviewed-by: Loic Dachary + +commit 195884e21760f4948f7d1df8b65788514f918054 +Merge: b69fb89 95818da +Author: Sage Weil +Date: Fri May 15 13:47:04 2015 -0700 + + Merge pull request #4649 from ceph/wip-hammer-package-perf-objectstore + + packaging: include ceph_perf_objectstore + +commit 293affe992118ed6e04f685030b2d83a794ca624 (refs/remotes/gh/wip-11622-hammer) +Author: Yehuda Sadeh +Date: Wed May 13 17:05:22 2015 -0700 + + rgw: merge manifests correctly when there's prefix override + + Fixes: #11622 + Backport: hammer, firefly + + Prefix override happens in a manifest when a rados object does not + conform to the generic prefix set on the manifest. When merging + manifests (specifically being used in multipart objects upload), we need + to check if the rule that we try to merge has a prefix that is the same + as the previous rule. Beforehand we checked if both had the same + override_prefix setting, but that might not apply as both manifests + might have different prefixes. + + Signed-off-by: Yehuda Sadeh + (cherry picked from commit 389ae6739ddc6239a4dd7c5f7f9bfc9b645b8577) + +commit a43d24861089a02f3b42061e482e05016a0021f6 +Author: Yehuda Sadeh +Date: Tue May 12 16:40:10 2015 -0700 + + rgw: restore buffer of multipart upload after EEXIST + + Fixes: #11604 + Backport: hammer, firefly + + When we need to restart a write of part data, we need to revert the + buffer to before the write, otherwise we're going to skip some data. + + Signed-off-by: Yehuda Sadeh + (cherry picked from commit 580ccaec12daae64c38a1616d0be907bdd70a888) + +commit 95818dac1522c218662ec12bd42c470d8394b3b9 +Author: Ken Dreyer +Date: Fri Mar 13 16:08:35 2015 -0600 + + packaging: include ceph_perf_objectstore + + The /usr/bin/ceph_perf_objectstore file is installed by default. Prior + to this commit it was missing from the packaging. This caused the RPM to + fail to build in mock. + + Add ceph_perf_objectstore to the "ceph-test" RPM and Debian package. + + If we end up developing further ceph_perf_* utilities, it would make + sense to glob them all with a wildcard, similar to what we are doing + with all the ceph_test_* utilities in ceph-test.
+ + Signed-off-by: Ken Dreyer + (cherry picked from commit 338b44bc74bc4597939c5c58f2a6f2cb08de7d9c) + +commit b69fb89122d6f989152a29124cc7ed54b5e4d43b +Merge: 0f02512 efbfe6f +Author: Yehuda Sadeh +Date: Mon May 11 10:27:02 2015 -0700 + + Merge pull request #4568 from dachary/wip-10662-10663-hammer + + RGW swift API: Response header of COPY request for object does not + + contain x-copied-from, custom metadata, x-copied-from-last-modified, X-Copied-From-Account headers + + Reviewed-by: Yehuda Sadeh + +commit 0f025122fe574b99504a630e3d489c3449cbbe46 +Merge: 7df3eb5 6e2dd40 +Author: Loic Dachary +Date: Mon May 11 14:08:53 2015 +0200 + + Merge pull request #4629 from ceph/hammer-uclient-checking + + Hammer uclient checking + + Reviewed-by: Loic Dachary + +commit 6e2dd408be95644ee5bceb556a90483f882fe51c +Author: John Spray +Date: Thu May 7 18:42:01 2015 +0100 + + client: fix error handling in check_pool_perm + + Previously, on an error such as a pool not existing, + the caller doing the check would error out, but + anyone waiting on waiting_for_pool_perm would + block indefinitely (symptom was that reads on a + file with a bogus layout would block forever). + + Fix by triggering the wait list on errors and + clear the CHECKING state so that the other callers + also perform the check and find the error. + + Additionally, don't return the RADOS error code + up to filesystem users, because it can be + misleading. For example, nonexistent pool is + ENOENT, but we shouldn't give ENOENT on IO + to a file which does exist, we should give EIO. + + Signed-off-by: John Spray + (cherry picked from commit e08cf25cafef5752877439c18cc584b0a75eca08) + Reviewed-by: Greg Farnum + +commit 3c2e6ae97d6129cb8f5befb3e7bf4be16373f6a5 +Author: John Spray +Date: Thu May 7 18:17:37 2015 +0100 + + client: use SaferCond in check_pool_perm + + Just because it's easier to read. 
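+
+    A generic sketch of the waiter handling in the check_pool_perm fix
+    above (assumed names, std:: primitives instead of Ceph's): on error,
+    clear the CHECKING state and wake all waiters so they redo the check
+    and observe the error themselves.
+
+        #include <cerrno>
+        #include <condition_variable>
+        #include <mutex>
+
+        std::mutex m;
+        std::condition_variable cv;
+        bool checking = false;
+
+        int check_pool_perm_sketch(bool fail) {
+          std::unique_lock<std::mutex> l(m);
+          while (checking)
+            cv.wait(l);      // would block forever if errors didn't notify
+          checking = true;
+          // ... perform the (possibly failing) permission check ...
+          checking = false;  // clear state on error and success alike
+          cv.notify_all();   // wake waiters on error and success alike
+          return fail ? -EIO : 0;
+        }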
+ + Signed-off-by: John Spray + (cherry picked from commit 289ee3b80ccce6bab2966f513a37332280d04a06) + Reviewed-by: Greg Farnum + +commit 79b2ac215187402a99594424944db4169f2b2cdf +Author: Yan, Zheng +Date: Fri Apr 24 15:23:21 2015 +0800 + + client: check OSD caps before read/write + + Signed-off-by: Yan, Zheng + (cherry picked from commit 3c4028ec21e3ef9e8801c4570420c88722651cc7) + Reviewed-by: Greg Farnum + +commit 7df3eb5e548f7b95ec53d3b9d0e43a863d6fe682 +Merge: 6a7fa83 2f86995 +Author: Yehuda Sadeh +Date: Fri May 8 11:07:07 2015 -0700 + + Merge pull request #4567 from dachary/wip-10938-hammer + + RGW Swift API: response for GET/HEAD on container does not contain the X-Timestamp header + + Reviewed-by: Yehuda Sadeh + +commit 6a7fa83b3e72b85f92d003a5bbb03a301354a657 +Merge: 553f0db 3edb196 +Author: Yehuda Sadeh +Date: Fri May 8 11:05:59 2015 -0700 + + Merge pull request #4570 from dachary/wip-10243-hammer + + civetweb is hitting a limit (number of threads 1024) + +commit 553f0db9a1fcff2601a8791af1d2bb6975d2821d +Merge: 3fe1f2b 3aef0f2 +Author: Yehuda Sadeh +Date: Fri May 8 11:05:41 2015 -0700 + + Merge pull request #4573 from dachary/wip-11125-hammer + + rgw: keystone token cache does not work correctly + +commit 3fe1f2b8ab3d0d3943a312e90f6a3de99c36beb4 +Merge: e0ed459 4d1f3f0 +Author: Yehuda Sadeh +Date: Fri May 8 11:05:22 2015 -0700 + + Merge pull request #4574 from dachary/wip-11160-hammer + + rgw: shouldn't need to disable rgw_socket_path if frontend is configured + +commit e0ed459442b1e9053e29e345cd0f30d1b4b4b994 +Merge: d6de3fa d2043a5 +Author: Yehuda Sadeh +Date: Fri May 8 11:04:36 2015 -0700 + + Merge pull request #4575 from dachary/wip-10650-hammer + + Response header of swift API PUT /container/object returned by RGW + + does not contain last-modified, content-length, x-trans-id headers. But Swift returns these headers. 
+ + Reviewed-by: Yehuda Sadeh + +commit d6de3fa0b55918bc2ac2d65ee8308f04d3605dfd +Merge: 96dc624 2cb5d60 +Author: Yehuda Sadeh +Date: Fri May 8 11:03:37 2015 -0700 + + Merge pull request #4576 from dachary/wip-10661-hammer + + RGW swift API: Response header of POST request for object does not contain content-length and x-trans-id headers + + Reviewed-by: Yehuda Sadeh + +commit 96dc624ee1f593f817055d3426054ef2e05cbf92 +Merge: ae61aee f4a0dab +Author: Yehuda Sadeh +Date: Fri May 8 10:59:15 2015 -0700 + + Merge pull request #4579 from dachary/wip-11036-hammer + + RGW Swift API: response for PUT on /container does not contain the mandatory Content-Length header when FCGI is used + + Reviewed-by: Yehuda Sadeh + +commit ae61aee99bee9185af22424ec8019e0308828bf5 +Merge: 593d07f 7f2a9ed +Author: Yehuda Sadeh +Date: Fri May 8 10:57:47 2015 -0700 + + Merge pull request #4580 from dachary/wip-11088-hammer + + RGW Swift API: wrong handling of empty metadata on Swift container + + Reviewed-by: Yehuda Sadeh + +commit 593d07f29df8584629936051be4458b00f8a8f1f +Merge: 4f2b41c d164d80 +Author: Yehuda Sadeh +Date: Fri May 8 10:57:12 2015 -0700 + + Merge pull request #4581 from dachary/wip-11323-hammer + + rgw: quota not respected in POST object + + Reviewed-by: Yehuda Sadeh + +commit 4f2b41c4c803786c49cd2d2806d82e9223ab96a9 +Merge: 92e7a7f 893ffd3 +Author: Josh Durgin +Date: Fri May 8 09:15:41 2015 -0700 + + Merge pull request #4566 from dachary/wip-11478-hammer + + Queued AIO reference counters not properly updated + + Reviewed-by: Josh Durgin + +commit 92e7a7f057dd9aabb4f66965c412135d05f6812f +Merge: 2fbf171 0944051 +Author: Josh Durgin +Date: Fri May 8 09:15:19 2015 -0700 + + Merge pull request #4564 from dachary/wip-11369-hammer + + Periodic failure of TestLibRBD.DiffIterateStress + + Reviewed-by: Josh Durgin + +commit 2fbf1712d1e190774ada0af5094134369effb3ac +Merge: 9d97946 02a3813 +Author: Loic Dachary +Date: Fri May 8 17:32:35 2015 +0200 + + Merge pull request #4585 from ceph/wip-11370-hammer + + A retransmit of proxied flatten request can result in -EINVAL + + Reviewed-by: Loic Dachary + +commit 9d97946a8908e48849355a410415e09914ef3948 +Merge: fb10594 c548d8d +Author: Kefu Chai +Date: Fri May 8 20:39:29 2015 +0800 + + Merge pull request #4618 from dachary/wip-11398-kill-daemons-hammer + + tests: ceph-helpers kill_daemons fails when kill fails + + Reviewed-by: Kefu Chai + +commit c548d8d44baae78b868391e3c6fb7294f024b082 +Author: Loic Dachary +Date: Wed May 6 20:14:37 2015 +0200 + + tests: ceph-helpers kill_daemons fails when kill fails + + Instead of silently leaving the daemons running, it returns failure so + the caller can decide what to do with this situation. The timeout is + also extended to minutes instead of seconds to gracefully handle the + rare situations when a machine is extra slow for some reason. 
+ + http://tracker.ceph.com/issues/11398 Fixes: #11398 + + Signed-off-by: Loic Dachary + (cherry picked from commit 0e26e9f72bc14417266b87ac1159607e1b109303) + +commit fb10594f2ab2427a2bf1d2f2b164a3a0928e3335 +Author: Yuri Weinstein +Date: Thu May 7 17:12:35 2015 -0700 + + Added a "ceph hello world" for a simple check for ceph-deploy qa suite + + Signed-off-by: Yuri Weinstein + (cherry picked from commit 13abae186357f4e9bb40990a7a212f93ec2e1e79) + + Signed-off-by: Yuri Weinstein + +commit 6cfae7f074462498e82cfeeddbc2fe8d302c7aa0 +Merge: 736cdf4 8d9f4d4 +Author: Sage Weil +Date: Thu May 7 13:29:57 2015 -0700 + + Merge pull request #4502 from dachary/wip-11026-hammer + + objecter: can get stuck in redirect loop if osdmap epoch == last_force_op_resend + + Reviewed-by: Sage Weil + +commit 736cdf450caa0b22cbfc54f4497717cf63d5bda7 +Merge: 3bd8e4f 46a4e8a +Author: Loic Dachary +Date: Thu May 7 21:45:53 2015 +0200 + + Merge pull request #4562 from dachary/wip-11376-hammer + + ceph-objectstore-tool should be in the ceph server package + + + + Reviewed-by: Ken Dreyer + +commit 46a4e8af5be54d8348a920c4a3a58e24dbf35988 +Author: Ken Dreyer +Date: Thu Apr 30 15:53:22 2015 -0600 + + packaging: mv ceph-objectstore-tool to main ceph pkg + + This change ensures that the ceph-objectstore-tool utility is present on + all OSDs. This makes it easier for users to run this tool to do manual + debugging/recovery in some scenarios. + + http://tracker.ceph.com/issues/11376 Refs: #11376 + + Signed-off-by: Ken Dreyer + (cherry picked from commit 61cf5da0b51e2d9578c7b4bca85184317e30f4ca) + Conflicts: + debian/control + because file layout changes from ceph-test and ceph << 0.94.1-46 + +commit 3bd8e4fa17d8acfd645b8a553bf58de48d59d648 +Merge: 76f6db2 6a04b55 +Author: Loic Dachary +Date: Thu May 7 19:05:48 2015 +0200 + + Merge pull request #4561 from dachary/wip-11143-hammer + + src/ceph-disk : disk zap sgdisk invocation + + Reviewed-by: Loic Dachary + +commit 76f6db24e09a4bfc55cbfb7075104f20653263cc +Merge: e5a20f8 8996907 +Author: Loic Dachary +Date: Thu May 7 19:03:53 2015 +0200 + + Merge pull request #4560 from dachary/wip-11507-hammer + + object creation by write cannot use an offset on an erasure coded pool + + Reviewed-by: Samuel Just + +commit e5a20f83ed6d8d3d09827343d757318026f6a690 +Merge: e7671a5 8a6e6e4 +Author: Loic Dachary +Date: Thu May 7 19:03:16 2015 +0200 + + Merge pull request #4559 from dachary/wip-11429-hammer + + OSD::load_pgs: we need to handle the case where an upgrade from earlier versions which ignored non-existent pgs resurrects a pg with a prehistoric osdmap + + Reviewed-by: Samuel Just + +commit e7671a58939f6e2e37f9f6df0f039f485ad4ca16 +Merge: 126a372 113f3b1 +Author: Loic Dachary +Date: Thu May 7 18:48:56 2015 +0200 + + Merge pull request #4563 from tchaikov/wip-hammer-11534 + + mon: Total size of OSDs is a magnitude less than it is supposed to be. + + Reviewed-by: Loic Dachary + +commit 126a37212550a4a59beaa80e0579098198f74db5 +Merge: 58b30d5 c87aa11 +Author: Loic Dachary +Date: Thu May 7 18:18:17 2015 +0200 + + Merge pull request #4577 from dachary/wip-10666-hammer + + RGW swift API: Response header of GET request for container does not contain X-Container-Object-Count, X-Container-Bytes-Used and x-trans-id headers + + Reviewed-by: Dmytro Iurchenko + Reviewed-by: Yehuda Sadeh + +commit f4a0dabfe8debc7b54afa5da179d51db891f5bc0 +Author: Radoslaw Zarzynski +Date: Sun Mar 8 22:45:34 2015 +0100 + + rgw: send Content-Length in response for HEAD on Swift account.
+ + Backport: hammer + Signed-off-by: Radoslaw Zarzynski + (cherry picked from commit 29073d84640b61d0221b2d2ab73c271d60fd13ba) + +commit 32f4a7439d1e0ce6aab1c1e39407b14e124d03bf +Author: Radoslaw Zarzynski +Date: Fri Mar 6 13:18:19 2015 +0100 + + rgw: send Content-Length in response for DELETE on Swift container. + + Backport: hammer + Signed-off-by: Radoslaw Zarzynski + (cherry picked from commit d260a93405a7a34a899f716bd9c4e731baa9ffd0) + +commit d39a66007fa7211c00a67f9cd898e55551f5ae62 +Author: Radoslaw Zarzynski +Date: Fri Mar 6 13:14:26 2015 +0100 + + rgw: send Content-Length in response for PUT on Swift container. + + Fixes: #11036 + Backport: hammer + Signed-off-by: Radoslaw Zarzynski + (cherry picked from commit 10c1f1aa1e09df5edf5d2f9f9b0273ddbcb384fa) + +commit 02a3813a6a4f9c8ce14f64fc7f378030e7ea6f93 +Author: Jason Dillaman +Date: Mon Apr 27 01:03:52 2015 -0400 + + librbd: flatten should return -EROFS if image is read-only + + Fixes: #11475 + Signed-off-by: Jason Dillaman + (cherry picked from commit f141e02ab719df830648318f4c1d9ca286071ed3) + +commit af8939be80310e234745fe81e67244ab52c6add5 +Author: Jason Dillaman +Date: Mon Apr 27 01:00:38 2015 -0400 + + librbd: allow snapshots to be created when snapshot is active + + The librbd API previously permitted the creation of snapshots while + the image context was associated with another snapshot. A recent code + cleanup broke that ability, so this re-introduces it. The code change + also allows minor cleanup with rebuild_object_map. + + Fixes: #11475 + Signed-off-by: Jason Dillaman + (cherry picked from commit 594a6610802f2cadb62200815bd8b9860809e759) + + Conflicts: + src/librbd/internal.cc + +commit d21c0c00d2bed282677d2063a3fb6f5346641286 +Author: Jason Dillaman +Date: Tue Apr 21 12:59:33 2015 -0400 + + librbd: better handling for duplicate flatten requests + + A proxied flatten request could be replayed, resulting in a + -EINVAL error code being generated on the second attempt. Filter + out that error if it is known the parent did exist before the + op started. + + Fixes: #11370 + Backport: hammer + Signed-off-by: Jason Dillaman + (cherry picked from commit ef7e210c3f747bc4c3c8768c7b6407cc91c5c319) + +commit ec0bd1dea526e04333d8059421666dcd2a59044e +Author: Jason Dillaman +Date: Wed Mar 18 11:51:47 2015 -0400 + + librbd: use generic helper for issuing async requests + + resize, flatten, and rebuild object map now use the same + bootstrap code for sending the request to the remote lock owner + or executing the request locally. + + Signed-off-by: Jason Dillaman + (cherry picked from commit 18fd6ca7f59d5545f0bb0b0e899d0739639ce104) + + Conflicts: + src/librbd/internal.cc + +commit 8a6e6e4c107b03563b2e38aa24cc2067ce6a7350 +Author: Samuel Just +Date: Mon Apr 20 23:45:57 2015 -0700 + + OSD: handle the case where we resurrected an old, deleted pg + + Prior to giant, we would skip pgs in load_pgs which were not present in + the current osdmap. Those pgs would eventually refer to very old + osdmaps, which we no longer have, causing the assertion failure in 11429 + once the osd is finally upgraded to a version which does not skip the + pgs. Instead, if we do not have the map for the pg epoch, complain to + the osd log and skip the pg. + + Fixes: #11429 + Signed-off-by: Samuel Just + (cherry picked from commit fbfd50de5b9b40d71d2e768418a8eca28b1afaca) + +commit efbfe6fced72d07309ccf1f1a219c037b7f535fa +Author: Radoslaw Zarzynski +Date: Sat Mar 21 15:53:08 2015 +0100 + + rgw: improve metadata handling on copy operation of Swift API.
+ + Fixes: #10645 + Backport: hammer + Signed-off-by: Radoslaw Zarzynski + (cherry picked from commit a00cb31cc52e91bfacdd15e0af60be74e66c1996) + +commit d164d8004feddb3e2238b26e3360a905e209d117 +Author: 9seconds +Date: Wed Apr 1 09:12:06 2015 +0300 + + rgw: quota not respected in POST object + + Signed-off-by: Sergey Arkhipov + Backport: hammer, firefly + Fixes: #11323 + (cherry picked from commit e76f84e179d2ba8bfc0dc5abf4e620fef14bc8a0) + + Conflicts: + src/rgw/rgw_op.cc + discard the whitespace modification hunks that were creating + conflicts and ignore the conflict due to an unrelated cast + modification in the context + +commit 7f2a9ed7a986145d4b34517a1a1bb44799ebf621 +Author: Radoslaw Zarzynski +Date: Tue Mar 10 19:36:59 2015 +0100 + + rgw: fix handling empty metadata items on Swift container. + + Fixes: #11088 + Backport: hammer + Signed-off-by: Radoslaw Zarzynski + (cherry picked from commit 1660d8625212f7b5fb30568ed640f467297e36cb) + +commit 8e6efdbcb0f820b2ab3728662efbfb4bc45495af +Author: Radoslaw Zarzynski +Date: Fri Feb 27 15:23:16 2015 +0100 + + rgw: send Content-Length in response for GET on Swift container. + + Fixes: #10971 + Backport: hammer + Signed-off-by: Radoslaw Zarzynski + (cherry picked from commit 5a64fb5430b6b42174bf53a5910d9435043c1380) + +commit 54b62904a4cc3913be23803734fa68741a3c33cc +Author: Radoslaw Zarzynski +Date: Fri Mar 6 12:42:52 2015 +0100 + + rgw: enable end_header() to handle proposal of Content-Length. + + Backport: hammer + Signed-off-by: Radoslaw Zarzynski + (cherry picked from commit f98fd4dfda7a105d621f99c2b48eb9ab9b45d979) + +commit c87aa110bd224ae3220c5486fbd2486ebdfb8b8e +Author: Dmytro Iurchenko +Date: Mon Feb 16 18:47:59 2015 +0200 + + rgw: Swift API. Complement the response to "show container details" + + OpenStack Object Storage API v1 states that X-Container-Object-Count, X-Container-Bytes-Used and user-defined metadata headers should be included in a response. + + Fixes: #10666 + Backport: hammer + Reported-by: Ahmad Faheem + Reviewed-by: Yehuda Sadeh + Signed-off-by: Dmytro Iurchenko + (cherry picked from commit d3a3d5ae5859cd7f2eed307e8f942f9a9fd75f35) + +commit 2cb5d600699085fe0e996b91cf85603be3da230e +Author: Radoslaw Zarzynski +Date: Thu Feb 26 19:21:03 2015 +0100 + + rgw: enforce Content-Length in response for POST on Swift cont/obj. + + Fixes: #10661 + Backport: hammer + Signed-off-by: Radoslaw Zarzynski + (cherry picked from commit fd0c612cf867d2e99e74820130357e63305970fb) + +commit d2043a5f3b8d7a3ce50c4e84aa88481c8912ae25 +Author: Radoslaw Zarzynski +Date: Fri Mar 13 21:41:45 2015 +0100 + + rgw: send Last-Modified header in response for PUT on Swift object. + + Fixes: #10650 + Backport: hammer + Signed-off-by: Radoslaw Zarzynski + (cherry picked from commit 9a22acc57684534a245f25848c23d1db21f16653) + +commit 4d1f3f03ef8809693c8ea2bcbc4d24fd41ae1842 +Author: Yehuda Sadeh +Date: Wed Mar 18 20:49:13 2015 -0700 + + rgw: don't use rgw_socket_path if frontend is configured + + Fixes: #11160 + Backport: hammer, firefly + + Previously if we wanted to use the tcp fcgi socket, we needed to clear + rgw_socket_path.
+ + Signed-off-by: Yehuda Sadeh + (cherry picked from commit 36d6eea3caa79fcb4e08bdd340ccda2474b9e5ea) + +commit 3aef0f2bb6f88bb17c460a3cef0d3503550f716c +Author: Yehuda Sadeh +Date: Tue Mar 3 11:03:35 2015 -0800 + + rgw: update keystone cache with token info + + Fixes: #11125 + Backport: hammer, firefly + + Signed-off-by: Yehuda Sadeh + (cherry picked from commit 6616294aa140ceb83cc61c6ab6f9947636f5e67d) + +commit 3edb1964fe9e8574aafcb758d170007f0e43a324 +Author: Yehuda Sadeh +Date: Tue Apr 21 11:08:45 2015 -0700 + + civetweb: update max num of threads + + Fixes: #10243 + cherry-picked upstream fix into submodule + + Signed-off-by: Yehuda Sadeh + (cherry picked from commit 7a432f7bdfbbc51518fb63d7f2ecab401e8a8d4f) + +commit bc6eb8d5f0a66aec3fbda0b794d008a3157a8154 +Author: Radoslaw Zarzynski +Date: Tue Apr 7 14:09:57 2015 +0200 + + rgw: improve code formatting ONLY. + + Signed-off-by: Radoslaw Zarzynski + (cherry picked from commit 214c8b32b1b04885005e929a7ed2d4354b3ea20b) + +commit 7aa1ae60cea17e0bd140c0cf2313d82f2f64554f +Author: Radoslaw Zarzynski +Date: Tue Mar 17 14:07:34 2015 +0100 + + rgw: send X-Copied-From-Last-Modified header of Swift API. + + Fixes: #10663 + Backport: hammer + Signed-off-by: Radoslaw Zarzynski + (cherry picked from commit e7724a1d8c0872362c19f578fe30ac2cf3dada90) + +commit 150b9e2b85a72dc247da4ba1ab770e6af053acb7 +Author: Radoslaw Zarzynski +Date: Tue Mar 17 14:06:37 2015 +0100 + + rgw: dump object metadata in response for COPY request of Swift API. + + Fixes: #10663 + Backport: hammer + Signed-off-by: Radoslaw Zarzynski + (cherry picked from commit c52b75edeaeef19471b9aca772bf08055bf04031) + +commit e749701be5368a22cad1630f8202e48f5d980409 +Author: Radoslaw Zarzynski +Date: Fri Mar 13 17:59:01 2015 +0100 + + rgw: refactor dumping metadata of Swift objects. + + Backport: hammer + Signed-off-by: Radoslaw Zarzynski + (cherry picked from commit ccf6eaac6f7068289c4a4ffd3f0481d497ba7c87) + +commit b034511fa79996415640b4aca3e8747340f2a127 +Author: Radoslaw Zarzynski +Date: Fri Mar 13 18:37:39 2015 +0100 + + rgw: add support for X-Copied-From{-Account} headers of Swift API. + + Fixes: #10663 + Backport: hammer + Signed-off-by: Radoslaw Zarzynski + (cherry picked from commit 94f1375ccb9df02cdd8f6828153ae2e3a7ad36b1) + +commit c9e6a0bf8c601c4fd9065c1f3a8ea445bd652a52 +Author: Radoslaw Zarzynski +Date: Wed Mar 11 14:08:03 2015 +0100 + + rgw: send ETag, Last-Modified in response for copying Swift cobject. + + Fixes: #11087 + Backport: hammer + Signed-off-by: Radoslaw Zarzynski + (cherry picked from commit 385fe4b4bbbd4a9aab92abf2a813090deeaa037e) + +commit 7f41ff0a6577b0784c6719b9d705f32921b1c40e +Author: Dmytro Iurchenko +Date: Fri Feb 20 18:31:03 2015 +0200 + + rgw: Swift API. Allows setting attributes with COPY object operation. + + http://developer.openstack.org/api-ref-objectstorage-v1.html says: "With COPY, you can add additional metadata to the object." + + Fixes: #10662 + Backport: hammer + Reported-by: Ahmad Faheem + Signed-off-by: Dmytro Iurchenko + (cherry picked from commit 1b722bbcd691e0a4a39ea77cd28e309fd723ec88) + +commit 2f869959ffe1adbcfef7d26ae2d022d23d982673 +Author: Radoslaw Zarzynski +Date: Wed Feb 25 16:12:58 2015 +0100 + + rgw: improve format of X-Timestamp on Swift objects.
+ + Backport: hammer + Signed-off-by: Radoslaw Zarzynski + (cherry picked from commit e54fef9542bc2b5db192308728a99df139a4b6cf) + +commit 48b19810a9860f6fccbf8d9b8a2fadfb37f598dd +Author: Radoslaw Zarzynski +Date: Wed Feb 25 14:11:40 2015 +0100 + + rgw: add support for X-Timestamp on Swift containers. + + Fixes: #10938 + Backport: hammer + Signed-off-by: Radoslaw Zarzynski + (cherry picked from commit b7e9bf6b98ee48d1977d907a9e5130c0ce073c54) + +commit 893ffd3767678ab881c4bc44ecfe1801cb9f9704 +Author: Jason Dillaman +Date: Mon Apr 27 03:43:10 2015 -0400 + + tests: AioCompletion incorrectly freed + + The AioCompletion should be released instead of directly + deleted. + + Signed-off-by: Jason Dillaman + (cherry picked from commit 387a09eeeaf0b66b3a2ddc36388da27d5804a4c7) + +commit 96b0db5decfad452964750cff92a63007433e519 +Author: Jason Dillaman +Date: Mon Apr 27 03:42:24 2015 -0400 + + librbd: update ref count when queueing AioCompletion + + If the client releases the AioCompletion while librbd is waiting + to acquire the exclusive lock, the memory associated with the + completion will be freed too early. + + Fixes: #11478 + Backport: hammer + Signed-off-by: Jason Dillaman + (cherry picked from commit fd7723a1e62e682ac5a5279231a9fd6f5682bf94) + +commit 094405171bdead0ce09055d4acc6445274992a01 +Author: Jason Dillaman +Date: Fri Apr 10 12:37:05 2015 -0400 + + librbd: failure to update the object map should always return success + + If an object map update fails, the object map will be flagged as + invalid. However, if a subsequent update failure occurs, the error + code will propagate back to the caller. + + Fixes: #11369 + Signed-off-by: Jason Dillaman + (cherry picked from commit 99f5a7d595c653447b351898192410c9cb773770) + +commit 7ee7dcfd609731d3c7f51b74c1d99fb3fa51c413 +Author: Jason Dillaman +Date: Fri Mar 6 15:40:48 2015 -0500 + + tests: librados_test_stub reads should deep-copy + + If a client of librados_test_stub modifies a bufferlist + retrieved via a read call, the client will actually be + changing the contents of the file. Therefore, read calls + should deep-copy the contents of the buffer::ptrs. + + Signed-off-by: Jason Dillaman + (cherry picked from commit 76fe8d73ff79da2d734f70680208a2c188b58671) + +commit 113f3b14ec8631d57bbbc2931e242ac96cf9e6fc +Author: Zhe Zhang +Date: Tue May 5 18:08:48 2015 -0500 + + mon: Total size of OSDs is a magnitude less than it is supposed to be. + + When dumping statistics of OSDs, such as when running the command "ceph osd df", + the sum of OSDs' size is 2^10 times less than their real size. + + Fixes: #11534 + Signed-off-by: Zhe Zhang + (cherry picked from commit 73d16f69d6f58fe8be262b0fb8db28c94605ea7d) + +commit 6a04b55df76faad9b1b0770fbe6038b3b8c7d645 +Author: Owen Synge +Date: Tue Mar 17 15:41:33 2015 +0100 + + Fix "disk zap" sgdisk invocation + + Fixes #11143 + + If the metadata on the disk is truly invalid, sgdisk would fail to zero + it in one go, because --mbrtogpt apparently tried to operate on the + metadata it read before executing --zap-all. + + Splitting this up into two separate invocations to first zap everything + and then clear it properly fixes this issue. + + Based on patch by Lars Marowsky-Bree in ceph-deploy.
+ Created by Vincent Untz + + Signed-off-by: Owen Synge + Signed-off-by: Thorsten Behrens + (cherry picked from commit fdd7f8d83afa25c4e09aaedd90ab93f3b64a677b) + +commit 8996907e0a777320b505e74754f48a1a82308166 +Author: Jianpeng Ma +Date: Fri Mar 6 15:54:07 2015 +0800 + + osd: refuse to write a new erasure coded object with an offset > 0 + + Even if the offset is properly aligned. + + http://tracker.ceph.com/issues/11507 Fixes: #11507 + + Signed-off-by: Jianpeng Ma + Signed-off-by: Loic Dachary + (cherry picked from commit a4f1256c214ee0e7ebb91ac4ea8655f5d9642dc8) + +commit 58b30d5426998bf7eab4a773f1a04e5bcfbf9b93 +Merge: bc51476 3a58e30 +Author: Yan, Zheng +Date: Wed May 6 16:01:57 2015 +0800 + + Merge pull request #4481 from ceph/hammer-11482 + + mds: remove caps from revoking list when caps are voluntarily released + +commit bc51476181429d7d95d2bba5f774d8b60c47fb1f +Author: Yehuda Sadeh +Date: Mon Apr 13 16:33:17 2015 -0700 + + ceph_json: add decode / encoder for multimap + + Signed-off-by: Yehuda Sadeh + (cherry picked from commit 55594623e2a478c3c023336b924bfdef0017d97f) + +commit 7c7e651a0ae8bbcebe136da74b7dbe3a3e9edcc8 +Author: Yehuda Sadeh +Date: Mon Mar 30 17:34:57 2015 -0700 + + cls_rgw: use multimap to keep pending operations in bucket index + + Fixes: #11256 + Multiple concurrent requests might be sent using the same tag, need the + entry map to be able to hold multiple entries. + + Signed-off-by: Yehuda Sadeh + (cherry picked from commit 4e6a66b55e73c01347fc3330faa5c1307d29e9d3) + +commit cb7571375377295d0aff791a03b22da6eb26109d +Author: Yehuda Sadeh +Date: Fri Mar 27 16:32:48 2015 -0700 + + rgw: generate new tag for object when setting object attrs + + Fixes: #11256 + Backport: firefly, hammer + + Beforehand we were reusing the object's tag, which is problematic as + this tag is used for bucket index updates, and we might be clobbering a + racing update (like object removal). 
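+
+    A minimal sketch of the container change in the cls_rgw fix above
+    (std::multimap instead of std::map, with toy types): two in-flight
+    operations under the same tag must both be representable.
+
+        #include <iostream>
+        #include <map>
+        #include <string>
+
+        int main() {
+          std::multimap<std::string, std::string> pending;
+          pending.emplace("tag1", "op A");
+          pending.emplace("tag1", "op B");  // not lost, unlike std::map
+          auto range = pending.equal_range("tag1");
+          for (auto it = range.first; it != range.second; ++it)
+            std::cout << it->first << " -> " << it->second << std::endl;
+        }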
+ + Signed-off-by: Yehuda Sadeh + (cherry picked from commit edc0627a1dbeb66ea2f5f177f6ceca64559ff3d8) + +commit 7387c43382e7f114c43db3cc26ca77d081749d8e +Author: Noah Watkins +Date: Fri Mar 27 19:34:12 2015 -0700 + + java: libcephfs_jni.so is in /usr/lib64 on rhel + + Signed-off-by: Noah Watkins + (cherry picked from commit aed3434dc7c5161c72c7d5655faa3bc693fc9777) + + Reviewed-by: Greg Farnum + +commit 5cc0f20ba00080881aff460fab2110cb7eaba279 +Merge: f7bcb2d 0e6a032 +Author: Loic Dachary +Date: Mon May 4 23:58:37 2015 +0200 + + Merge pull request #4498 from dachary/wip-11342-hammer + + librbd notification race condition on snap_create + + Reviewed-by: Josh Durgin + Reviewed-by: Jason Dillaman + +commit f7bcb2dc6043501d478c9d7664bf39f34d5ad6d8 +Merge: 4d95929 2864da8 +Author: Loic Dachary +Date: Mon May 4 23:58:18 2015 +0200 + + Merge pull request #4497 from dachary/wip-7385-hammer + + Objectcacher setting max object counts too low + + Reviewed-by: Josh Durgin + Reviewed-by: Jason Dillaman + +commit 4d95929c3129028de2d48efdef71fba2b949edd9 +Merge: 16c2f4c c615972 +Author: Loic Dachary +Date: Mon May 4 23:57:48 2015 +0200 + + Merge pull request #4499 from dachary/wip-11363-hammer + + ImageWatcher should cancel in-flight ops on watch error + + Reviewed-by: Josh Durgin + Reviewed-by: Jason Dillaman + +commit 16c2f4c9e0428bda5d784c6c82929f748ec6cb4a +Merge: addb0ec 0cdc93f +Author: Loic Dachary +Date: Mon May 4 23:57:17 2015 +0200 + + Merge pull request #4496 from dachary/wip-5488-hammer + + librbd: deadlock in image refresh + + Reviewed-by: Josh Durgin + Reviewed-by: Jason Dillaman + +commit addb0ec950e3a3b1150f927f028d2260c1cd0615 +Merge: c0782ed 379ef71 +Author: Loic Dachary +Date: Mon May 4 16:31:24 2015 +0200 + + Merge pull request #4505 from dachary/wip-11322-hammer + + rgw - improve performance for large object (multiple chunks) GET + + Reviewed-by: Yehuda Sadeh + +commit c0782ed9ab1e59f39d3a30496214971d4cc509d9 +Merge: e074695 2f34d2e +Author: Loic Dachary +Date: Mon May 4 16:24:53 2015 +0200 + + Merge pull request #4501 from dachary/wip-11001-hammer + + Improve rgw HEAD request by avoiding read the body of the first chunk + + Reviewed-by: Yehuda Sadeh + +commit e074695e003f891e9c52e19a1679809155364d32 +Merge: 7f9d78d c6edc16 +Author: Loic Dachary +Date: Mon May 4 16:23:28 2015 +0200 + + Merge pull request #4500 from dachary/wip-11047-hammer + + rgw : make quota/gc thread configurable for starting + + Reviewed-by: Yehuda Sadeh + +commit 7f9d78d4f12ff3feaee519dd319426650ca0ff88 +Merge: 11b1ccd 4789686 +Author: Loic Dachary +Date: Mon May 4 11:46:13 2015 +0200 + + Merge pull request #4504 from dachary/wip-10691-hammer + + ceph-dencoder links to libtcmalloc, and shouldn't + + Reviewed-by: Sage Weil + +commit 11b1ccdb6502406456905d05624f0600ef9df893 +Merge: 7c050ec 8709e34 +Author: Loic Dachary +Date: Mon May 4 11:44:11 2015 +0200 + + Merge pull request #4503 from dachary/wip-10983-hammer + + use a new disk as journal disk,ceph-disk prepare fail + + Reviewed-by: Loic Dachary + +commit 7c050ecab2ccc29b03bab2c4ad67c22e9736bb9c +Merge: c5e0b61 da7f683 +Author: Loic Dachary +Date: Mon May 4 11:43:23 2015 +0200 + + Merge pull request #4507 from dachary/wip-11432-hammer + + compilation error: No high-precision counter available (armhf, powerpc..) 
+ + Reviewed-by: Loic Dachary + +commit c5e0b615d7c98ef700ebe8048ed038f1ff036ff4 +Merge: ee61a61 856b2fa +Author: Sage Weil +Date: Thu Apr 30 17:10:29 2015 -0700 + + Merge pull request #4515 from ceph/hammer-next + + rgw: critical fixes for hammer + +commit ee61a61face479d9895a5cd08ebc8aa93c8bb6ce +Merge: abc0741 0ee022b +Author: Samuel Just +Date: Thu Apr 30 09:23:53 2015 -0700 + + Merge pull request #4462 from liewegas/wip-11211-hammer + + osd/ReplicatedPG: don't check order in finish_proxy_read + +commit 856b2fa1fc72916349e484bf3615860392b74100 +Author: Yehuda Sadeh +Date: Wed Apr 22 15:39:05 2015 -0700 + + rgw-admin: a tool to fix object locator issue + + Objects that start with underscore need to have an object locator; + this is due to an old behavior that we need to retain. Some objects + might have been created without the locator. This tool creates a new + rados object with the appropriate locator. + + Syntax: + + $ ./radosgw-admin bucket check --check-head-obj-locator \ + --bucket=<bucket> [--fix] + + Signed-off-by: Yehuda Sadeh + (cherry picked from commit be4355ad8ed622734172fdce77ca71fb2635b36c) + +commit 512ae4cb3e182ce79aca7354c66d2f2a662555da +Author: Yehuda Sadeh +Date: Tue Apr 21 17:31:41 2015 -0700 + + rgw: set a special object locator if object starts with underscore + + Fixes: #11442 + Backport: hammer + + Signed-off-by: Yehuda Sadeh + (cherry picked from commit 3d4a1d20b86a5a00556df3d6a8dba096509274b7) + +commit da4d2274b5d83a116e767f3063752624d1719c32 +Author: Yehuda Sadeh +Date: Thu Apr 23 15:33:03 2015 -0700 + + rgw: use correct oid when creating gc chains + + Fixes: #11447 + Backport: hammer + + When creating gc chain, use the appropriate oid, otherwise objects will + leak. + + Signed-off-by: Yehuda Sadeh + (cherry picked from commit cc5d34678c6d4bdcd552e6334a383c4df9797f46) + +commit 4e84f318de97e592e16493c67491ba0d7f8103a8 +Author: Orit Wasserman +Date: Thu Apr 23 17:36:47 2015 +0200 + + rgw: civetweb should use unique request id + + max_req_id was moved to RGWRados and changed to atomic64_t. + + The same request id resulted in gc giving the same idtag to all objects + resulting in a leakage of rados objects. It only kept the last deleted object in + its queue, the previous objects were never freed. + + Fixes: #10295 + Backport: Hammer, Firefly + + Signed-off-by: Orit Wasserman + (cherry picked from commit c26225980c2fd018e70033a453d635533fcdefec) + +commit abc0741d57f30a39a18106bf03576e980ad89177 +Merge: 74c2dc1 3001fad +Author: Gregory Farnum +Date: Wed Apr 29 15:02:56 2015 -0700 + + Merge pull request #4506 from dachary/wip-11381-hammer + + messenger: double clear of pipe in reaper + + Reviewed-by: Greg Farnum + +commit 478968670caecd67f7995a09b60f6208729e3de3 +Author: Boris Ranto +Date: Mon Apr 13 12:38:58 2015 +0200 + + Move ceph-dencoder build to client + + The patch simply moves the ceph-dencoder build from server part of the + Makefiles to client part of the Makefiles. + + Refs: #10691 + + Signed-off-by: Boris Ranto + (cherry picked from commit ef2164725f0b55ffa4b609d407eed5f3f3048b46) + +commit 7eabb70b906e50901551ab96453f05fe662a8876 +Author: Boris Ranto +Date: Mon Apr 13 15:07:03 2015 +0200 + + Rework mds/Makefile.am to support a dencoder client build + + The patch adds all the mds sources to DENCODER_SOURCES to allow a + dencoder client build. The patch also splits the Makefile.am file to + better accommodate the change.
+ + Refs: #10691 + + Signed-off-by: Boris Ranto + (cherry picked from commit c2b3a35fb3cbf28d46a5427f32fbaff142c85f2a) + + Conflicts: + src/mds/Makefile-server.am + src/mds/Makefile.am + because the mds/StrayManager.h file was added after hammer + +commit da7f6835b15370ce0120a64f7ac3359f3ba4729b +Author: James Page +Date: Fri Mar 13 19:46:04 2015 +0000 + + Add support for PPC architecture, provide fallback + + Add high precision cpu cycles support for powerpc and powerpc64. + + Provide a fallback for other architectures and warn during + compilation. + + Signed-off-by: James Page + (cherry picked from commit b2781fb5638afae7438b983a912ede126a8c5b85) + +commit 3001fad4b6d7e692f6070ef166ed4a3e4849760f +Author: Haomai Wang +Date: Fri Apr 17 22:07:00 2015 +0800 + + Fix clear_pipe after reaping progress + + In pipe.cc:1353 we stop this connection and let the reader and writer threads stop. If the reader and writer quit immediately and we call queue_reap to trigger the reap process before "connection_state->clear_pipe(this)" has been called in pipe.cc:1379, we may hit an assertion failure here. + + Fixes: #11381 + Signed-off-by: Haomai Wang + (cherry picked from commit 0ea0e011a6a6c6d6b40f5d97328bbad0e4568dd7) + +commit 379ef714f7149a748891dafd41db80c247d35975 +Author: Guang Yang +Date: Fri Apr 3 12:27:04 2015 +0000 + + rgw : Issue AIO for next chunk first before flushing the (cached) data. + + When handling a GET request for a large object (with multiple chunks), currently it will first flush the + cached data, and then issue the AIO request for the next chunk; this can make retrieving + from the OSD and sending to the client serialized. This patch switches the two operations. + + Fixes: #11322 + Signed-off-by: Guang Yang + (cherry picked from commit 366e8a85c0e9f00eed364eaebbfb6b672852eae9) + +commit b903ad28a68772fa0b7a88b4db2724f4d07565d5 +Author: Boris Ranto +Date: Mon Apr 13 12:33:00 2015 +0200 + + rgw/Makefile.am: Populate DENCODER_SOURCES properly + + Dencoder is built if ENABLE_CLIENT is set. However, the rgw/Makefile.am + populated DENCODER_SOURCES only if WITH_RADOSGW was set. The patch fixes + this and populates DENCODER_SOURCES if ENABLE_CLIENT is set. + + Signed-off-by: Boris Ranto + (cherry picked from commit 0b264331f57b64880ce05fe3bd752e8df226d00c) + +commit f994483c0e3a60226c8fb6983380ef8400b0160e +Author: Boris Ranto +Date: Mon Apr 13 12:32:30 2015 +0200 + + Dencoder should never be built with tcmalloc + + The patch adds disabled perfglue stubs to DENCODER sources in order to + avoid tcmalloc-enabled ceph-dencoder builds. + + Refs: #10691 + + Signed-off-by: Boris Ranto + (cherry picked from commit fb11c74d1dc3843f2f5b6dca9c76278c5ceeca1c) + +commit 8709e34f931809f7129cdac1203ec4e774e3eb4c +Author: Loic Dachary +Date: Sun Mar 8 15:15:35 2015 +0100 + + ceph-disk: more robust parted output parser + + In some cases, depending on the implementation or the operating system, + + parted --machine -- /dev/sdh print + + may contain empty lines. The current parsing code is fragile and highly + depends on output details. Replace it with code that basically does the + same sanity checks (output not empty, existence of units, existence of + the dev entry) but handles the entire output instead of checking line by + line.
+ + http://tracker.ceph.com/issues/10983 Fixes: #10983 + + Signed-off-by: Loic Dachary + (cherry picked from commit f5acf6bb6a342b05897605703d7d9cb7c09714fd) + +commit 8d9f4d4eb546e26eeb3911811bdeb166d06cb1d1 +Author: Jianpeng Ma +Date: Fri Mar 6 11:26:31 2015 +0800 + + osdc: add epoch_t last_force_resend in Op/LingerOp. + + Use this field to record pg_pool_t::last_force_op_resend to avoid + endless op resends when the osd replies with a redirect. + + Fixes: #11026 + Signed-off-by: Jianpeng Ma + Signed-off-by: Sage Weil + (cherry picked from commit def4fc4ae51174ae92ac1fb606427f4f6f00743e) + +commit 2f34d2e73bb4bded4779af15a337c75eb2d1497f +Author: Guang Yang +Date: Tue Mar 3 09:46:52 2015 +0000 + + rgw: do not pre-fetch data for HEAD requests + + Backport: hammer + + Signed-off-by: Guang Yang + (cherry picked from commit e32da3e7c880eaf7cb84d1c078447b28e1d8052b) + +commit c6edc16fee027f6cdefacba08f1edc436a7406c5 +Author: Guang Yang +Date: Wed Feb 4 03:21:52 2015 +0000 + + rgw - make starting quota/gc threads configurable + + Fixes: #11047 + Backport: hammer + + Signed-off-by: Guang Yang + (cherry picked from commit a88712aeb4e7cd4208b9a707aa3bd4d03340c3ff) + +commit c6159724f065731c41b2d29a48d0f0a3dc82340b +Author: Jason Dillaman +Date: Fri Mar 20 11:56:55 2015 -0400 + + librbd: ImageWatcher should cancel in-flight ops on watch error + + Upon a watch error notification from librados, pending requests + should be canceled and in-flight IO should be flushed prior to + unlocking the image. + + Fixes: #11363 + Signed-off-by: Jason Dillaman + (cherry picked from commit fc2e511b2aed4d40eff5101a4c9e513b34e5e58e) + +commit 0e6a032c9e6ddae20be82df7500a0758d2cd8e74 +Author: Jason Dillaman +Date: Tue Apr 7 15:39:13 2015 -0400 + + librbd: moved snap_create header update notification to initiator + + When handling a proxied snap_create operation, the client which + invoked the snap_create should send the header update notification + to avoid a possible race condition where snap_create completes but + the client doesn't see the new snapshot (since it didn't yet receive + the notification). + + Fixes: #11342 + Signed-off-by: Jason Dillaman + (cherry picked from commit 6e20ed6834912ccd979d16e3f3b340c239e05288) + +commit 2864da86ca689472341e8a80f7ad6fcc5eb8321a +Author: Jason Dillaman +Date: Wed Apr 22 11:27:35 2015 -0400 + + librbd: updated cache max objects calculation + + The previous calculation was based upon the image's object size. + Since the cache stores smaller bufferheads, the object size is not + a good indicator of cache usage and was resulting in objects being + evicted from the cache too often. Instead, base the max number of + objects on the memory load required to store the extra metadata + for the objects. + + Fixes: #7385 + Backport: firefly, hammer + Signed-off-by: Jason Dillaman + (cherry picked from commit 0b378942c4f1b79cb65967f2d3466728ca1c8d5b) + + Conflicts: + src/librbd/ImageCtx.cc + because hammer has cct->_conf->rbd_cache_size + instead of cache_size + +commit 0cdc93fbdcf68a31e6aada38b0cb9d66efdc512d +Author: Jason Dillaman +Date: Mon Mar 16 11:04:22 2015 -0400 + + librbd: acquire cache_lock before refreshing parent + + cache_lock needs to be acquired before snap_lock to avoid + the potential for deadlock.
+ + Fixes: #5488 + Signed-off-by: Jason Dillaman + (cherry picked from commit 703ba377e3de4007920f2ed7d8a0780f68676fe2) + + Conflicts: + src/librbd/internal.cc + resolved by moving int r; into the scope of the block + +commit a1b4aeb8e8c3a8d5c8284dcee8e03f501a77928c +Author: Jason Dillaman +Date: Fri Mar 13 18:08:47 2015 -0400 + + librados_test_stub: AIO operation callbacks should be via Finisher + + librados will execute all AIO callbacks via a single finisher to + prevent blocking the Objecter. Reproduce this behavior to avoid + deadlocks that only exist when using the test stub. + + Signed-off-by: Jason Dillaman + (cherry picked from commit b38c96f2c1747a6d864e7aaa2e9858139ce9d1fd) + +commit 3a58e30dc6563197f0effeabbd2fbf804403ad34 +Author: Yan, Zheng +Date: Tue Apr 28 15:45:32 2015 +0800 + + mds: remove caps from revoking list when caps are voluntarily released + + Fixes: #11482 + Signed-off-by: Yan, Zheng + (cherry picked from commit 86788c4ea4155f0388b7ebaf475a3d3c37d39331) + +commit 74c2dc1f3924fa05e2c40f4cceb2ab060493bdfb +Merge: 8a58d83 f30fa4a +Author: Sage Weil +Date: Fri Apr 24 16:13:56 2015 -0700 + + Merge pull request #4463 from ceph/wip-11453-hammer-rgw-init-as-root + + rgw: init-radosgw: run RGW as root + + Reviewed-by: Sage Weil + +commit f30fa4a364602fb9412babf7319140eca4c64995 +Author: Ken Dreyer +Date: Wed Apr 22 16:36:42 2015 -0600 + + init-radosgw: run RGW as root + + The ceph-radosgw service fails to start if the httpd package is not + installed. This is because the init.d file attempts to start the RGW + process with the "apache" UID. If a user is running civetweb, there is + no reason for the httpd or apache2 package to be present on the system. + + Switch the init scripts to use "root" as is done on Ubuntu. + + http://tracker.ceph.com/issues/11453 Refs: #11453 + + Reported-by: Vickey Singh + Signed-off-by: Ken Dreyer + (cherry picked from commit 47339c5ac352d305e68a58f3d744c3ce0fd3a2ac) + +commit 0ee022b1ae832c70a80e9d2cdf32403039f3f125 (refs/remotes/me/wip-11211-hammer) +Author: Zhiqiang Wang +Date: Tue Mar 24 16:00:16 2015 +0800 + + osd/ReplicatedPG: don't check order in finish_proxy_read + + Reads don't need to be ordered, so when a proxy read comes back from the base + tier it is not necessarily at the front of the in-progress list. + + Fixes: #11211 + + Signed-off-by: Zhiqiang Wang + (cherry picked from commit 560a5839c0d1852b5816937b845b60390777636c) + +commit 8a58d83b0d039d2c2be353fee9c57c4e6181b662 +Author: Haomai Wang +Date: Sun Mar 22 23:59:19 2015 +0800 + + Fix ceph_test_async_driver failed + + This test creates 10000 sockets, which fails because of the limited number of system fds. We actually only need to create several hundred sockets, which is enough to achieve the test's goal.
+ + Fix bug #11198 (cherry picked from commit cd11daa2d21b7b059df9877cad38432678bb6161) + +commit 85a68f9a8237f7e74f44a1d1fbbd6cb4ac50f8e8 +Author: Orit Wasserman +Date: Fri Apr 3 13:43:31 2015 +0200 + + rgw: remove meta file after deleting bucket + + The meta file is deleted only if the bucket metadata is not synced + + Signed-off-by: Orit Wasserman + + Fixes: #11149 + Backport: hammer, firefly + (cherry picked from commit dfdc7afb59cc8e32cf8bff55faa09076c853de06) + +commit b01e68fae2f8235b7a813188b664d3ec7bd1fa09 +Merge: addc7e2 bd0ec49 +Author: Loic Dachary +Date: Tue Apr 14 20:58:40 2015 +0200 + + Merge pull request #4332 from dachary/wip-11217-hammer + + tests: TestFlatIndex.cc races with TestLFNIndex.cc + + Reviewed-by: Sage Weil + +commit addc7e2c6a4973cbcb5f02635be01a446cb223f0 +Merge: e4bfad3 51f5763 +Author: Sage Weil +Date: Tue Apr 14 09:05:53 2015 -0700 + + Merge pull request #4357 from dachary/wip-release-notes-hammer + + release-notes: backport Hammer release notes + +commit 51f57632f6d463e5f702bdb4e12c1914ec76d2b5 +Author: Loic Dachary +Date: Tue Apr 14 11:30:43 2015 +0200 + + release-notes: backport Hammer release notes + + Include the Hammer release notes in Hammer. + + Cherry-picking the Hammer release notes cannot be done cleanly, so they are + copy/pasted instead. This will allow cherry-picking the release notes + for the next point releases. It should be undisturbed by the release + notes for other point releases because they modify parts of the file + that will not generate cherry-pick conflicts. + + Signed-off-by: Loic Dachary + +commit bd0ec494739308dc33e6b042ae3e1aafd53c634c +Author: Xinze Chi +Date: Tue Mar 24 18:42:04 2015 +0800 + + bug fix: test case for lfn index + + tests: TestFlatIndex.cc races with TestLFNIndex.cc + Both use the same PATH and when run in parallel they sometimes conflict.
+ + Fixes: #11217 + Signed-off-by: Xinze Chi + (cherry picked from commit 791c3879d62b848616972f9c6d921aac30ac4925) diff --git a/doc/conf.py b/doc/conf.py index 98ce7d861b3a6..f1fefe0bc7cbf 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -28,55 +28,25 @@ 'sphinx.ext.graphviz', 'sphinx.ext.todo', 'sphinx_ditaa', - 'asphyxiate', + 'breathe', ] todo_include_todos = True -def _get_manpages(): - import os - man_dir = os.path.join( - os.path.dirname(__file__), - 'man', - ) - sections = os.listdir(man_dir) - for section in sections: - section_dir = os.path.join(man_dir, section) - if not os.path.isdir(section_dir): - continue - for filename in os.listdir(section_dir): - base, ext = os.path.splitext(filename) - if ext != '.rst': - continue - if base == 'index': - continue - with file(os.path.join(section_dir, filename)) as f: - one = f.readline() - two = f.readline() - three = f.readline() - assert one == three - assert all(c=='=' for c in one.rstrip('\n')) - two = two.strip() - name, rest = two.split('--', 1) - assert name.strip() == base - description = rest.strip() - yield ( - os.path.join('man', section, base), - base, - description, - '', - section, - ) - -man_pages = list(_get_manpages()) - -asphyxiate_doxygen_xml = 'doxygen' - top_level = os.path.dirname( os.path.dirname( os.path.abspath(__file__) ) ) +breathe_default_project = "Ceph" +# see $(top_srcdir)/Doxyfile + +breathe_build_directory = os.path.join(top_level, "build-doc") +breathe_projects = {"Ceph": os.path.join(top_level, breathe_build_directory)} +breathe_projects_source = { + "Ceph": (os.path.join(top_level, "src/include/rados"), + ["rados_types.h", "librados.h"]) +} pybind = os.path.join(top_level, 'src/pybind') if pybind not in sys.path: sys.path.insert(0, pybind) diff --git a/doc/dev/cache-pool.rst b/doc/dev/cache-pool.rst index f44cbd99ac4a8..5fa910f98355f 100644 --- a/doc/dev/cache-pool.rst +++ b/doc/dev/cache-pool.rst @@ -5,7 +5,7 @@ Purpose ------- Use a pool of fast storage devices (probably SSDs) and use it as a -cache for an existing larger pool. +cache for an existing slower and larger pool. Use a replicated pool as a front-end to service most I/O, and destage cold data to a separate erasure coded pool that does not currently (and @@ -153,6 +153,7 @@ The thresholds at which Ceph will flush or evict objects is specified relative to a 'target size' of the pool. For example:: ceph osd pool set foo-hot cache_target_dirty_ratio .4 + ceph osd pool set foo-hot cache_target_dirty_high_ratio .6 ceph osd pool set foo-hot cache_target_full_ratio .8 will begin flushing dirty objects when 40% of the pool is dirty and begin diff --git a/doc/dev/cephx_protocol.rst b/doc/dev/cephx_protocol.rst index 8d229841db375..c514b38eb00df 100644 --- a/doc/dev/cephx_protocol.rst +++ b/doc/dev/cephx_protocol.rst @@ -18,18 +18,18 @@ The basic idea of the protocol is based on Kerberos. A client wishes to obtain a server. The server will only offer the requested service to authorized clients. Rather than requiring each server to deal with authentication and authorization issues, the system uses an authorization server. Thus, the client must first communicate with the authorization -server to authenticate himself and to obtain credentials that will grant him access to the -service he wants. +server to authenticate itself and to obtain credentials that will grant it access to the +service it wants. Authorization is not the same as authentication. Authentication provides evidence that some -party is who he claims to be. 
Authorization provides evidence that a particular party is +party is who it claims to be. Authorization provides evidence that a particular party is allowed to do something. Generally, secure authorization implies secure authentication (since without authentication, you may authorize something for an imposter), but the reverse is not necessarily true. One can authenticate without authorizing. The purpose of this protocol is to authorize. -The basic approach is to use symmetric cryptography throughout. Each client C has his own -secret key, known only to himself and the authorization server A. Each server S has its own +The basic approach is to use symmetric cryptography throughout. Each client C has its own +secret key, known only to itself and the authorization server A. Each server S has its own secret key, known only to itself and the authorization server A. Authorization information will be passed in tickets, encrypted with the secret key of the entity that offers the service. There will be a ticket that A gives to C, which permits C to ask A for other tickets. This @@ -45,11 +45,11 @@ the system. Several parties need to prove something to each other if this protocol is to achieve its desired security effects. -1. The client C must prove to the authenticator A that he really is C. Since everything +1. The client C must prove to the authenticator A that it really is C. Since everything is being done via messages, the client must also prove that the message proving authenticity is fresh, and is not being replayed by an attacker. -2. The authenticator A must prove to client C that he really is the authenticator. Again, +2. The authenticator A must prove to client C that it really is the authenticator. Again, proof that replay is not occurring is also required. 3. A and C must securely share a session key to be used for distribution of later @@ -59,7 +59,7 @@ known only to A and C. 4. A must receive evidence from C that allows A to look up C's authorized operations with server S. -5. C must receive a ticket from A that will prove to S that C can perform his authorized +5. C must receive a ticket from A that will prove to S that C can perform its authorized operations. This ticket must be usable only by C. 6. C must receive from A a session key to protect the communications between C and S. The @@ -70,7 +70,7 @@ Getting Started With Authorization When the client first needs to get service, it contacts the monitor. At the moment, it has no tickets. Therefore, it uses the "unknown" protocol to talk to the monitor. This protocol -is specified as ``CEPH\_AUTH\_UNKNOWN``. The monitor also takes on the authentication server +is specified as ``CEPH_AUTH_UNKNOWN``. The monitor also takes on the authentication server role, A. The remainder of the communications will use the cephx protocol (most of whose code will be found in files in ``auth/cephx``). This protocol is responsible for creating and communicating the tickets spoken of above. @@ -90,7 +90,7 @@ Phase I: The client is set up to know that it needs certain things, using a variable called ``need``, which is part of the ``AuthClientHandler`` class, which the ``CephxClientHandler`` inherits from. At this point, one thing that's encoded in the ``need`` variable is -``CEPH\_ENTITY\_TYPE\_AUTH``, indicating that we need to start the authentication protocol +``CEPH_ENTITY_TYPE_AUTH``, indicating that we need to start the authentication protocol from scratch. 
Since we're always talking to the same authorization server, if we've gone through this step of the protocol before (and the resulting ticket/session hasn't timed out), we can skip this step and just ask for client tickets. But it must be done initially, and @@ -106,9 +106,9 @@ in the ``need`` flag as necessary. Then we call ``ticket.get_handler()``. This authorization) in the ticket map, creates a ticket handler object for it, and puts the handler into the right place in the map. Then we hit specialized code to deal with individual cases. The case here is when we still need to authenticate to A (the -``if (need & CEPH\_ENTITY\_TYPE\_AUTH)`` branch). +``if (need & CEPH_ENTITY_TYPE_AUTH)`` branch). -We now create a message of type ``CEPHX\_GET\_AUTH\_SESSION\_KEY``. We need to authenticate +We now create a message of type ``CEPH_AUTH_UNKNOWN``. We need to authenticate this message with C's secret key, so we fetch that from the local key repository. (It's called a key server in the code, but it's not really a separate machine or processing entity. It's more like the place where locally used keys are kept.) We create a @@ -124,12 +124,12 @@ challenges, gets put into the message. Then we return from this function, and t message is sent. We now switch over to the authenticator side, A. The server receives the message that was -sent, of type ``CEPHX\_GET\_AUTH\_SESSION\_KEY``. The message gets handled in ``prep_auth()``, +sent, of type ``CEPH_AUTH_UNKNOWN``. The message gets handled in ``prep_auth()``, in ``mon/AuthMonitor.cc``, which calls ``handle_request()`` is ``CephxServiceHandler.cc`` to do most of the work. This routine, also, handles multiple cases. The control flow is determined by the ``request_type`` in the ``cephx_header`` associated -with the message. Our case here is ``CEPHX\_GET\_AUTH\_SESSION\_KEY``. We need the +with the message. Our case here is ``CEPH_AUTH_UNKNOWN``. We need the secret key A shares with C, so we call ``get_secret()`` from out local key repository to get it. We should have set up a server challenge already with this client, so we make sure we really do have one. (This variable is specific to a ``CephxServiceHandler``, so there @@ -141,65 +141,65 @@ We now call the same routine the client used to calculate the hash, based on the the client challenge (which is in the incoming message), the server challenge (which we saved), and the client's key (which we just obtained). We check to see if the client sent the same thing we expected. If so, we know we're talking to the right client. We know the session is -fresh, because he used the challenge we sent him to calculate his crypto hash. So we can -give him an authentication ticket. +fresh, because it used the challenge we sent it to calculate its crypto hash. So we can +give it an authentication ticket. We fetch C's ``eauth`` structure. This contains an ID, a key, and a set of caps (capabilities). -The client sent us his old ticket in the message, if he had one. If so, we set a flag, +The client sent us its old ticket in the message, if it had one. If so, we set a flag, ``should_enc_ticket``, to true and set the global ID to the global ID in that old ticket. -If the attempt to decode his old ticket fails (most probably because he didn't have one), +If the attempt to decode its old ticket fails (most probably because it didn't have one), ``should_enc_ticket`` remains false. 
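In miniature, the check A just performed looks like the following sketch. It is illustrative only: cephx derives the proof by encrypting the two challenges with the shared secret rather than by computing an HMAC over them, but the shape of the exchange (both sides combine the server challenge, the client challenge, and C's permanent key, then compare) is the same::

    import hashlib
    import hmac
    import os

    def challenge_proof(secret: bytes, server_chal: bytes, client_chal: bytes) -> bytes:
        # Illustrative stand-in: the real code encrypts the two challenges
        # with the shared key instead of HMAC-ing them.
        return hmac.new(secret, server_chal + client_chal, hashlib.sha256).digest()

    secret = b"permanent key shared by C and A"
    server_chal = os.urandom(8)   # the challenge A sent earlier
    client_chal = os.urandom(8)   # the challenge C chose

    # C sends client_chal plus the proof; A recomputes it with the saved
    # server challenge and C's key, then compares.
    proof = challenge_proof(secret, server_chal, client_chal)
    assert hmac.compare_digest(proof, challenge_proof(secret, server_chal, client_chal))

A matching proof shows the sender knows C's key and used A's fresh challenge, which is why A can safely hand out the authentication ticket.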
Now we set up the new ticket, filling in timestamps, the name of C, the global ID provided in the method call (unless there was an old ticket), and his ``auid``, obtained from the ``eauth`` structure obtained above. We need a new session key -to help the client communicate securely with us, not using his permanent key. We set the -service ID to ``CEPH\_ENTITY\_TYPE\_AUTH``, which will tell the client C what to do with the +to help the client communicate securely with us, not using its permanent key. We set the +service ID to ``CEPH_ENTITY_TYPE_AUTH``, which will tell the client C what to do with the message we send it. We build a cephx response header and call -``cephx\_build\_service\_ticket\_reply()``. +``cephx_build_service_ticket_reply()``. -``cephx\_build\_service\_ticket\_reply()`` is in ``auth/cephx/CephxProtocol.cc``. This +``cephx_build_service_ticket_reply()`` is in ``auth/cephx/CephxProtocol.cc``. This routine will build up the response message. Much of it copies data from its parameters to a message structure. Part of that information (the session key and the validity period) -gets encrypted with C's permanent key. If the ``should\_encrypt\_ticket`` flag is set, +gets encrypted with C's permanent key. If the ``should_encrypt_ticket`` flag is set, encrypt it using the old ticket's key. Otherwise, there was no old ticket key, so the new ticket is not encrypted. (It is, of course, already encrypted with A's permanent key.) Presumably the point of this second encryption is to expose less material encrypted with permanent keys. -Then we call the key server's ``get\_service\_caps()`` routine on the entity name, with a -flag ``CEPH\_ENTITY\_TYPE\_MON``, and capabilities, which will be filled in by this routine. +Then we call the key server's ``get_service_caps()`` routine on the entity name, with a +flag ``CEPH_ENTITY_TYPE_MON``, and capabilities, which will be filled in by this routine. The use of that constant flag means we're going to get the client's caps for A, not for some other data server. The ticket here is to access the authorizer A, not the service S. The result of this call is that the caps variable (a parameter to the routine we're in) is filled in with the monitor capabilities that will allow C to access A's authorization services. -``handle\_request()`` itself does not send the response message. It builds up the -``result\_bl``, which basically holds that message's contents, and the capabilities structure, -but it doesn't send the message. We go back to ``prep\_auth()``, in ``mon/AuthMonitor.cc``, +``handle_request()`` itself does not send the response message. It builds up the +``result_bl``, which basically holds that message's contents, and the capabilities structure, +but it doesn't send the message. We go back to ``prep_auth()``, in ``mon/AuthMonitor.cc``, for that. This routine does some fiddling around with the caps structure that just got filled in. There's a global ID that comes up as a result of this fiddling that is put into -the reply message. The reply message is built here (mostly from the ``response\_bl`` buffer) +the reply message. The reply message is built here (mostly from the ``response_bl`` buffer) and sent off. -This completes Phase I of the protocol. At this point, C has authenticated himself to A, and A has generated a new session key and ticket allowing C to obtain server tickets from A. +This completes Phase I of the protocol. 
At this point, C has authenticated itself to A, and A has generated a new session key and ticket allowing C to obtain server tickets from A. Phase II -------- This phase starts when C receives the message from A containing a new ticket and session key. -The goal of this phase is to provide A with a session key and ticket allowing him to +The goal of this phase is to provide A with a session key and ticket allowing it to communicate with S. -The message A sent to C is dispatched to ``build\_request()`` in ``CephxClientHandler.cc``, +The message A sent to C is dispatched to ``build_request()`` in ``CephxClientHandler.cc``, the same routine that was used early in Phase I to build the first message in the protocol. -This time, when ``validate\_tickets()`` is called, the ``need`` variable will not contain -``CEPH\_ENTITY\_TYPE\_AUTH``, so a different branch through the bulk of the routine will be +This time, when ``validate_tickets()`` is called, the ``need`` variable will not contain +``CEPH_ENTITY_TYPE_AUTH``, so a different branch through the bulk of the routine will be used. This is the branch indicated by ``if (need)``. We have a ticket for the authorizer, but we still need service tickets. We must send another message to A to obtain the tickets (and session key) for the server -S. We set the ``request\_type`` of the message to ``CEPHX\_GET\_PRINCIPAL\_SESSION\_KEY`` and -call ``ticket\_handler.build\_authorizer()`` to obtain an authorizer. This routine is in +S. We set the ``request_type`` of the message to ``CEPHX_GET_PRINCIPAL_SESSION_KEY`` and +call ``ticket_handler.build_authorizer()`` to obtain an authorizer. This routine is in ``CephxProtocol.cc``. We set the key for this authorizer to be the session key we just got from A,and create a new nonce. We put the global ID, the service ID, and the ticket into a message buffer that is part of the authorizer. Then we create a new ``CephXAuthorize`` @@ -207,20 +207,20 @@ structure. The nonce we just created goes there. We encrypt this ``CephXAuthor structure with the current session key and stuff it into the authorizer's buffer. We return the authorizer. -Back in ``build\_request()``, we take the part of the authorizer that was just built (its +Back in ``build_request()``, we take the part of the authorizer that was just built (its buffer, not the session key or anything else) and shove it into the buffer we're creating for the message that will go to A. Then we delete the authorizer. We put the requirements for what we want in ``req.keys``, and we put ``req`` into the buffer. Then we return, and the message gets sent. -The authorizer A receives this message which is of type ``CEPHX\_GET\_PRINCIPAL\_SESSION\_KEY``. +The authorizer A receives this message which is of type ``CEPHX_GET_PRINCIPAL_SESSION_KEY``. The message gets handled in ``prep_auth()``, in ``mon/AuthMonitor.cc``, which again calls -``handle\_request()`` in ``CephxServiceHandler.cc`` to do most of the work. +``handle_request()`` in ``CephxServiceHandler.cc`` to do most of the work. -In this case, ``handle\_request()`` will take the ``CEPHX\_GET\_PRINCIPAL\_SESSION\_KEY`` case. -It will call ``cephx\_verify\_authorizer()`` in ``CephxProtocol.cc``. Here, we will grab +In this case, ``handle_request()`` will take the ``CEPHX_GET_PRINCIPAL_SESSION_KEY`` case. +It will call ``cephx_verify_authorizer()`` in ``CephxProtocol.cc``. Here, we will grab a bunch of data out of the input buffer, including the global and service IDs and the ticket -for A. 
The ticket contains a ``secret\_id``, indicating which key is being used for it. +for A. The ticket contains a ``secret_id``, indicating which key is being used for it. If the secret ID pulled out of the ticket was -1, the ticket does not specify which secret key A should use. In this case, A should use the key for the specific entity that C wants to contact, rather than a rotating key shared by all server entities of the same type. @@ -236,52 +236,52 @@ this message. Use that session key to decrypt the rest of the message. Create a ``CephXAuthorizeReply`` to hold our reply. Extract the nonce (which was in the stuff we just decrypted), add 1 to it, and put the result in the reply. Encrypt the reply and -put it in the buffer provided in the call to ``cephx\_verify\_authorizer()`` and return -to ``handle\`_request()``. This will be used to prove to C that A (rather than an attacker) +put it in the buffer provided in the call to ``cephx_verify_authorizer()`` and return +to ``handle_request()``. This will be used to prove to C that A (rather than an attacker) created this response. -Having verified that the message is valid and from C, now we need to build him a ticket for S. -We need to know what S he wants to communicate with and what services he wants. Pull the -ticket request that describes those things out of his message. Now run through the ticket -request to see what he wanted. (He could potentially be asking for multiple different +Having verified that the message is valid and from C, now we need to build it a ticket for S. +We need to know what S it wants to communicate with and what services it wants. Pull the +ticket request that describes those things out of its message. Now run through the ticket +request to see what it wanted. (It could potentially be asking for multiple different services in the same request, but we will assume it's just one, for this discussion.) Once we -know which service ID he's after, call ``build\_session\_auth\_info()``. +know which service ID it's after, call ``build_session_auth_info()``. -``build\_session\_auth\_info()`` is in ``CephxKeyServer.cc``. It checks to see if the -secret for the ``service\_ID`` of S is available and puts it into the subfield of one of -the parameters, and calls the similarly named ``\_build\_session\_auth\_info()``, located in -the same file. This routine loads up the new ``auth\_info`` structure with the +``build_session_auth_info()`` is in ``CephxKeyServer.cc``. It checks to see if the +secret for the ``service_ID`` of S is available and puts it into the subfield of one of +the parameters, and calls the similarly named ``_build_session_auth_info()``, located in +the same file. This routine loads up the new ``auth_info`` structure with the ID of S, a ticket, and some timestamps for that ticket. It generates a new session key -and puts it in the structure. It then calls ``get\_caps()`` to fill in the -``info.ticket`` caps field. ``get\_caps()`` is also in ``CephxKeyServer.cc``. It fills the -``caps\_info`` structure it is provided with caps for S allowed to C. +and puts it in the structure. It then calls ``get_caps()`` to fill in the +``info.ticket`` caps field. ``get_caps()`` is also in ``CephxKeyServer.cc``. It fills the +``caps_info`` structure it is provided with caps for S allowed to C. -Once ``build\_session\_auth\_info()`` returns, A has a list of the capabilities allowed to +Once ``build_session_auth_info()`` returns, A has a list of the capabilities allowed to C for S.
We put a validity period based on the current TTL for this context into the info -structure, and put it into the ``info\_vec`` structure we are preparing in response to the +structure, and put it into the ``info_vec`` structure we are preparing in response to the message. -Now call ``build\_cephx\_response\_header()``, also in ``CephxServiceHandler.cc``. Fill in -the ``request\_type``, which is ``CEPHX\_GET\_PRINCIPAL\_SESSION\_KEY``, a status of 0, +Now call ``build_cephx_response_header()``, also in ``CephxServiceHandler.cc``. Fill in +the ``request_type``, which is ``CEPHX_GET_PRINCIPAL_SESSION_KEY``, a status of 0, and the result buffer. -Now call ``cephx\_build\_service\_ticket\_reply()``, which is in ``CephxProtocol.cc``. The +Now call ``cephx_build_service_ticket_reply()``, which is in ``CephxProtocol.cc``. The same routine was used towards the end of A's handling of its response in phase I. Here, the session key (now a session key to talk to S, not A) and the validity period for that key will be encrypted with the existing session key shared between C and A. -The ``should\_encrypt\_ticket`` parameter is false here, and no key is provided for that +The ``should_encrypt_ticket`` parameter is false here, and no key is provided for that encryption. The ticket in question, destined for S once C sends it there, is already encrypted with S's secret. So, essentially, this routine will put ID information, the encrypted session key, and the ticket allowing C to talk to S into the buffer to be sent to C. -After this routine returns, we exit from ``handle\_request()``, going back to ``prep\_auth()`` +After this routine returns, we exit from ``handle_request()``, going back to ``prep_auth()`` and ultimately to the underlying message send code. The client receives this message. The nonce is checked as the message passes through ``Pipe::connect()``, which is in ``msg/SimpleMessager.cc``. In a lengthy ``while(1)`` loop in the middle of this routine, it gets an authorizer. If the get was successful, eventually -it will call ``verify\_reply()``, which checks the nonce. ``connect()`` never explicitly +it will call ``verify_reply()``, which checks the nonce. ``connect()`` never explicitly checks to see if it got an authorizer, which would suggest that failure to provide an authorizer would allow an attacker to skip checking of the nonce. However, in many places, if there is no authorizer, important connection fields will get set to zero, which will @@ -289,16 +289,16 @@ ultimately cause the connection to fail to provide data. It would be worth test it looks like failure to provide an authorizer, which contains the nonce, would not be helpful to an attacker. -The message eventually makes its way through to ``handle\_response()``, in -``CephxClientHandler.cc``. In this routine, we call ``get\_handler()`` to get a ticket +The message eventually makes its way through to ``handle_response()``, in +``CephxClientHandler.cc``. In this routine, we call ``get_handler()`` to get a ticket handler to hold the ticket we have just received. This routine is embedded in the definition -for a ``CephXTicketManager`` structure. It takes a type (``CEPH\_ENTITY\_TYPE\_AUTH``, in -this case) and looks through the ``tickets\_map`` to find that type. There should be one, and +for a ``CephXTicketManager`` structure. It takes a type (``CEPH_ENTITY_TYPE_AUTH``, in +this case) and looks through the ``tickets_map`` to find that type. 
There should be one, and it should have the session key of the session between C and A in its entry. This key will be used to decrypt the information provided by A, particularly the new session key allowing C to talk to S. -We then call ``verify\_service\_ticket\_reply()``, in ``CephxProtocol.cc``. This routine +We then call ``verify_service_ticket_reply()``, in ``CephxProtocol.cc``. This routine needs to determine if the ticket is OK and also obtain the session key associated with this ticket. It decrypts the encrypted portion of the message buffer, using the session key shared with A. This ticket was not encrypted (well, not twice - tickets are always encrypted, @@ -309,27 +309,27 @@ The stuff we decrypted with the session key shared between C and A included the key. That's our current session key for this ticket, so set it. Check validity and set the expiration times. Now return true, if we got this far. -Back in ``handle\_response()``, we now call ``validate\_tickets()`` to adjust what we think +Back in ``handle_response()``, we now call ``validate_tickets()`` to adjust what we think we need, since we now have a ticket we didn't have before. If we've taken care of everything we need, we'll return 0. This ends phase II of the protocol. We have now successfully set up a ticket and session key -for client C to talk to server S. S will know that C is who he claims to be, since A will -verify it. C will know it is S he's talking to, again because A verified it. The only +for client C to talk to server S. S will know that C is who it claims to be, since A will +verify it. C will know it is S it's talking to, again because A verified it. The only copies of the session key for C and S to communicate were sent encrypted under the permanent keys of C and S, respectively, so no other party (excepting A, who is trusted by all) knows that session key. The ticket will securely indicate to S what C is allowed to do, attested to by A. The nonces passed back and forth between A and C ensure that they have not been -subject to a replay attack. C has not yet actually talked to S, but he is ready to. +subject to a replay attack. C has not yet actually talked to S, but it is ready to. Much of the security here falls apart if one of the permanent keys is compromised. Compromise of C's key means that the attacker can pose as C and obtain all of C's privileges, and can eavesdrop on C's legitimate conversations. He can also pretend to be A, but only in -conversations with C. Since he does not (by hypothesis) have keys for any services, he -cannot generate any new tickets for services, though he can replay old tickets and session +conversations with C. Since it does not (by hypothesis) have keys for any services, it +cannot generate any new tickets for services, though it can replay old tickets and session keys until S's permanent key is changed or the old tickets time out. Compromise of S's key means that the attacker can pose as S to anyone, and can eavesdrop on any user's conversation with S. Unless some client's key is also compromised, the attacker -cannot generate new fake client tickets for S, since doing so requires him to authenticate -himself as A, using the client key he doesn't know. +cannot generate new fake client tickets for S, since doing so requires it to authenticate +itself as A, using the client key it doesn't know.
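The replay protection woven through both phases reduces to the nonce round trip described above. Here is a minimal sketch, assuming Fernet from the third-party ``cryptography`` package as a stand-in for the symmetric cipher cephx applies with the session key; it is not the cephx wire format::

    import os
    from cryptography.fernet import Fernet  # stand-in for cephx's symmetric cipher

    session = Fernet(Fernet.generate_key())  # session key shared by C and A

    # C -> A: a fresh nonce, encrypted under the session key (build_authorizer()).
    nonce = int.from_bytes(os.urandom(8), "little")
    request = session.encrypt(nonce.to_bytes(8, "little"))

    # A -> C: decrypt, add one, re-encrypt (the reply built in cephx_verify_authorizer()).
    seen = int.from_bytes(session.decrypt(request), "little")
    reply = session.encrypt(((seen + 1) % 2**64).to_bytes(8, "little"))

    # C: only a holder of the session key could have produced nonce + 1;
    # this is the check verify_reply() performs.
    assert int.from_bytes(session.decrypt(reply), "little") == (nonce + 1) % 2**64

Because the nonce is fresh for every exchange, replaying an old reply fails the check, and producing a new valid reply requires the session key.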
diff --git a/doc/dev/config.rst b/doc/dev/config.rst index b9b0faaf14d1d..298dcaafa2600 100644 --- a/doc/dev/config.rst +++ b/doc/dev/config.rst @@ -56,7 +56,7 @@ Reading configuration values There are two ways for Ceph code to get configuration values. One way is to read it directly from a variable named "g_conf," or equivalently, -"g_ceph_ctx->_conf." The other is to register an observer that will called +"g_ceph_ctx->_conf." The other is to register an observer that will be called every time the relevant configuration values changes. This observer will be called soon after the initial configuration is read, and every time after that when one of the relevant values changes. Each observer tracks a set of keys diff --git a/doc/dev/corpus.rst b/doc/dev/corpus.rst index 86c4820cf7d62..76fa43db0a771 100644 --- a/doc/dev/corpus.rst +++ b/doc/dev/corpus.rst @@ -7,7 +7,7 @@ ceph.git/ceph-object-corpus is a submodule.:: bin/ # misc scripts archive/$version/objects/$type/$hash # a sample of encoded objects from a specific version -You can also mark known or deliber incompatibilities between versions with:: +You can also mark known or deliberate incompatibilities between versions with:: archive/$version/forward_incompat/$type @@ -16,7 +16,7 @@ decode old objects across that $version (this is normally the case). How to generate an object corpus -================================ +-------------------------------- We can generate an object corpus for a particular version of ceph like so. git clone ceph.git cd ceph - git submodule update --init + git submodule update --init --recursive #. Build with flag to dump objects to /tmp/foo:: @@ -37,7 +37,7 @@ We can generate an object corpus for a particular version of ceph like so. cd src MON=3 OSD=3 MDS=3 RGW=1 ./vstart.sh -n -x -#. Use a much functionality of the cluster as you can, to exercise as many object encoder methods as possible:: +#. Use as much functionality of the cluster as you can, to exercise as many object encoder methods as possible:: ./rados -p rbd bench 10 write -b 123 ./ceph osd out 0 diff --git a/doc/dev/dev_cluster_deployement.rst b/doc/dev/dev_cluster_deployement.rst index 589641f315bae..d8c0d3c16d606 100644 --- a/doc/dev/dev_cluster_deployement.rst +++ b/doc/dev/dev_cluster_deployement.rst @@ -8,7 +8,7 @@ In order to develop on ceph, a Ceph utility, Usage ===== -It allows to deploy a fake local cluster on your machine for development purpose. It starts mon, osd and/or mds, or all of them if not specified. +It allows you to deploy a fake local cluster on your machine for development purposes. It starts rgw, mon, osd and/or mds, or all of them if not specified. To start your development cluster, type the following:: diff --git a/doc/dev/development-workflow.rst b/doc/dev/development-workflow.rst new file mode 100644 index 0000000000000..8da44323d7d6d --- /dev/null +++ b/doc/dev/development-workflow.rst @@ -0,0 +1,255 @@ +===================== +Development workflows +===================== + +This page explains the workflows a developer is expected to follow to +implement the goals that are part of the Ceph release cycle. It does not +go into technical details and is designed to provide a high level view +instead. Each chapter is about a given goal such as ``Merging bug +fixes or features`` or ``Publishing point releases and backporting``. + +A key aspect of all workflows is that none of them blocks another.
For +instance, a bug fix can be backported and merged to a stable branch +while the next point release is being published. For that specific +example to work, a branch should be created to avoid any +interference. In practice it is not necessary for Ceph because: + +* there are few people involved +* the frequency of backports is not too high +* the reviewers, who know a release is being published, are unlikely + to merge anything that may cause issues + +This ad-hoc approach implies the workflows are changed on a regular +basis to adapt. For instance, ``quality engineers`` were not involved +in the workflow to publish ``dumpling`` point releases. The number of +commits being backported to ``firefly`` made it impractical for developers +tasked to write code or fix bugs to also run and verify the full suite +of integration tests. Inserting ``quality engineers`` makes it +possible for someone to participate in the workflow by analyzing test +results. + +The workflows are not enforced when they impose an overhead that does +not make sense. For instance, if the release notes for a point release +were not written prior to checking all integration tests, they can be +committed to the stable branch and the result sent for publication +without going through another run of integration tests. + +Release Cycle +============= + +:: + + Ceph hammer infernalis + Developer CDS CDS + Summit | | + | | + development | | + release | v0.88 v0.89 v0.90 ... | v9.0.0 + --v--^----^--v---^------^--v- ---v----^----^--- 2015 + | | | | + stable giant | | hammer + release v0.87 | | v0.94 + | | + point firefly dumpling + release v0.80.8 v0.67.12 + + +Four times a year, the development roadmap is discussed online during +the `Ceph Developer Summit `_. A +new stable release (argonaut, cuttlefish, dumpling, emperor, firefly, +giant, hammer, infernalis ...) is published at the same frequency. +Every other release (dumpling, firefly, hammer, ...) is a `Long Term Stable (LTS) <../../releases>`_. +See `Understanding the release cycle +<../../releases#understanding-the-release-cycle>`_ for more +information. + +Merging bug fixes or features +============================= + +The development branch is ``master`` and the workflow followed by all +developers can be summarized as follows: + +* The developer prepares a series of commits +* The developer submits the series of commits via a pull request +* A reviewer is assigned the pull request +* When the pull request looks good to the reviewer, it is merged into + an integration branch by the tester +* After a successful run of integration tests, the pull request is + merged by the tester + +The ``developer`` is the author of a series of commits. The +``reviewer`` is responsible for providing feedback to the developer on +a regular basis and the developer is invited to ping the reviewer if +nothing has happened after a week. After the ``reviewer`` is satisfied +with the pull request, (s)he passes it to the ``tester``. The +``tester`` is responsible for running teuthology integration tests on +the pull request. If nothing happens within a month, the ``reviewer`` is +invited to ping the ``tester``.
+ +Resolving bug reports and implementing features +=============================================== + +All bug reports and feature requests are in the `issue tracker +`_ and the workflow can be summarized as +follows: + +* The reporter creates the issue with priority ``Normal`` +* A developer may pick the issue right away +* During a bi-weekly bug scrub, the team goes over all new issues and + assigns them a priority +* The bugs with higher priority are worked on first + +Each ``team`` is responsible for a project: + +* rgw lead is Yehuda Sadeh +* CephFS lead is Gregory Farnum +* rados lead is Samuel Just +* rbd lead is Josh Durgin + +The ``developer`` assigned to an issue is responsible for it. The +status of an open issue can be: + +* ``New``: it is unclear if the issue needs work. +* ``Verified``: the bug can be reproduced or showed up multiple times +* ``In Progress``: the developer is working on it this week +* ``Pending Backport``: the fix needs to be backported to the stable + releases listed in the backport field + +For each ``Pending Backport`` issue, there exists at least one issue +in the ``Backport`` tracker to record the work done to cherry-pick the +necessary commits from the master branch to the target stable branch. +See `the backporter manual +`_ for more +information. + +Running and interpreting teuthology integration tests +===================================================== + +The :doc:`/dev/sepia` runs `teuthology +`_ integration tests `on a regular basis `_ and the +results are posted on `pulpito `_ and the +`ceph-qa mailing list `_. + +* The job failures are `analyzed by quality engineers and developers + `_ +* If the cause is environmental (e.g. network connectivity), an issue + is created in the `sepia lab project + `_ +* If the bug is known, a pulpito URL to the failed job is added to the issue +* If the bug is new, an issue is created + +The ``quality engineer`` is either a developer or a member of the QE +team. There is at least one integration test suite per project: + +* `rgw `_ suite +* `CephFS `_ suite +* `rados `_ suite +* `rbd `_ suite + +and many others such as + +* `upgrade `_ suites +* `power-cycle `_ suite +* ... + +Preparing a new release +======================= + +A release is prepared in a dedicated branch, different from the +``master`` branch. + +* For a stable release it is the branch matching the release code + name (dumpling, firefly, etc.) +* For a development release it is the ``next`` branch + +The workflow expected of all developers to stabilize the release +candidate is the same as the normal development workflow with the +following differences: + +* The pull requests must target the stable branch or next instead of + master +* The reviewer rejects pull requests that are not bug fixes +* The ``Backport`` issues matching a teuthology test failure and set + with priority ``Urgent`` must be fixed before the release + +Cutting a new stable release +============================ + +A new stable release can be cut when: + +* all ``Backport`` issues with priority ``Urgent`` are fixed +* integration and upgrade tests run successfully + +Publishing a new stable release implies a risk of regression or +discovering new bugs during the upgrade, no matter how carefully it is +tested. The decision to cut a release must take this into account: it +may not be wise to publish a stable release that only fixes a few +minor bugs.
For instance, if only one commit has been backported to a +stable release that is not an LTS, it is better to wait until there are +more. + +When a stable release is to be retired, it may be safer to +recommend an upgrade to the next LTS release instead of +proposing a new point release to fix a problem. For instance, the +``dumpling`` v0.67.11 release has bugs related to backfilling which have +been fixed in ``firefly`` v0.80.x. A backport fixing these backfilling +bugs has been tested in the draft point release ``dumpling`` v0.67.12 but +they are large enough to introduce a risk of regression. As ``dumpling`` +is to be retired, users suffering from this bug can +upgrade to ``firefly`` to fix it. Unless users come forward and ask +for ``dumpling`` v0.67.12, this draft release may never be published. + +* The ``Ceph lead`` decides a new stable release must be published +* The ``release master`` gets approval from all leads +* The ``release master`` writes and commits the release notes +* The ``release master`` informs the ``quality engineer`` that the + branch is ready for testing +* The ``quality engineer`` runs additional integration tests +* If the ``quality engineer`` discovers new bugs that require an + ``Urgent Backport``, the release goes back to being prepared; it + was not ready after all +* The ``quality engineer`` informs the ``publisher`` that the branch + is ready for release +* The ``publisher`` `creates the packages and sets the release tag + <../release-process>`_ + +The person responsible for each role is: + +* Sage Weil is the ``Ceph lead`` +* Sage Weil is the ``release master`` for major stable releases + (``firefly`` 0.80, ``hammer`` 0.94 etc.) +* Loic Dachary is the ``release master`` for stable point releases + (``firefly`` 0.80.10, ``hammer`` 0.94.1 etc.) +* Yuri Weinstein is the ``quality engineer`` +* Alfredo Deza is the ``publisher`` + +Cutting a new development release +================================= + +The publication workflow of a development release is the same as +preparing a new release and cutting it, with the following +differences: + +* The ``next`` branch is reset to the tip of ``master`` after + publication +* The ``quality engineer`` is not required to run additional tests; + the ``release master`` directly informs the ``publisher`` that the + release is ready to be published. + +Publishing point releases and backporting +========================================= + +The publication workflow of the point releases is the same as +preparing a new release and cutting it, with the following +differences: + +* The ``backport`` field of each issue contains the code name of the + stable release +* There is exactly one issue in the ``Backport`` tracker for each + stable release to which the issue is backported +* All commits are cherry-picked with ``git cherry-pick -x`` to + reference the original commit + +See `the backporter manual +`_ for more +information. diff --git a/doc/dev/differences-from-posix.rst b/doc/dev/differences-from-posix.rst index c1366e0c6aacc..1cc99428fe203 100644 --- a/doc/dev/differences-from-posix.rst +++ b/doc/dev/differences-from-posix.rst @@ -2,7 +2,6 @@ Differences from POSIX ======================== -.. 
todo:: delete http://ceph.com/wiki/Differences_from_POSIX Ceph does have a few places where it diverges from strict POSIX semantics for various reasons: diff --git a/doc/dev/documenting.rst b/doc/dev/documenting.rst index cc091705f8661..afd6efa952858 100644 --- a/doc/dev/documenting.rst +++ b/doc/dev/documenting.rst @@ -6,10 +6,10 @@ Code Documentation ================== C and C++ can be documented with Doxygen_, using the subset of Doxygen -markup supported by Asphyxiate_. +markup supported by Breathe_. .. _Doxygen: http://www.stack.nl/~dimitri/doxygen/ -.. _Asphyxiate: https://github.com/ceph/asphyxiate +.. _Breathe: https://github.com/michaeljones/breathe The general format for function documentation is:: diff --git a/doc/dev/erasure-coded-pool.rst b/doc/dev/erasure-coded-pool.rst index bdc0b62a0b03c..0043e171bff45 100644 --- a/doc/dev/erasure-coded-pool.rst +++ b/doc/dev/erasure-coded-pool.rst @@ -90,7 +90,7 @@ Choose an alternate erasure code plugin:: m=1 plugin=example technique=xor - $ ceph osd create ecpool 12 12 erasure \ + $ ceph osd pool create ecpool 12 12 erasure \ myprofile Display the default erasure code profile:: diff --git a/doc/dev/index.rst b/doc/dev/index.rst index 169bf6f3b6175..f18adaee0c549 100644 --- a/doc/dev/index.rst +++ b/doc/dev/index.rst @@ -35,4 +35,5 @@ in the body of the message. * osd_internals/index* + mds_internals/index* radosgw/index* diff --git a/doc/dev/mds_internals/data-structures.rst b/doc/dev/mds_internals/data-structures.rst new file mode 100644 index 0000000000000..1197b62f3a4b9 --- /dev/null +++ b/doc/dev/mds_internals/data-structures.rst @@ -0,0 +1,36 @@ +MDS internal data structures +============================== + +*CInode* + CInode contains the metadata of a file; there is one CInode for each file. + The CInode stores information like who owns the file and how big the file is. + +*CDentry* + CDentry is the glue that holds inodes and files together by relating inodes to + file/directory names. A CDentry links to at most one CInode (it may not link + to any CInode). A CInode may be linked by multiple CDentries. + +*CDir* + CDir only exists for a directory inode; it is used to link the CDentries under the + directory. A CInode can have multiple CDirs when the directory is fragmented. + +These data structures are linked together as:: + + CInode + CDir + | \ + | \ + | \ + CDentry CDentry + CInode CInode + CDir CDir + | | \ + | | \ + | | \ + CDentry CDentry CDentry + CInode CInode CInode + +As this doc is being written, the size of a CInode is about 1400 bytes, a CDentry +is about 400 bytes, and a CDir is about 700 bytes. These data structures are +quite large. Please be careful if you want to add new fields to them. + diff --git a/doc/dev/mds_internals/index.rst b/doc/dev/mds_internals/index.rst new file mode 100644 index 0000000000000..c8c82ad10da5a --- /dev/null +++ b/doc/dev/mds_internals/index.rst @@ -0,0 +1,10 @@ +============================== +MDS developer documentation +============================== + +.. rubric:: Contents + +.. toctree:: + :glob: + + * diff --git a/doc/dev/messenger.rst b/doc/dev/messenger.rst new file mode 100644 index 0000000000000..2b1a8881b93a9 --- /dev/null +++ b/doc/dev/messenger.rst @@ -0,0 +1,33 @@ +============================ + Messenger notes +============================ + +Messenger is the Ceph network layer implementation. Currently Ceph supports +three messenger types: "simple", "async" and "xio". The latter two are +experimental features and shouldn't be used in production environments.
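To get a feel for why the sizes quoted in doc/dev/mds_internals/data-structures.rst above matter, here is a back-of-the-envelope sketch. The per-structure figures come from that file; the assumption that every file or directory costs exactly one CInode plus one CDentry (plus one CDir per directory fragment) is a simplification::

    # Approximate sizes quoted in data-structures.rst, in bytes.
    CINODE, CDENTRY, CDIR = 1400, 400, 700

    def mds_cache_bytes(files: int, dirs: int, frags_per_dir: int = 1) -> int:
        # One CInode plus the CDentry linking it into its parent per entry;
        # each directory additionally carries one CDir per fragment.
        entries = files + dirs
        return entries * (CINODE + CDENTRY) + dirs * frags_per_dir * CDIR

    # One million files spread over ten thousand directories: about 1.7 GiB.
    print(mds_cache_bytes(files=1_000_000, dirs=10_000) / 2**30)

Even before counting locks, caps, and other per-object state, a million cached files costs on the order of gigabytes, which is why the text warns against adding new fields casually.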
+ +ceph_perf_msgr +============== + +ceph_perf_msgr is used to benchmark the messenger module in isolation and can help +to find bottlenecks or time-consuming paths within it. Much like +"iperf", the server-side program needs to be started first: + +# ./ceph_perf_msgr_server 172.16.30.181:10001 0 + +The first argument is the ip:port pair the server listens on, which the +client needs to specify as its destination address. The second argument sets the +"think time" used when dispatching messages. After Giant, the CEPH_OSD_OP message, +which is the actual client read/write io request, is fast dispatched without +queueing to the Dispatcher, in order to achieve better performance. Since the +CEPH_OSD_OP message is processed inline, the "think time" is used to mock this +inline processing. + +# ./ceph_perf_msgr_client 172.16.30.181:10001 1 32 10000 10 4096 + +The first argument specifies the server ip:port, and the second specifies the +number of client threads. The third argument specifies the concurrency (the +max inflight messages for each client thread), and the fourth the number of +ios issued to the server per client thread. The fifth argument indicates the +"think time" for each client thread when receiving messages, again used to +mock the client fast dispatch process. The last argument +specifies the message data length to issue. diff --git a/doc/dev/network-protocol.rst b/doc/dev/network-protocol.rst index cb4c6068145df..26e9d0ba1f491 100644 --- a/doc/dev/network-protocol.rst +++ b/doc/dev/network-protocol.rst @@ -17,7 +17,7 @@ Banner The first action is the server sending banner to the client. The banner is defined in ``CEPH_BANNER`` from ``src/include/msgr.h``. This is followed by -the server's then client's address each encoded as a ``sockaddr_storage``. +the server's then client's address each encoded as an ``entity_addr_t``. Once the client verifies that the servers banner matches its own it replies with its banner and its address. @@ -161,7 +161,7 @@ CEPH_MSGR_TAG_ACK (0x08) struct ceph_msgr_ack { u8 tag = 0x08; - u64le seq; // The sequence number of the message being acknoledged. + u64le seq; // The sequence number of the message being acknowledged. } CEPH_MSGR_TAG_KEEPALIVE (0x09) diff --git a/doc/dev/osd_internals/erasure_coding/jerasure.rst b/doc/dev/osd_internals/erasure_coding/jerasure.rst index af7631c16f111..27669a0b22639 100644 --- a/doc/dev/osd_internals/erasure_coding/jerasure.rst +++ b/doc/dev/osd_internals/erasure_coding/jerasure.rst @@ -26,8 +26,8 @@ implementation. It is a wrapper around the code found at and `https://github.com/ceph/gf-complete `_ , pinned to the latest stable version in *.gitmodules*. These repositories are copies of the -upstream repositories `https://bitbucket.org/jimplank/jerasure -`_ and -`https://bitbucket.org/jimplank/gf-complete -`_ . The difference +upstream repositories `http://jerasure.org/jerasure/jerasure +`_ and +`http://jerasure.org/jerasure/gf-complete +`_ . The difference between the two, if any, should match pull requests against upstream. diff --git a/doc/dev/osd_internals/erasure_coding/pgbackend.rst b/doc/dev/osd_internals/erasure_coding/pgbackend.rst index 0751af97eb193..db602991af4ed 100644 --- a/doc/dev/osd_internals/erasure_coding/pgbackend.rst +++ b/doc/dev/osd_internals/erasure_coding/pgbackend.rst @@ -39,8 +39,8 @@ and erasure coding which PGBackend must abstract over: 5. Selection of a pgtemp for backfill may differ between replicated and erasure coded backends. 6. 
The set of necessary OSDs from a particular interval required to - to continue peering may differ between replicated and erasure - coded backends. + continue peering may differ between replicated and erasure coded + backends. 7. The selection of the authoritative log may differ between replicated and erasure coded backends. diff --git a/doc/dev/osd_internals/last_epoch_started.rst b/doc/dev/osd_internals/last_epoch_started.rst new file mode 100644 index 0000000000000..fa86b4a3c8128 --- /dev/null +++ b/doc/dev/osd_internals/last_epoch_started.rst @@ -0,0 +1,60 @@ +====================== +last_epoch_started +====================== + +info.last_epoch_started records an activation epoch e for interval i +such that all writes committed in i or earlier are reflected in the +local info/log and no writes after i are reflected in the local +info/log. Since no committed write is ever divergent, even if we +get an authoritative log/info with an older info.last_epoch_started, +we can leave our info.last_epoch_started alone since no writes could +have committed in any intervening interval (See PG::proc_master_log). + +info.history.last_epoch_started records a lower bound on the most +recent interval in which the pg as a whole went active and accepted +writes. On a particular osd, it is also an upper bound on the +activation epoch of intervals in which writes in the local pg log +occurred (we update it before accepting writes). Because all +committed writes are committed by all acting set osds, any +non-divergent writes ensure that history.last_epoch_started was +recorded by all acting set members in the interval. Once peering has +queried one osd from each interval back to some seen +history.last_epoch_started, it follows that no interval after the max +history.last_epoch_started can have reported writes as committed +(since we record it before recording client writes in an interval). +Thus, the minimum last_update across all infos with +info.last_epoch_started >= MAX(history.last_epoch_started) must be an +upper bound on writes reported as committed to the client. + +We update info.last_epoch_started with the initial activation message, +but we only update history.last_epoch_started after the new +info.last_epoch_started is persisted (possibly along with the first +write). This ensures that we do not require an osd with the most +recent info.last_epoch_started until all acting set osds have recorded +it. + +In find_best_info, we do include info.last_epoch_started values when +calculating the max_last_epoch_started_found because we want to avoid +designating a log entry divergent which in a prior interval would have +been non-divergent since it might have been used to serve a read. In +activate(), we use the peer's last_epoch_started value as a bound on +how far back divergent log entries can be found. + +However, in a case like + +.. code:: none + + calc_acting osd.0 1.4e( v 473'302 (292'200,473'302] local-les=473 n=4 ec=5 les/c 473/473 556/556/556 + calc_acting osd.1 1.4e( v 473'302 (293'202,473'302] lb 0//0//-1 local-les=477 n=0 ec=5 les/c 473/473 556/556/556 + calc_acting osd.4 1.4e( v 473'302 (120'121,473'302] local-les=473 n=4 ec=5 les/c 473/473 556/556/556 + calc_acting osd.5 1.4e( empty local-les=0 n=0 ec=5 les/c 473/473 556/556/556 + +since osd.1 is the only one which recorded info.les=477 while 4,0 +which were the acting set in that interval did not (4 restarted and 0 +did not get the message in time) the pg is marked incomplete when +either 4 or 0 would have been valid choices.
To avoid this, we do not +consider info.les for incomplete peers when calculating +min_last_epoch_started_found. It would not have been in the acting +set, so we must have another osd from that interval anyway (if +maybe_went_rw). If that osd does not remember that info.les, then we +cannot have served reads. diff --git a/doc/dev/osd_internals/map_message_handling.rst b/doc/dev/osd_internals/map_message_handling.rst index d7b4031beae3f..82ebf63322977 100644 --- a/doc/dev/osd_internals/map_message_handling.rst +++ b/doc/dev/osd_internals/map_message_handling.rst @@ -66,7 +66,7 @@ MOSDPGOps follow the following process: 1. OSD::handle_op: validates permissions and crush mapping. discard the request if they are not connected and the client cannot get the reply ( See OSD::op_is_discardable ) See OSDService::handle_misdirected_op - See OSD::op_has_sufficient_caps + See PG::op_has_sufficient_caps See OSD::require_same_or_newer_map 2. OSD::enqueue_op @@ -118,7 +118,7 @@ Peering messages are tagged with two epochs: These are the same in cases where there was no triggering message. We discard a peering message if the message's query_epoch if the PG in question has entered -a new epoch (See PG::old_peering_event, PG::queue_peering_event). Notifies, +a new epoch (See PG::old_peering_evt, PG::queue_peering_event). Notifies, infos, notifies, and logs are all handled as PG::RecoveryMachine events and are wrapped by PG::queue_* by PG::CephPeeringEvts, which include the created state machine event along with epoch_sent and query_epoch in order to diff --git a/doc/dev/osd_internals/snaps.rst b/doc/dev/osd_internals/snaps.rst index 825573825cc22..4afb4602fb98f 100644 --- a/doc/dev/osd_internals/snaps.rst +++ b/doc/dev/osd_internals/snaps.rst @@ -58,7 +58,7 @@ Snap Removal To remove a snapshot, a request is made to the *Monitor* cluster to add the snapshot id to the list of purged snaps (or to remove it from the set of pool snaps in the case of *pool snaps*). In either case, -the *PG* adds the snap to its *snaptrimq* for trimming. +the *PG* adds the snap to its *snap_trimq* for trimming. A clone can be removed when all of its snaps have been removed. In order to determine which clones might need to be removed upon snap @@ -70,7 +70,7 @@ See ReplicatedPG::SnapTrimmer, SnapMapper This trimming is performed asynchronously by the snap_trim_wq while the pg is clean and not scrubbing. - #. The next snap in PG::snaptrimq is selected for trimming + #. The next snap in PG::snap_trimq is selected for trimming #. We determine the next object for trimming out of PG::snap_mapper. For each object, we create a log entry and repop updating the object info and the snap set (including adjusting the overlaps). diff --git a/doc/dev/peering.rst b/doc/dev/peering.rst index ed40589ba195b..63574cac49dbe 100644 --- a/doc/dev/peering.rst +++ b/doc/dev/peering.rst @@ -169,20 +169,20 @@ The high level process is for the current PG primary to: we learn about a *last epoch started* that is newer than our own, we can prune older *past intervals* and reduce the peer OSDs we need to contact. - 5. if anyone else has (in his PG log) operations that I do not have, + 5. if anyone else has (in its PG log) operations that I do not have, instruct them to send me the missing log entries so that the primary's *PG log* is up to date (includes the newest write).. 5. 
for each member of the current *acting set*: - a) ask him for copies of all PG log entries since *last epoch start* + a) ask it for copies of all PG log entries since *last epoch start* so that I can verify that they agree with mine (or know what - objects I will be telling him to delete). + objects I will be telling it to delete). If the cluster failed before an operation was persisted by all members of the *acting set*, and the subsequent *peering* did not remember that operation, and a node that did remember that - operation later rejoined, his logs would record a different + operation later rejoined, its logs would record a different (divergent) history than the *authoritative history* that was reconstructed in the *peering* after the failure. @@ -193,8 +193,8 @@ The high level process is for the current PG primary to: any OSD that stores data from a divergent update to delete the affected (and now deemed to be apocryphal) objects. - b) ask him for his *missing set* (object updates recorded - in his PG log, but for which he does not have the new data). + b) ask it for its *missing set* (object updates recorded + in its PG log, but for which it does not have the new data). This is the list of objects that must be fully replicated before we can accept writes. diff --git a/doc/dev/quick_guide.rst b/doc/dev/quick_guide.rst index c23f56a8ffd64..6a4fe08eb4fc3 100644 --- a/doc/dev/quick_guide.rst +++ b/doc/dev/quick_guide.rst @@ -7,23 +7,18 @@ This guide will describe how to build and test Ceph for development. Development ----------- -After installing the dependencies described in the ``README``, -prepare the git source tree by updating the submodules +The ``run-make-check.sh`` script will install Ceph dependencies, +compile everything in debug mode and run a number of tests to verify +the result behaves as expected. .. code:: - git submodule update --init + $ ./run-make-check.sh -To build the server daemons, and FUSE client, execute the following: - -.. code:: - - ./do_autogen.sh -d 1 - make -j [number of cpus] Running a development deployment -------------------------------- -Ceph contains a script called ``vstart.sh`` which allows developers to quickly test their code using +Ceph contains a script called ``vstart.sh`` (see also :doc:`/dev/dev_cluster_deployement`) which allows developers to quickly test their code using a simple deployment on your development system. Once the build finishes successfully, start the ceph deployment using the following command: diff --git a/doc/dev/rbd-diff.rst b/doc/dev/rbd-diff.rst index 8cba29c30a37e..bff9582ae0698 100644 --- a/doc/dev/rbd-diff.rst +++ b/doc/dev/rbd-diff.rst @@ -37,7 +37,7 @@ Size ---- - u8: 's' -- u64: (ending) image size +- le64: (ending) image size Data Records ~~~~~~~~~~~~ diff --git a/doc/dev/release-process.rst b/doc/dev/release-process.rst index 0471415fc142a..f7e853b1ae478 100644 --- a/doc/dev/release-process.rst +++ b/doc/dev/release-process.rst @@ -43,7 +43,7 @@ In the ceph source directory, checkout next branch (for point releases use the { Checkout the submodules:: - git submodule update --init + git submodule update --force --init --recursive 4. 
Update Build version numbers ================================ diff --git a/doc/dev/sepia.rst b/doc/dev/sepia.rst index a9f26932a8228..6f83b2c19b18d 100644 --- a/doc/dev/sepia.rst +++ b/doc/dev/sepia.rst @@ -1,101 +1,9 @@ -Notes on the Sepia community test lab -===================================== +Sepia community test lab +======================== The Ceph community maintains a test lab that is open to active -contributors to the Ceph project. +contributors to the Ceph project. Please see the `Sepia repository`_ for more +information. -The lab is currently located in DreamHost's Irvine, CA data center. There are -about 15 racks of gear. +.. _Sepia repository: https://github.com/ceph/sepia -Hardware loans or donations are gladly accepted and will be put to -good use running regression and other automated testing. - - -E-mail and IRC --------------- - -Acitivity within the lab is coordinated in two places: - -* `sepia@ceph.com`_ email discussion list. - -* #sepia on irc.oftc.net - -.. _sepia@ceph.com: http://lists.ceph.com/listinfo.cgi/ceph-qa-ceph.com/ - - -Hardware overview ------------------ - -* **96 plana**: 1u Dell R410. 4 core/8 thread Intel E5620 2.4GHz. 8G RAM. 4x 500GB SATA. 1gig and 10gig network. - -* **64 burnupi**: 2u Dell R515. 16G RAM. 6 core. 8x1T SAS disk. 1gig and 10gig network. Crummy SAS expanders. - -* **122 mira**: 2u Supermicro. 4 core/8 thread. 16G RAM. 8x1T SATA disks. 1gig network. - -* 8 vercoi: 2u Dell C6100 (2 nodes/chassis); 12 core/24 thread; 72G RAM; 4x500GB RAID; VM hosts - -* 4 senta: 1u Supermicro; 12 core/24 thread; 72G RAM; 4x500GB MDADM; VM hosts - -* 44 saya: Calxeda armv7l nodes (for testing); 36x Highbank (4 core/4GB RAM), 12x Midway (4 core/8GB ram) - -* 24 tala: Calxeda armv7l nodes (gitbuilder); 24x Highbank (4 core/4GB RAM) - -* ~200 vps: VMs running on mira hardware (25 nodes) - -* 4 rex: 1u Supermicro; 12 core/24 thread; 64G RAM; 256G SSD; 1TB HD - -* 4 rhoda: 4u Supermicro (FatTwin, 4 nodes/chassis); 4 core/8 thread; 32G RAM; 10x4TB; 2x250G - -* 2 apama: 4u HP SL4540 (2 nodes/chassis); 8 core; 48G RAM; 2x500G (raid/boot); 25x3TB - -* 4 Intel 910 series PCI-E SSDs (2x800G, 2x400G) - -* 6 Force10 S4810 switches (2x AGG, 4x EDGE) (10G network) - -* 4 Cisco 2960G switches (1G network) - -* 5 Cisco 3560G switches (1G network) - -* 8 Cisco 2960 switches (100M OOB network/IPMI) - -* 8 Mellanox 40G NICs - -* 1 Mellanox 40G Switch - - -Access ------- - -We use openvpn to grant access to the lab network. Public ssh keys are used to -grant access to individual machines. - - -Locking machines ----------------- - -* All tests pull their builds from gitbuilder.ceph.com. - -* Anybody can lock machines with ``teuthology-lock --lock-many NUM - --machine-type TYPE``. - -* Machines are locked as ``whoami''@``hostname -s``. --owner to - choose otherwise. - -* Automated tests current run on the ``plana``; please avoid locking - these for personal use. - -* To unlock, please use ``teuthology-nuke -t list.yaml -r -u``, which - will reboot and clean up any leftover test state before unlocking - (or fail to unlock). It looks for a ``targets::`` section in the - yaml, so the regular job yaml will work. You can get a list of all - locked machines with ``teuthology-lock --list-targets``. - -* ``teuthology-lock -a --brief`` or ``teuthology-lock --summary`` to - see what is locked and by whom. - -* Be conscientious about scheduling entire qa runs. Coordinate - utilization on IRC. Make sure you are running the latest version - ceph-qa-suite.git and teuthology.git. 
- -* Results for scheduled runs appear in /a/$jobname on the teuthology - machine. diff --git a/doc/dev/session_authentication.rst b/doc/dev/session_authentication.rst index fd2651dd05d6c..e8a5059c69e39 100644 --- a/doc/dev/session_authentication.rst +++ b/doc/dev/session_authentication.rst @@ -5,7 +5,7 @@ Peter Reiher 7/30/12 The original Cephx protocol authenticated the client to the authenticator and set up a session -key used to authenticate the client to the server he needs to talk to. It did not, however, +key used to authenticate the client to the server it needs to talk to. It did not, however, authenticate the ongoing messages between the client and server. Based on the fact that they share a secret key, these ongoing session messages can be easily authenticated by using the key to sign the messages. diff --git a/doc/images/stack.png b/doc/images/stack.png index d297e42584b52..38eac41b01a4e 100644 Binary files a/doc/images/stack.png and b/doc/images/stack.png differ diff --git a/doc/index.rst b/doc/index.rst index c375b8e4f30e4..5bf64656ecba0 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -101,4 +101,5 @@ about Ceph, see our `Architecture`_ section. architecture Development release-notes + releases Glossary diff --git a/doc/install/clone-source.rst b/doc/install/clone-source.rst index e4af3e41c411e..fe67857b9a8a7 100644 --- a/doc/install/clone-source.rst +++ b/doc/install/clone-source.rst @@ -83,7 +83,7 @@ repository. If your submodules are out of date, run:: - git submodule update + git submodule update --force --init --recursive Choose a Branch =============== diff --git a/doc/install/get-packages.rst b/doc/install/get-packages.rst index f15e73c5eb1ea..b4492ec6f4244 100644 --- a/doc/install/get-packages.rst +++ b/doc/install/get-packages.rst @@ -124,6 +124,10 @@ Add our Ceph Extras package repository to your system's list of APT sources. :: RPM Packages ------------ +.. note:: ceph-extras on RPM-based systems is only needed on EL6-based + distributions (RHEL 6, CentOS 6, Scientific Linux 6). It is not needed + for Fedora or RHEL 7+. + For RPM packages, add our package repository to your ``/etc/yum.repos.d`` repos (e.g., ``ceph-extras.repo``). Some Ceph packages (e.g., QEMU) must take priority over standard packages, so you must ensure that you set ``priority=2``. :: diff --git a/doc/install/install-ceph-gateway.rst b/doc/install/install-ceph-gateway.rst index a1c1799bea853..ded9f35d643ea 100644 --- a/doc/install/install-ceph-gateway.rst +++ b/doc/install/install-ceph-gateway.rst @@ -2,227 +2,105 @@ Install Ceph Object Gateway ============================= -The :term:`Ceph Object Gateway` daemon runs on Apache and FastCGI. +.. note:: To run the Ceph object gateway service, you should have a running + Ceph cluster, the gateway host should have access to storage and public + networks, and SELinux should be in permissive mode in rpm-based distros. -To run a :term:`Ceph Object Storage` service, you must install Apache and -FastCGI. Then, you must install the Ceph Object Gateway daemon. The Ceph Object -Gateway supports 100-continue, but you must install Ceph builds of Apache and -FastCGI for 100-continue support. To install the Ceph Object Gateway, first -install and configure Apache and FastCGI. Then, install the Ceph Object Gateway -daemon. If you plan to run a Ceph Object Storage service with a federated -architecture (multiple regions and zones), you must also install the -synchronization agent. - -See `Get Packages`_ for information on adding Ceph packages to each Ceph Node. 
-Ensure that you have executed those steps on each Ceph Node first. - - -Apache/FastCGI w/out 100-Continue -================================= - -You may use standard Apache and FastCGI packages for your Ceph Object -Gateways. However, they will not provide 100-continue support. - -Debian Packages ---------------- - -To install Apache and FastCGI Debian packages, execute the following:: - - sudo apt-get install apache2 libapache2-mod-fastcgi - - -RPM Packages ------------- - -To install Apache and FastCGI RPMs, execute the following:: - - sudo rpm -ivh fcgi-2.4.0-10.el6.x86_64.rpm - sudo rpm -ivh mod_fastcgi-2.4.6-2.el6.rf.x86_64.rpm - -Or:: - - sudo yum install httpd mod_fastcgi - - -Apache/FastCGI w/ 100-Continue -============================== - -The Ceph community provides a slightly optimized version of the ``apache2`` -and ``fastcgi`` packages. The material difference is that the Ceph packages are -optimized for the ``100-continue`` HTTP response, where the server determines -if it will accept the request by first evaluating the request header. See `RFC -2616, Section 8`_ for details on ``100-continue``. You can find the most recent -builds of Apache and FastCGI packages modified for Ceph at `gitbuilder.ceph.com`_. - - -Debian Packages ---------------- - -#. Add the development key:: - - wget -q -O- https://raw.github.com/ceph/ceph/master/keys/autobuild.asc | sudo apt-key add - - -#. Add a ``ceph-apache.list`` file to your APT sources. :: - - echo deb http://gitbuilder.ceph.com/apache2-deb-$(lsb_release -sc)-x86_64-basic/ref/master $(lsb_release -sc) main | sudo tee /etc/apt/sources.list.d/ceph-apache.list - -#. Add a ``ceph-fastcgi.list`` file to your APT sources. :: - - echo deb http://gitbuilder.ceph.com/libapache-mod-fastcgi-deb-$(lsb_release -sc)-x86_64-basic/ref/master $(lsb_release -sc) main | sudo tee /etc/apt/sources.list.d/ceph-fastcgi.list - -#. Update your repository and install Apache and FastCGI:: - - sudo apt-get update && sudo apt-get install apache2 libapache2-mod-fastcgi +The :term:`Ceph Object Gateway` daemon runs on Apache and FastCGI. +To run a :term:`Ceph Object Storage` service, you must install Apache and +Ceph Object Gateway daemon on the host that is going to provide the gateway +service, i.e, the ``gateway host``. If you plan to run a Ceph Object Storage +service with a federated architecture (multiple regions and zones), you must +also install the synchronization agent. -RPM Packages ------------- - -To install Apache with 100-continue, execute the following steps: - -#. Install ``yum-plugin-priorities``. :: - - sudo yum install yum-plugin-priorities - -#. Ensure ``/etc/yum/pluginconf.d/priorities.conf`` exists. - -#. Ensure ``priorities.conf`` enables the plugin. :: - - [main] - enabled = 1 - -#. Add a ``ceph-apache.repo`` file to ``/etc/yum.repos.d``. Replace - ``{distro}`` with the name of your distribution (e.g., ``centos6``, - ``rhel6``, etc.) :: - - [apache2-ceph-noarch] - name=Apache noarch packages for Ceph - baseurl=http://gitbuilder.ceph.com/apache2-rpm-{distro}-x86_64-basic/ref/master - enabled=1 - priority=2 - gpgcheck=1 - type=rpm-md - gpgkey=https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/autobuild.asc - - [apache2-ceph-source] - name=Apache source packages for Ceph - baseurl=http://gitbuilder.ceph.com/apache2-rpm-{distro}-x86_64-basic/ref/master - enabled=0 - priority=2 - gpgcheck=1 - type=rpm-md - gpgkey=https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/autobuild.asc - - -#. Add a ``ceph-fastcgi.repo`` file to ``/etc/yum.repos.d``. 
Replace - ``{distro}`` with the name of your distribution (e.g., ``centos6``, - ``rhel6``, etc.) :: - - [fastcgi-ceph-basearch] - name=FastCGI basearch packages for Ceph - baseurl=http://gitbuilder.ceph.com/mod_fastcgi-rpm-{distro}-x86_64-basic/ref/master - enabled=1 - priority=2 - gpgcheck=1 - type=rpm-md - gpgkey=https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/autobuild.asc - - [fastcgi-ceph-noarch] - name=FastCGI noarch packages for Ceph - baseurl=http://gitbuilder.ceph.com/mod_fastcgi-rpm-{distro}-x86_64-basic/ref/master - enabled=1 - priority=2 - gpgcheck=1 - type=rpm-md - gpgkey=https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/autobuild.asc - - [fastcgi-ceph-source] - name=FastCGI source packages for Ceph - baseurl=http://gitbuilder.ceph.com/mod_fastcgi-rpm-{distro}-x86_64-basic/ref/master - enabled=0 - priority=2 - gpgcheck=1 - type=rpm-md - gpgkey=https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/autobuild.asc +.. note:: Previous versions of Ceph shipped with ``mod_fastcgi``. The current + version ships with ``mod_proxy_fcgi`` instead. - If the repository doesn't have a ``noarch`` section, you may remove the - ``noarch`` entry above. +In distros that ship Apache 2.4 (such as RHEL 7, CentOS 7 or Ubuntu 14.04 +``Trusty``), ``mod_proxy_fcgi`` is already present. When you install the +``httpd`` package with ``yum`` or the ``apache2`` package with ``apt-get``, +``mod_proxy_fcgi`` becomes available for use on your server. +In distros that ship Apache 2.2 (such as RHEL 6, CentOS 6 or Ubuntu 12.04 +``Precise``), ``mod_proxy_fcgi`` comes as a separate package. In +**RHEL 6/CentOS 6**, it is available in ``EPEL 6`` repo and can be installed with +``yum install mod_proxy_fcgi``. For **Ubuntu 12.04**, a backport for +``mod_proxy_fcgi`` is in progress and a bug has been filed for the same. +See: `ceph radosgw needs mod-proxy-fcgi for apache 2.2`_ -#. Update your repository. On RHEL systems, enable the - ``rhel-6-server-optional-rpms`` repository. :: - sudo yum update --enablerepo=rhel-6-server-optional-rpms +Install Apache +============== -#. Install Apache and FastCGI. :: +To install Apache on the ``gateway host``, execute the following: - sudo yum update && sudo yum install httpd mod_fastcgi +On Debian-based distros, run:: + sudo apt-get install apache2 -Configure Apache/FastCGI -======================== +On RPM-based distros, run:: -To complete the installation, ensure that you have the rewrite module -enabled and FastCGI enabled. The steps differ slightly based upon the -type of package installation. + sudo yum install httpd -Debian-based Packages ---------------------- -#. Open the ``apache2.conf`` file. :: +Configure Apache +================ - sudo vim /etc/apache2/apache2.conf +Make the following changes in Apache's configuration on the ``gateway host``: +Debian-based distros +-------------------- -#. Add a line for the ``ServerName`` in the Apache configuration file. - Provide the fully qualified domain name of the server machine - (e.g., ``hostname -f``). :: +#. Add a line for the ``ServerName`` in ``/etc/apache2/apache2.conf``. Provide + the fully qualified domain name of the server machine + (e.g., ``hostname -f``):: ServerName {fqdn} -#. Enable the URL rewrite modules for Apache and FastCGI. :: - - sudo a2enmod rewrite - sudo a2enmod fastcgi +#. Load ``mod_proxy_fcgi`` module. + Execute:: -#. Restart Apache so that the foregoing changes take effect. :: - - sudo service apache2 restart + sudo a2enmod proxy_fcgi +#. 
Start Apache service:: -RPM-based Packages ------------------- + sudo service apache2 start +RPM-based distros +----------------- -#. Open the ``httpd.conf`` file. :: +#. Open the ``httpd.conf`` file:: sudo vim /etc/httpd/conf/httpd.conf -#. Uncomment ``#ServerName`` and add the name of your server. - Provide the fully qualified domain name of the server machine - (e.g., ``hostname -f``).:: +#. Uncomment ``#ServerName`` in the file and add the name of your server. Provide + the fully qualified domain name of the server machine + (e.g., ``hostname -f``):: ServerName {fqdn} -#. Ensure that the Rewrite module is enabled. :: +#. Update ``/etc/httpd/conf/httpd.conf`` to load ``mod_proxy_fcgi`` module. Add + the following to the file:: - #if not present, add: - LoadModule rewrite_module modules/mod_rewrite.so + + LoadModule proxy_fcgi_module modules/mod_proxy_fcgi.so + -#. Save the ``httpd.conf`` file. +#. Edit the line ``Listen 80`` in ``/etc/httpd/conf/httpd.conf`` with the public + IP address of the host that you are configuring as a gateway server. Write + ``Listen {IP ADDRESS}:80`` in place of ``Listen 80``. -#. Ensure that the FastCGI module is enabled. The installer should - include an ``/etc/httpd/conf.d/fastcgi.conf`` file that loads the - FastCGI module. :: +#. Start httpd service - #if not present, add: - LoadModule fastcgi_module modules/mod_fastcgi.so + Execute:: -#. Restart Apache so that the foregoing changes take effect.. :: + sudo service httpd start - sudo /etc/init.d/httpd restart + Or:: + sudo systemctl start httpd Enable SSL @@ -236,25 +114,25 @@ for Apache. Use the following procedures to enable SSL. a SSL certificate from a trusted authority to use those client APIs. -Debian Packages ---------------- +Debian-based distros +-------------------- -To enable SSL for Debian/Ubuntu systems, execute the following steps: +To enable SSL on Debian-based distros, execute the following steps: -#. Ensure that you have installed the dependencies. :: +#. Ensure that you have installed the dependencies:: sudo apt-get install openssl ssl-cert -#. Enable the SSL module. :: +#. Enable the SSL module:: sudo a2enmod ssl -#. Generate a certificate. :: +#. Generate a certificate:: sudo mkdir /etc/apache2/ssl sudo openssl req -x509 -nodes -days 365 -newkey rsa:2048 -keyout /etc/apache2/ssl/apache.key -out /etc/apache2/ssl/apache.crt -#. Restart Apache. :: +#. Restart Apache:: sudo service apache2 restart @@ -262,72 +140,60 @@ To enable SSL for Debian/Ubuntu systems, execute the following steps: See the `Ubuntu Server Guide`_ for additional details. -RPM Packages ------------- +RPM-based distros +----------------- -To enable SSL for RPM-based systems, execute the following steps: +To enable SSL on RPM-based distros, execute the following steps: -#. Ensure that you have installed the dependencies. :: +#. Ensure that you have installed the dependencies:: sudo yum install mod_ssl openssl -#. Ensure the SSL module is enabled. +#. Generate private key:: + + openssl genrsa -out ca.key 2048 + +#. Generate CSR:: + + openssl req -new -key ca.key -out ca.csr -#. Generate a certificate and copy it to the appropriate locations. :: +#. Generate a certificate:: openssl x509 -req -days 365 -in ca.csr -signkey ca.key -out ca.crt - cp ca.crt /etc/pki/tls/certs - cp ca.key /etc/pki/tls/private/ca.key - cp ca.csr /etc/pki/tls/private/ca.csr -#. Restart Apache. :: +#. 
Copy the files to appropriate locations:: - sudo /etc/init.d/httpd restart + sudo cp ca.crt /etc/pki/tls/certs + sudo cp ca.key /etc/pki/tls/private/ca.key + sudo cp ca.csr /etc/pki/tls/private/ca.csr -See `Setting up an SSL secured Webserver with CentOS`_ for additional details. +#. Update the Apache SSL configuration file ``/etc/httpd/conf.d/ssl.conf``. + Give the correct location of ``SSLCertificateFile``:: + SSLCertificateFile /etc/pki/tls/certs/ca.crt -Add Wildcard to DNS -=================== + Give the correct location of ``SSLCertificateKeyFile``:: -To use Ceph with S3-style subdomains (e.g., ``bucket-name.domain-name.com``), -you need to add a wildcard to the DNS record of the DNS server you use with the -``radosgw`` daemon. + SSLCertificateKeyFile /etc/pki/tls/private/ca.key -.. tip:: The address of the DNS must also be specified in the Ceph - configuration file with the ``rgw dns name = {hostname}`` setting. + Save the changes. -For ``dnsmasq``, consider addding the following ``address`` setting with a dot -(.) prepended to the host name:: +#. Restart Apache. - address=/.{hostname-or-fqdn}/{host-ip-address} - address=/.ceph-node/192.168.0.1 + Execute:: -For ``bind``, consider adding the a wildcard to the DNS record:: + sudo service httpd restart - $TTL 604800 - @ IN SOA ceph-node. root.ceph-node. ( - 2 ; Serial - 604800 ; Refresh - 86400 ; Retry - 2419200 ; Expire - 604800 ) ; Negative Cache TTL - ; - @ IN NS ceph-node. - @ IN A 192.168.122.113 - * IN CNAME @ + Or:: -Restart your DNS server and ping your server with a subdomain to -ensure that your Ceph Object Store ``radosgw`` daemon can process -the subdomain requests. :: + sudo systemctl restart httpd - ping mybucket.{fqdn} - ping mybucket.ceph-node - +See `Setting up an SSL secured Webserver with CentOS`_ for additional details. -Install Ceph Object Gateway -=========================== + +Install Ceph Object Gateway Daemon +================================== Ceph Object Storage services use the Ceph Object Gateway daemon (``radosgw``) to enable the gateway. For federated architectures, the synchronization @@ -335,10 +201,11 @@ agent (``radosgw-agent``) provides data and metadata synchronization between zones and regions. -Debian Packages ---------------- +Debian-based distros +-------------------- -To install the Ceph Object Gateway daemon, execute the following:: +To install the Ceph Object Gateway daemon on the `gateway host`, execute the +following:: sudo apt-get install radosgw @@ -349,13 +216,13 @@ following:: sudo apt-get install radosgw-agent -RPM Packages ------------- +RPM-based distros +----------------- -To install the Ceph Object Gateway daemon, execute the +To install the Ceph Object Gateway daemon on the ``gateway host``, execute the following:: - sudo yum install ceph-radosgw ceph + sudo yum install ceph-radosgw To install the Ceph Object Gateway synchronization agent, execute the @@ -380,13 +247,9 @@ to configure your Ceph Object Gateway. There are two approaches: Ceph Object Gateway instances with regions and zones. Choose the approach that best reflects your cluster. - -.. _Get Packages: ../get-packages +.. _ceph radosgw needs mod-proxy-fcgi for apache 2.2: https://bugs.launchpad.net/precise-backports/+bug/1422417 .. _Ubuntu Server Guide: https://help.ubuntu.com/12.04/serverguide/httpd.html .. _Setting up an SSL secured Webserver with CentOS: http://wiki.centos.org/HowTos/Https -.. _RFC 2616, Section 8: http://www.w3.org/Protocols/rfc2616/rfc2616-sec8.html -.. 
_gitbuilder.ceph.com: http://gitbuilder.ceph.com -.. _Installing YUM Priorities: ../yum-priorities .. _simple: ../../radosgw/config .. _federated: ../../radosgw/federated-config diff --git a/doc/install/install-vm-cloud.rst b/doc/install/install-vm-cloud.rst index 45d4c9f4c3307..8bdb1e8b85111 100644 --- a/doc/install/install-vm-cloud.rst +++ b/doc/install/install-vm-cloud.rst @@ -57,6 +57,10 @@ To install QEMU, execute the following: [main] enabled = 1 +.. note:: ceph-extras on RPM-based systems is only needed on EL6-based + distributions (RHEL 6, CentOS 6, Scientific Linux 6). It is not needed + for Fedora or RHEL 7+. + #. Create a ``/etc/yum.repos.d/ceph-extras.repo`` file with the following contents, and replace ``{distro}`` with your Linux distribution. Follow the ``baseurl`` path below to see which distributions Ceph supports:: diff --git a/doc/install/manual-deployment.rst b/doc/install/manual-deployment.rst index dce8716f6d155..d7506ec8ba232 100644 --- a/doc/install/manual-deployment.rst +++ b/doc/install/manual-deployment.rst @@ -192,7 +192,7 @@ The procedure is as follows: #. Populate the monitor daemon(s) with the monitor map and keyring. :: - ceph-mon --mkfs -i {hostname} --monmap /tmp/monmap --keyring /tmp/ceph.mon.keyring + ceph-mon [--cluster {cluster-name}] --mkfs -i {hostname} --monmap /tmp/monmap --keyring /tmp/ceph.mon.keyring For example:: @@ -248,7 +248,7 @@ The procedure is as follows: For Ubuntu, use Upstart:: - sudo start ceph-mon id=node1 + sudo start ceph-mon id=node1 [cluster={cluster-name}] In this case, to allow the start of the daemon at each reboot you must create two empty files like this:: @@ -360,13 +360,13 @@ OSDs with the long form procedure, execute the following on ``node2`` and OSD starts up. The following command will output the OSD number, which you will need for subsequent steps. :: - ceph osd create [{uuid}] + ceph osd create [{uuid} [{id}]] #. Create the default directory on your new OSD. :: ssh {new-osd-host} - sudo mkdir /var/lib/ceph/osd/ceph-{osd-number} + sudo mkdir /var/lib/ceph/osd/{cluster-name}-{osd-number} #. If the OSD is for a drive other than the OS drive, prepare it @@ -374,7 +374,7 @@ OSDs with the long form procedure, execute the following on ``node2`` and ssh {new-osd-host} sudo mkfs -t {fstype} /dev/{hdd} - sudo mount -o user_xattr /dev/{hdd} /var/lib/ceph/osd/ceph-{osd-number} + sudo mount -o user_xattr /dev/{hdd} /var/lib/ceph/osd/{cluster-name}-{osd-number} #. Initialize the OSD data directory. :: @@ -391,12 +391,12 @@ OSDs with the long form procedure, execute the following on ``node2`` and ``ceph-{osd-num}`` in the path is the ``$cluster-$id``. If your cluster name differs from ``ceph``, use your cluster name instead.:: - sudo ceph auth add osd.{osd-num} osd 'allow *' mon 'allow profile osd' -i /var/lib/ceph/osd/ceph-{osd-num}/keyring + sudo ceph auth add osd.{osd-num} osd 'allow *' mon 'allow profile osd' -i /var/lib/ceph/osd/{cluster-name}-{osd-num}/keyring #. Add your Ceph Node to the CRUSH map. :: - ceph osd crush add-bucket {hostname} host + ceph [--cluster {cluster-name}] osd crush add-bucket {hostname} host For example:: @@ -413,7 +413,7 @@ OSDs with the long form procedure, execute the following on ``node2`` and bucket (if it's not already in the CRUSH map), add the device as an item in the host, assign it a weight, recompile it and set it. :: - ceph osd crush add {id-or-name} {weight} [{bucket-type}={bucket-name} ...] 
+ ceph [--cluster {cluster-name}] osd crush add {id-or-name} {weight} [{bucket-type}={bucket-name} ...] For example:: @@ -426,7 +426,7 @@ OSDs with the long form procedure, execute the following on ``node2`` and For Ubuntu, use Upstart:: - sudo start ceph-osd id={osd-num} + sudo start ceph-osd id={osd-num} [cluster={cluster-name}] For example:: @@ -435,7 +435,7 @@ OSDs with the long form procedure, execute the following on ``node2`` and For Debian/CentOS/RHEL, use sysvinit:: - sudo /etc/init.d/ceph start osd.{osd-num} + sudo /etc/init.d/ceph start osd.{osd-num} [--cluster {cluster-name}] For example:: @@ -445,11 +445,12 @@ OSDs with the long form procedure, execute the following on ``node2`` and In this case, to allow the start of the daemon at each reboot you must create an empty file like this:: - sudo touch /var/lib/ceph/mon/{cluster-name}-{hostname}/sysvinit + sudo touch /var/lib/ceph/osd/{cluster-name}-{osd-num}/sysvinit For example:: - sudo touch /var/lib/ceph/mon/ceph-node1/sysvinit + sudo touch /var/lib/ceph/osd/ceph-0/sysvinit + sudo touch /var/lib/ceph/osd/ceph-1/sysvinit Once you start your OSD, it is ``up`` and ``in``. diff --git a/doc/install/upgrading-ceph.rst b/doc/install/upgrading-ceph.rst index b7e8c117f186c..b97a6d30bcb13 100644 --- a/doc/install/upgrading-ceph.rst +++ b/doc/install/upgrading-ceph.rst @@ -691,7 +691,7 @@ cluster, we recommend upgrading ``ceph-common`` and client libraries #. Upgrade the package:: ssh {client-host} - apt-get update && sudo apt-get install ceph-common librados2 librbd1 python-ceph + apt-get update && sudo apt-get install ceph-common librados2 librbd1 python-rados python-rbd #. Ensure that you have the latest version:: diff --git a/doc/man/8/ceph-authtool.rst b/doc/man/8/ceph-authtool.rst index 1f8c11325263d..523d14d1cc2d4 100644 --- a/doc/man/8/ceph-authtool.rst +++ b/doc/man/8/ceph-authtool.rst @@ -1,3 +1,5 @@ +:orphan: + ================================================= ceph-authtool -- ceph keyring manipulation tool ================================================= @@ -165,7 +167,7 @@ When mounting a Ceph file system, you can grab the appropriately encoded secret Availability ============ -**ceph-authtool** is part of the Ceph distributed storage system. Please +**ceph-authtool** is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to the Ceph documentation at http://ceph.com/docs for more information. diff --git a/doc/man/8/ceph-clsinfo.rst b/doc/man/8/ceph-clsinfo.rst index 948397f46cae6..0188ce1310aa1 100644 --- a/doc/man/8/ceph-clsinfo.rst +++ b/doc/man/8/ceph-clsinfo.rst @@ -1,3 +1,5 @@ +:orphan: + =============================================== ceph-clsinfo -- show class object information =============================================== @@ -36,7 +38,7 @@ Options Availability ============ -**ceph-clsinfo** is part of the Ceph distributed storage system. Please +**ceph-clsinfo** is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to the Ceph documentation at http://ceph.com/docs for more information. diff --git a/doc/man/8/ceph-conf.rst b/doc/man/8/ceph-conf.rst index b5674f7311544..9782e38988ad0 100644 --- a/doc/man/8/ceph-conf.rst +++ b/doc/man/8/ceph-conf.rst @@ -1,3 +1,5 @@ +:orphan: + ================================== ceph-conf -- ceph conf file tool ================================== @@ -11,7 +13,7 @@ Synopsis | **ceph-conf** -c *conffile* -L | **ceph-conf** -c *conffile* -l *prefix* | **ceph-conf** *key* -s *section1* ... 
-| **ceph-conf** [-s *section* ] --lookup *key* +| **ceph-conf** [-s *section* ] [-r] --lookup *key* | **ceph-conf** [-s *section* ] *key* @@ -26,30 +28,59 @@ Ceph configuration file to use with the ``-c`` flag. Actions ======= -.. TODO format this like a proper man page +**ceph-conf** performs one of the following actions: + +.. option:: -L, --list-all-sections + + list all sections in the configuration file. + +.. option:: -l, --list-sections *prefix* + + list the sections with the given *prefix*. For example, ``--list-sections mon`` + would list all sections beginning with ``mon``. + +.. option:: --lookup *key* + + search and print the specified configuration setting. Note: ``--lookup`` is + the default action. If no other actions are given on the command line, we will + default to doing a lookup. + +.. option:: -h, --help + + print a summary of usage. + + +Options +======= -**ceph-conf** will perform one of the following actions: +.. option:: -c *conffile* ---list-all-sections or -L prints out a list of all the section names in the configuration -file. + the Ceph configuration file. ---list-sections or -l prints out a list of all the sections that begin -with a given prefix. For example, --list-sections mon would list all -sections beginning with mon. +.. option:: --filter-key *key* ---lookup will search the configuration for a given value. By default, the sections that -are searched are determined by the Ceph name that we are using. The Ceph name defaults to -client.admin. It can be specified with --name. + filter section list to only include sections with given *key* defined. -For example, if we specify --name osd.0, the following sections will be searched: -[osd.0], [osd], [global] +.. option:: --filter-key-value *key* ``=`` *value* -You can specify additional sections to search with --section or -s. These additional -sections will be searched before the sections that would normally be searched. As always, -the first matching entry we find will be returned. + filter section list to only include sections with given *key*/*value* pair. -Note: --lookup is the default action. If no other actions are given on the command line, -we will default to doing a lookup. +.. option:: --name *type.id* + + the Ceph name in which the sections are searched (default 'client.admin'). + For example, if we specify ``--name osd.0``, the following sections will be + searched: [osd.0], [osd], [global] + +.. option:: -r, --resolve-search + + search for the first file that exists and can be opened in the resulted + comma delimited search list. + +.. option:: -s, --section + + additional sections to search. These additional sections will be searched + before the sections that would normally be searched. As always, the first + matching entry we find will be returned. Examples @@ -63,7 +94,7 @@ To find out what value will mds a use for the "log file" option:: ceph-conf -c foo.conf --name mds.a "log file" -To list all sections that begin with osd:: +To list all sections that begin with "osd":: ceph-conf -c foo.conf -l osd @@ -71,11 +102,23 @@ To list all sections:: ceph-conf -c foo.conf -L +To print the path of the "keyring" used by "client.0":: + + ceph-conf --name client.0 -r -l keyring + + +Files +===== + +``/etc/ceph/$cluster.conf``, ``~/.ceph/$cluster.conf``, ``$cluster.conf`` + +the Ceph configuration files to use if not specified. + Availability ============ -**ceph-conf** is part of the Ceph distributed storage system. 
Please refer +**ceph-conf** is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to the Ceph documentation at http://ceph.com/docs for more information. diff --git a/doc/man/8/ceph-create-keys.rst b/doc/man/8/ceph-create-keys.rst new file mode 100644 index 0000000000000..8d1dc915a9a78 --- /dev/null +++ b/doc/man/8/ceph-create-keys.rst @@ -0,0 +1,63 @@ +:orphan: + +=============================================== +ceph-create-keys -- ceph keyring generate tool +=============================================== + +.. program:: ceph-create-keys + +Synopsis +======== + +| **ceph-create-keys** [-h] [-v] [--cluster *name*] --id *id* + + +Description +=========== + +:program:`ceph-create-keys` is a utility to generate bootstrap keyrings using +the given monitor when it is ready. + +It creates following auth entities (or users) + +``client.admin`` + + and its key for your client host. + +``client.bootstrap-{osd, rgw, mds}`` + + and their keys for bootstrapping corresponding services + +To list all users in the cluster:: + + ceph auth list + + +Options +======= + +.. option:: --cluster + + name of the cluster (default 'ceph'). + +.. option:: -i, --id + + id of a ceph-mon that is coming up. **ceph-create-keys** will wait until it joins quorum. + +.. option:: -v, --verbose + + be more verbose. + + +Availability +============ + +**ceph-create-keys** is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer +to the Ceph documentation at http://ceph.com/docs for more +information. + + +See also +======== + +:doc:`ceph `\(8) diff --git a/doc/man/8/ceph-debugpack.rst b/doc/man/8/ceph-debugpack.rst index d4874b5cc4703..4f2c4f2f6b37b 100644 --- a/doc/man/8/ceph-debugpack.rst +++ b/doc/man/8/ceph-debugpack.rst @@ -1,3 +1,5 @@ +:orphan: + ============================================= ceph-debugpack -- ceph debug packer utility ============================================= @@ -36,7 +38,7 @@ Options Availability ============ -**ceph-debugpack** is part of the Ceph distributed storage system. Please +**ceph-debugpack** is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to the Ceph documentation at http://ceph.com/docs for more information. diff --git a/doc/man/8/ceph-dencoder.rst b/doc/man/8/ceph-dencoder.rst index a7252a987a3a9..cf2e429e103cd 100644 --- a/doc/man/8/ceph-dencoder.rst +++ b/doc/man/8/ceph-dencoder.rst @@ -1,3 +1,5 @@ +:orphan: + ============================================== ceph-dencoder -- ceph encoder/decoder utility ============================================== @@ -138,7 +140,7 @@ do that like this: Availability ============ -**ceph-dencoder** is part of the Ceph distributed storage system. Please +**ceph-dencoder** is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to the Ceph documentation at http://ceph.com/docs for more information. diff --git a/doc/man/8/ceph-deploy.rst b/doc/man/8/ceph-deploy.rst new file mode 100644 index 0000000000000..8a04ef33f0544 --- /dev/null +++ b/doc/man/8/ceph-deploy.rst @@ -0,0 +1,608 @@ +:orphan: + +===================================== + ceph-deploy -- Ceph deployment tool +===================================== + +.. program:: ceph-deploy + +Synopsis +======== + +| **ceph-deploy** **new** [*initial-monitor-node(s)*] + +| **ceph-deploy** **install** [*ceph-node*] [*ceph-node*...] 
+
+| **ceph-deploy** **mon** *create-initial*
+
+| **ceph-deploy** **osd** *prepare* [*ceph-node*]:[*dir-path*]
+
+| **ceph-deploy** **osd** *activate* [*ceph-node*]:[*dir-path*]
+
+| **ceph-deploy** **osd** *create* [*ceph-node*]:[*dir-path*]
+
+| **ceph-deploy** **admin** [*admin-node*][*ceph-node*...]
+
+| **ceph-deploy** **purgedata** [*ceph-node*][*ceph-node*...]
+
+| **ceph-deploy** **forgetkeys**
+
+Description
+===========
+
+:program:`ceph-deploy` is a tool which allows easy and quick deployment of a
+Ceph cluster without involving complex and detailed manual configuration. It
+uses ssh to gain access to other Ceph nodes from the admin node, sudo for
+administrator privileges on them, and the underlying Python scripts automate
+the manual process of Ceph installation on each node from the admin node itself.
+It can be easily run on a workstation and doesn't require servers, databases or
+any other automated tools. With :program:`ceph-deploy`, it is really easy to set
+up and take down a cluster. However, it is not a generic deployment tool. It is
+a specific tool which is designed for those who want to get Ceph up and running
+quickly with only the unavoidable initial configuration settings and without the
+overhead of installing other tools like ``Chef``, ``Puppet`` or ``Juju``. Those
+who want to customize security settings, partitions or directory locations and
+want to set up a cluster following detailed manual steps should use other tools,
+i.e., ``Chef``, ``Puppet``, ``Juju`` or ``Crowbar``.
+
+With :program:`ceph-deploy`, you can install Ceph packages on remote nodes,
+create a cluster, add monitors, gather/forget keys, add OSDs and metadata
+servers, configure admin hosts or take down the cluster.
+
+Commands
+========
+
+new
+---
+
+Start deploying a new cluster and write a configuration file and keyring for it.
+It tries to copy ssh keys from the admin node to gain passwordless ssh to the
+monitor node(s), validates the host IP, creates a cluster with a new initial
+monitor node or nodes for monitor quorum, a ceph configuration file, a monitor
+secret keyring and a log file for the new cluster. It populates the newly
+created Ceph configuration file with the ``fsid`` of the cluster, and the
+hostnames and IP addresses of the initial monitor members under the
+``[global]`` section.
+
+Usage::
+
+    ceph-deploy new [MON][MON...]
+
+Here, [MON] is the initial monitor hostname (short hostname, i.e., ``hostname -s``).
+
+Other options like :option:`--no-ssh-copykey`, :option:`--fsid`,
+:option:`--cluster-network` and :option:`--public-network` can also be used with
+this command.
+
+If more than one network interface is used, the ``public network`` setting has to
+be added under the ``[global]`` section of the Ceph configuration file. If the
+public subnet is given, the ``new`` command will choose the one IP from the
+remote host that exists within the subnet range. The public network can also be
+added at runtime using the :option:`--public-network` option with the command as
+mentioned above.
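As a sketch under assumed names (three monitor hosts ``mon1``, ``mon2`` and
``mon3`` on an illustrative ``192.168.0.0/24`` public subnet), the ``new``
command described above could be invoked as::

    # write ceph.conf, a monitor keyring and a log file for the new
    # cluster into the working directory, pinning the public network
    ceph-deploy new --public-network 192.168.0.0/24 mon1 mon2 mon3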
+
+
+install
+-------
+
+Install Ceph packages on remote hosts. As a first step it installs
+``yum-plugin-priorities`` on the admin and other nodes using passwordless ssh
+and sudo so that Ceph packages from the upstream repository get more priority.
+It then detects the platform and distribution for the hosts and installs Ceph
+normally by downloading distro-compatible packages if an adequate repo for
+Ceph has already been added. The ``--release`` flag is used to get the latest
+release for installation. During detection of platform and distribution before
+installation, if it finds the ``distro.init`` to be ``sysvinit`` (Fedora,
+CentOS/RHEL etc), it doesn't allow installation with a custom cluster name and
+uses the default name ``ceph`` for the cluster.
+
+If the user explicitly specifies a custom repo url with :option:`--repo-url` for
+installation, anything detected from the configuration will be overridden and
+the custom repository location will be used for installation of Ceph packages.
+If required, valid custom repositories are also detected and installed. In case
+of installation from a custom repo, a boolean is used to determine the logic
+needed to proceed with a custom repo installation. A custom repo install helper
+is used that goes through config checks to retrieve repos (and any extra repos
+defined) and installs them. ``cd_conf`` is the object built from ``argparse``
+that holds the flags and information needed to determine what metadata from the
+configuration is to be used.
+
+A user can also opt to install only the repository without installing Ceph and
+its dependencies by using the :option:`--repo` option.
+
+Usage::
+
+    ceph-deploy install [HOST][HOST...]
+
+Here, [HOST] is/are the host node(s) where Ceph is to be installed.
+
+An option ``--release`` is used to install a release known as CODENAME
+(default: firefly).
+
+Other options like :option:`--testing`, :option:`--dev`, :option:`--adjust-repos`,
+:option:`--no-adjust-repos`, :option:`--repo`, :option:`--local-mirror`,
+:option:`--repo-url` and :option:`--gpg-url` can also be used with this command.
+
+
+mds
+---
+
+Deploy Ceph mds on remote hosts. A metadata server is needed to use CephFS and
+the ``mds`` command is used to create one on the desired host node. It uses the
+subcommand ``create`` to do so. ``create`` first gets the hostname and distro
+information of the desired mds host. It then tries to read the ``bootstrap-mds``
+key for the cluster and deploy it to the desired host. The key generally has a
+format of ``{cluster}.bootstrap-mds.keyring``. If it doesn't find a keyring,
+it runs ``gatherkeys`` to get the keyring. It then creates an mds on the desired
+host under the path ``/var/lib/ceph/mds/`` in ``/var/lib/ceph/mds/{cluster}-{name}``
+format and a bootstrap keyring under ``/var/lib/ceph/bootstrap-mds/`` in
+``/var/lib/ceph/bootstrap-mds/{cluster}.keyring`` format. It then runs appropriate
+commands based on ``distro.init`` to start the ``mds``. To remove the mds, the
+subcommand ``destroy`` is used.
+
+Usage::
+
+    ceph-deploy mds create [HOST[:DAEMON-NAME]] [HOST[:DAEMON-NAME]...]
+
+    ceph-deploy mds destroy [HOST[:DAEMON-NAME]] [HOST[:DAEMON-NAME]...]
+
+The [DAEMON-NAME] is optional.
+
+
+mon
+---
+
+Deploy Ceph monitor on remote hosts. ``mon`` makes use of certain subcommands
+to deploy Ceph monitors on other nodes.
+
+Subcommand ``create-initial`` deploys monitors defined in
+``mon initial members`` under the ``[global]`` section in the Ceph
+configuration file, waits until they form quorum and then gathers the keys,
+reporting the monitor status along the way. If monitors don't form quorum the
+command will eventually time out.
+
+Usage::
+
+    ceph-deploy mon create-initial
+
+Subcommand ``create`` is used to deploy Ceph monitors by explicitly specifying
+the hosts which are desired to be made monitors. If no hosts are specified it
+will default to the ``mon initial members`` defined under the ``[global]``
+section of the Ceph configuration file. ``create`` first detects the platform
+and distro for the desired hosts and checks whether the hostname is compatible
+for deployment. It then uses the monitor keyring initially created using the
+``new`` command and deploys the monitor on the desired host. If multiple hosts
+were specified during the ``new`` command, i.e., if there are multiple hosts in
+``mon initial members`` and multiple keyrings were created, then a concatenated
+keyring is used for deployment of monitors. In this process a keyring parser is
+used which looks for ``[entity]`` sections in monitor keyrings and returns a
+list of those sections. A helper is then used to collect all keyrings into a
+single blob that will be used to inject it to the monitors with
+:option:`--mkfs` on remote nodes. All keyring files are concatenated to be in
+a directory ending with ``.keyring``. During this process the helper uses the
+list of sections returned by the keyring parser to check if an entity is
+already present in a keyring and if not, adds it. The concatenated keyring is
+used for deployment of monitors to the desired multiple hosts.
+
+Usage::
+
+    ceph-deploy mon create [HOST] [HOST...]
+
+Here, [HOST] is the hostname of the desired monitor host(s).
+
+Subcommand ``add`` is used to add a monitor to an existing cluster. It first
+detects the platform and distro for the desired host and checks whether the
+hostname is compatible for deployment. It then uses the monitor keyring,
+ensures configuration for the new monitor host and adds the monitor to the
+cluster. If the section for the monitor exists and defines a mon addr, that
+will be used; otherwise it will fall back to resolving the hostname to an IP.
+If :option:`--address` is used it will override all other options. After
+adding the monitor to the cluster, it gives it some time to start. It then
+looks for any monitor errors and checks monitor status. Monitor errors arise
+if the monitor is not added in ``mon initial members``, if it doesn't exist in
+``monmap`` and if neither ``public_addr`` nor ``public_network`` keys were
+defined for monitors. Under such conditions, monitors may not be able to form
+quorum. Monitor status tells whether the monitor is up and running normally.
+The status is checked by running ``ceph daemon mon.hostname mon_status`` on
+the remote end, which provides the output and returns a boolean status.
+``False`` means the monitor is not healthy even if it is up and running, while
+``True`` means the monitor is up and running correctly.
+
+Usage::
+
+    ceph-deploy mon add [HOST]
+
+    ceph-deploy mon add [HOST] --address [IP]
+
+Here, [HOST] is the hostname and [IP] is the IP address of the desired monitor
+node. Please note, unlike other ``mon`` subcommands, only one node can be
+specified at a time.
+
+Subcommand ``destroy`` is used to completely remove monitors on remote hosts.
+It takes hostnames as arguments. It stops the monitor, verifies that the
+``ceph-mon`` daemon really stopped, creates an archive directory ``mon-remove``
+under ``/var/lib/ceph/``, archives the old monitor directory in
+``{cluster}-{hostname}-{stamp}`` format in it and removes the monitor from the
+cluster by running the ``ceph remove...`` command.
+
+Usage::
+
+    ceph-deploy mon destroy [HOST] [HOST...]
+
+Here, [HOST] is the hostname of the monitor that is to be removed.
+
+
+gatherkeys
+----------
+
+Gather authentication keys for provisioning new nodes. It takes hostnames as
+arguments. It checks for and fetches the ``client.admin`` keyring, the monitor
+keyring and the ``bootstrap-mds/bootstrap-osd`` keyring from the monitor host.
+These authentication keys are used when new ``monitors/OSDs/MDS`` are added to
+the cluster.
+
+Usage::
+
+    ceph-deploy gatherkeys [HOST] [HOST...]
+
+Here, [HOST] is the hostname of the monitor from where keys are to be pulled.
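For example, with the same illustrative hosts, expanding the monitor set and
refreshing the bootstrap keys might look like::

    # add a fourth monitor, pinning its address explicitly
    ceph-deploy mon add mon4 --address 192.168.0.14

    # pull the admin and bootstrap keyrings from an existing monitor
    ceph-deploy gatherkeys mon1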
+
+
+disk
+----
+
+Manage disks on a remote host. It actually triggers the ``ceph-disk`` utility
+and its subcommands to manage disks.
+
+Subcommand ``list`` lists disk partitions and Ceph OSDs.
+
+Usage::
+
+    ceph-deploy disk list [HOST:[DISK]]
+
+Here, [HOST] is the hostname of the node and [DISK] is the disk name or path.
+
+Subcommand ``prepare`` prepares a directory, disk or drive for a Ceph OSD. It
+creates a GPT partition, marks the partition with the Ceph type uuid, creates a
+file system, marks the file system as ready for Ceph consumption, uses the
+entire partition and adds a new partition to the journal disk.
+
+Usage::
+
+    ceph-deploy disk prepare [HOST:[DISK]]
+
+Here, [HOST] is the hostname of the node and [DISK] is the disk name or path.
+
+Subcommand ``activate`` activates the Ceph OSD. It mounts the volume in a
+temporary location, allocates an OSD id (if needed), remounts in the correct
+location ``/var/lib/ceph/osd/$cluster-$id`` and starts ``ceph-osd``. It is
+triggered by ``udev`` when it sees the OSD GPT partition type or on ceph service
+start with ``ceph disk activate-all``.
+
+Usage::
+
+    ceph-deploy disk activate [HOST:[DISK]]
+
+Here, [HOST] is the hostname of the node and [DISK] is the disk name or path.
+
+Subcommand ``zap`` zaps/erases/destroys a device's partition table and contents.
+It actually uses ``sgdisk`` and its option ``--zap-all`` to destroy both GPT and
+MBR data structures so that the disk becomes suitable for repartitioning.
+``sgdisk`` then uses ``--mbrtogpt`` to convert the MBR or BSD disklabel disk to a
+GPT disk. The ``prepare`` subcommand can now be executed, which will create a new
+GPT partition.
+
+Usage::
+
+    ceph-deploy disk zap [HOST:[DISK]]
+
+Here, [HOST] is the hostname of the node and [DISK] is the disk name or path.
+
+
+osd
+---
+
+Manage OSDs by preparing a data disk on a remote host. ``osd`` makes use of
+certain subcommands for managing OSDs.
+
+Subcommand ``prepare`` prepares a directory, disk or drive for a Ceph OSD. It
+first checks against multiple OSDs getting created and warns about the
+possibility of more than the recommended number, which would cause issues with
+the max allowed PIDs in a system. It then reads the bootstrap-osd key for the
+cluster or writes the bootstrap key if not found. It then uses the
+:program:`ceph-disk` utility's ``prepare`` subcommand to prepare the disk and
+journal and deploy the OSD on the desired host. Once prepared, it gives some
+time to the OSD to settle and checks for any possible errors and if found,
+reports them to the user.
+
+Usage::
+
+    ceph-deploy osd prepare HOST:DISK[:JOURNAL] [HOST:DISK[:JOURNAL]...]
+
+Subcommand ``activate`` activates the OSD prepared using the ``prepare``
+subcommand. It actually uses the :program:`ceph-disk` utility's ``activate``
+subcommand with the appropriate init type based on distro to activate the OSD.
+Once activated, it gives some time to the OSD to start and checks for any
+possible errors and if found, reports them to the user. It checks the status of
+the prepared OSD, checks the OSD tree and makes sure the OSDs are up and in.
+
+Usage::
+
+    ceph-deploy osd activate HOST:DISK[:JOURNAL] [HOST:DISK[:JOURNAL]...]
+
+Subcommand ``create`` uses ``prepare`` and ``activate`` subcommands to create an
+OSD.
+
+Usage::
+
+    ceph-deploy osd create HOST:DISK[:JOURNAL] [HOST:DISK[:JOURNAL]...]
+
+Subcommand ``list`` lists disk partitions, Ceph OSDs and prints OSD metadata.
+It gets the osd tree from a monitor host, uses the ``ceph-disk-list`` output
+and gets the mount point by matching the line where the partition mentions
+the OSD name, reads metadata from files, checks if a journal path exists,
+if the OSD is in an OSD tree, and prints the OSD metadata.
+
+Usage::
+
+    ceph-deploy osd list HOST:DISK[:JOURNAL] [HOST:DISK[:JOURNAL]...]
+
+
+admin
+-----
+
+Push configuration and the ``client.admin`` key to a remote host. It takes
+the ``{cluster}.client.admin.keyring`` from the admin node and writes it under
+the ``/etc/ceph`` directory of the desired node.
+
+Usage::
+
+    ceph-deploy admin [HOST] [HOST...]
+
+Here, [HOST] is the desired host to be configured for Ceph administration.
+
+
+config
+------
+
+Push/pull a configuration file to/from a remote host. It uses the ``push``
+subcommand to take the configuration file from the admin host and write it to
+the remote host under the ``/etc/ceph`` directory. It uses the ``pull``
+subcommand to do the opposite, i.e., pull the configuration file under the
+``/etc/ceph`` directory of the remote host to the admin node.
+
+Usage::
+
+    ceph-deploy config push [HOST] [HOST...]
+
+    ceph-deploy config pull [HOST] [HOST...]
+
+Here, [HOST] is the hostname of the node the config file will be pushed to or
+pulled from.
+
+
+uninstall
+---------
+
+Remove Ceph packages from remote hosts. It detects the platform and distro of
+the selected host and uninstalls Ceph packages from it. However, some
+dependencies like ``librbd1`` and ``librados2`` will not be removed because they
+can cause issues with ``qemu-kvm``.
+
+Usage::
+
+    ceph-deploy uninstall [HOST] [HOST...]
+
+Here, [HOST] is the hostname of the node from where Ceph will be uninstalled.
+
+
+purge
+-----
+
+Remove Ceph packages from remote hosts and purge all data. It detects the
+platform and distro of the selected host, uninstalls Ceph packages and purges
+all data. However, some dependencies like ``librbd1`` and ``librados2`` will not
+be removed because they can cause issues with ``qemu-kvm``.
+
+Usage::
+
+    ceph-deploy purge [HOST] [HOST...]
+
+Here, [HOST] is the hostname of the node from where Ceph will be purged.
+
+
+purgedata
+---------
+
+Purge (delete, destroy, discard, shred) any Ceph data from ``/var/lib/ceph``.
+Once it detects the platform and distro of the desired host, it first checks if
+Ceph is still installed on the selected host and, if installed, it won't purge
+data from it. If Ceph is already uninstalled from the host, it tries to remove
+the contents of ``/var/lib/ceph``. If it fails, then probably OSDs are still
+mounted and need to be unmounted to continue. It unmounts the OSDs, tries to
+remove the contents of ``/var/lib/ceph`` again and checks for errors. It also
+removes the contents of ``/etc/ceph``. Once all steps are successfully
+completed, all the Ceph data from the selected host is removed.
+
+Usage::
+
+    ceph-deploy purgedata [HOST] [HOST...]
+
+Here, [HOST] is the hostname of the node from where Ceph data will be purged.
+
+
+forgetkeys
+----------
+
+Remove authentication keys from the local directory. It removes all the
+authentication keys, i.e., the monitor keyring, client.admin keyring,
+bootstrap-osd and bootstrap-mds keyrings, from the node.
+
+Usage::
+
+    ceph-deploy forgetkeys
+
+
+pkg
+---
+
+Manage packages on remote hosts. It is used for installing or removing packages
+from remote hosts. The package names for installation or removal are to be
+specified after the command. Two options :option:`--install` and
+:option:`--remove` are used for this purpose.
+
+Usage::
+
+    ceph-deploy pkg --install [PKGs] [HOST] [HOST...]
+
+    ceph-deploy pkg --remove [PKGs] [HOST] [HOST...]
+
+Here, [PKGs] is a comma-separated list of package names and [HOST] is the
+hostname of the remote node where packages are to be installed or removed
+from.
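As an illustration (the package and host names are assumed)::

    # install ntp on two nodes, then later remove it again
    ceph-deploy pkg --install ntp node1 node2
    ceph-deploy pkg --remove ntp node1 node2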
Two options, :option:`--install` and +:option:`--remove`, are used for this purpose. + +Usage:: + + ceph-deploy pkg --install [PKGs] [HOST] [HOST...] + + ceph-deploy pkg --remove [PKGs] [HOST] [HOST...] + +Here, [PKGs] is a comma-separated list of package names and [HOST] is the hostname +of the remote node where packages are to be installed or removed. + + +calamari +-------- + +Install and configure Calamari nodes. It first checks if the distro is supported +for Calamari installation by ceph-deploy. An argument ``connect`` is used for +installation and configuration. It checks for the ``ceph-deploy`` configuration +file (cd_conf) and the Calamari release repo or ``calamari-minion`` repo. It relies +on the default for repo installation as it doesn't install Ceph unless specified +otherwise. An ``options`` dictionary is also defined because ``ceph-deploy`` +pops items internally, which causes issues when those items need to be +available for every host. If the distro is Debian/Ubuntu, it is ensured that the +proxy is disabled for the ``calamari-minion`` repo. The ``calamari-minion`` package +is then installed and custom repository files are added. The minion config is placed +prior to installation so that it is present when the minion first starts. The +config directory and calamari salt config are created, and the salt-minion package +is installed. If the distro is Redhat/CentOS, the salt-minion service needs to +be started. + +Usage:: + + ceph-deploy calamari {connect} [HOST] [HOST...] + +Here, [HOST] is the hostname where Calamari is to be installed. + +An option ``--release`` can be used to select a given release from repositories +defined in :program:`ceph-deploy`'s configuration. It defaults to ``calamari-minion``. + +Another option :option:`--master` can also be used with this command. + +Options +======= + +.. option:: --version + + The current installed version of :program:`ceph-deploy`. + +.. option:: --username + + The username to connect to the remote host. + +.. option:: --overwrite-conf + + Overwrite an existing conf file on remote host (if present). + +.. option:: --cluster + + Name of the cluster. + +.. option:: --ceph-conf + + Use (or reuse) a given ``ceph.conf`` file. + +.. option:: --no-ssh-copykey + + Do not attempt to copy ssh keys. + +.. option:: --fsid + + Provide an alternate FSID for ``ceph.conf`` generation. + +.. option:: --cluster-network + + Specify the (internal) cluster network. + +.. option:: --public-network + + Specify the public network for a cluster. + +.. option:: --testing + + Install the latest development release. + +.. option:: --dev + + Install a bleeding-edge build from a Git branch or tag (default: master). + +.. option:: --adjust-repos + + Install packages modifying source repos. + +.. option:: --no-adjust-repos + + Install packages without modifying source repos. + +.. option:: --repo + + Install repo files only (skips package installation). + +.. option:: --local-mirror + + Fetch packages and push them to hosts for a local repo mirror. + +.. option:: --repo-url + + Specify a repo url that mirrors/contains Ceph packages. + +.. option:: --gpg-url + + Specify a GPG key url to be used with custom repos (defaults to ceph.com). + +.. option:: --address + + IP address of the host node to be added to the cluster. + +.. option:: --keyrings + + Concatenate multiple keyrings to be seeded on new monitors. + +.. option:: --zap-disk + + Destroy the partition table and content of a disk. + +.. option:: --fs-type + + Filesystem to use to format disk ``(xfs, btrfs or ext4)``. +
+.. option:: --dmcrypt + + Encrypt [data-path] and/or journal devices with ``dm-crypt``. + +.. option:: --dmcrypt-key-dir + + Directory where ``dm-crypt`` keys are stored. + +.. option:: --install + + Comma-separated package(s) to install on remote hosts. + +.. option:: --remove + + Comma-separated package(s) to remove from remote hosts. + +.. option:: --master + + The domain for the Calamari master server. + + +Availability +============ + +:program:`ceph-deploy` is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to +the documentation at http://ceph.com/ceph-deploy/docs for more information. + + +See also +======== + +:doc:`ceph-mon <ceph-mon>`\(8), +:doc:`ceph-osd <ceph-osd>`\(8), +:doc:`ceph-disk <ceph-disk>`\(8), +:doc:`ceph-mds <ceph-mds>`\(8) diff --git a/doc/man/8/ceph-detect-init.rst b/doc/man/8/ceph-detect-init.rst new file mode 100644 index 0000000000000..aeb3316e5039c --- /dev/null +++ b/doc/man/8/ceph-detect-init.rst @@ -0,0 +1,54 @@ +:orphan: + +============================================================ + ceph-detect-init -- display the init system Ceph should use +============================================================ + +.. program:: ceph-detect-init + +Synopsis +======== + +| **ceph-detect-init** [--verbose] [--use-rhceph] [--default *init*] + +Description +=========== + +:program:`ceph-detect-init` is a utility that prints the init system +Ceph uses. It can be one of ``sysvinit``, ``upstart`` or ``systemd``. +The init system Ceph uses may not be the default init system of the +host operating system. For instance, on Debian Jessie, Ceph may use +``sysvinit`` although ``systemd`` is the default. + +If the init system of the host operating system is unknown, it returns an +error, unless :option:`--default` is specified. + +Options +======= + +.. option:: --use-rhceph + + When an operating system identifies itself as Red Hat, it is + treated as if it were CentOS. With :option:`--use-rhceph` it is + treated as RHEL instead. + +.. option:: --default INIT + + If the init system of the host operating system is unknown, return + the value of *INIT* instead of failing with an error. + +.. option:: --verbose + + Display additional information for debugging. + +Availability +============ + +:program:`ceph-detect-init` is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to +the Ceph documentation at http://ceph.com/docs for more information. + +See also +======== + +:doc:`ceph-disk <ceph-disk>`\(8), +:doc:`ceph-deploy <ceph-deploy>`\(8) diff --git a/doc/man/8/ceph-disk.rst b/doc/man/8/ceph-disk.rst index 41e360994cf9e..bb67163c28949 100644 --- a/doc/man/8/ceph-disk.rst +++ b/doc/man/8/ceph-disk.rst @@ -1,3 +1,5 @@ +:orphan: + =================================================================== ceph-disk -- Ceph disk preparation and activation utility for OSD =================================================================== @@ -11,6 +13,8 @@ Synopsis [--fs-type *xfs|ext4|btrfs*] [*data-path*] [*journal-path*] | **ceph-disk** **activate** [*data-path*] [--activate-key *path*] + [--mark-init *sysvinit|upstart|systemd|auto|none*] + [--no-start-daemon] | **ceph-disk** **activate-all** @@ -19,121 +23,189 @@ Description =========== -**ceph-disk** is a utility that can prepare and activate a disk, partition or -directory as a ceph OSD. It is run directly or triggered by **ceph-deploy** -or udev. +:program:`ceph-disk` is a utility that can prepare and activate a disk, partition or +directory as a Ceph OSD.
It is run directly or triggered by :program:`ceph-deploy` +or ``udev``. It can also be triggered by other deployment utilities like ``Chef``, +``Juju``, ``Puppet`` etc. It actually automates the multiple steps involved in manual creation and start -of an OSD into 2 steps of preparing and activating the OSD by using the -subcommands **prepare** and **activate**. +of an OSD into two steps of preparing and activating the OSD by using the +subcommands ``prepare`` and ``activate``. Subcommands ============ -**prepare**: Prepare a directory, disk or drive for a ceph OSD. It creates a GPT -partition, marks the partition with ceph type uuid, creates a file system, marks -the file system as ready for ceph consumption, uses entire partition and adds a -new partition to the journal disk. It is run directly or triggered by -**ceph-deploy**. +prepare +-------- + +Prepare a directory or disk for a Ceph OSD. It creates a GPT partition, +marks the partition with the Ceph type ``uuid``, creates a file system, marks the +file system as ready for Ceph consumption, uses the entire partition and adds a new +partition to the journal disk. It is run directly or triggered by +:program:`ceph-deploy`. + +Usage:: + + ceph-disk prepare --cluster [cluster-name] --cluster-uuid [uuid] --fs-type + [ext4|xfs|btrfs] [data-path] [journal-path] + +Other options like :option:`--osd-uuid`, :option:`--journal-uuid`, +:option:`--zap-disk`, :option:`--data-dir`, :option:`--data-dev`, +:option:`--journal-file`, :option:`--journal-dev`, :option:`--dmcrypt` +and :option:`--dmcrypt-key-dir` can also be used with the subcommand. + +activate +-------- + +Activate the Ceph OSD. It mounts the volume in a temporary location, allocates +an OSD id (if needed), remounts in the correct location +``/var/lib/ceph/osd/$cluster-$id`` and starts ceph-osd. It is triggered by +``udev`` when it sees the OSD GPT partition type or on ceph service start with +``ceph disk activate-all``. It is also run directly or triggered by +:program:`ceph-deploy`. + +Usage:: + + ceph-disk activate [PATH] + +Here, [PATH] is the path to a block device or a directory. + +An additional option :option:`--activate-key` has to be used with this +subcommand when a copy of ``/var/lib/ceph/bootstrap-osd/{cluster}.keyring`` +isn't present on the OSD node. + +Usage:: + + ceph-disk activate [PATH] [--activate-key PATH] + +Another option :option:`--mark-init` can also be used with this +subcommand. ``--mark-init`` provides the init system to manage the OSD +directory. It defaults to ``auto``, which detects the init system +suitable for ceph (either ``sysvinit``, ``systemd`` or +``upstart``). The argument can be used to override the init system. It +may be convenient when an operating system supports multiple init +systems, such as Debian GNU/Linux jessie with ``systemd`` and +``sysvinit``. If the argument is ``none``, the OSD is not marked with +any init system and ``ceph-disk activate`` needs to be called +explicitly after each reboot. + -Usage: ceph-disk prepare --cluster [cluster-name] --cluster-uuid [uuid] --fs-type -[ext4|xfs|btrfs] [data-path] [journal-path] +Usage:: -Other options like --osd-uuid, --journal-uuid, --zap-disk, --data-dir, --data-dev, ---journal-file, --journal-dev, --dmcrypt and --dmcrypt-key-dir can also be used -with the subcommand. + ceph-disk activate [PATH] [--mark-init *sysvinit|upstart|systemd|auto|none*] -**activate**: Activate the ceph OSD.
It mounts the volume in a temporary -location, allocates an OSD id (if needed), remounts in the correct location -/var/lib/ceph/osd/$cluster-$id and starts ceph-osd. It is triggered by udev -when it sees the OSD GPT partition type or on ceph service start with -'ceph disk activate-all'. It is also run directly or triggered by **ceph-deploy**. +If the option :option:`--no-start-daemon` is given, the activation +steps are performed but the OSD daemon is not started. -Usage: ceph-disk activate [PATH] +activate-journal +---------------- -Here, [PATH] is path to block device or directory. +Activate an OSD via its journal device. ``udev`` triggers +``ceph-disk activate-journal `` based on the partition type. -An additional option [--activate-key PATH] has to be used with this subcommand -when a copy of /var/lib/ceph/bootstrap-osd/{cluster}.keyring isn't present in the -OSD node. +Usage:: -Usage: ceph-disk activate [PATH] [--activate-key PATH] + ceph-disk activate-journal [DEV] -Another option --mark-init can also be used with this subcommand. +Here, [DEV] is the path to a journal block device. -**activate-journal**: Activate an OSD via it's journal device. udev triggers -'ceph-disk activate-journal ' based on the partition type. +Other options like :option:`--activate-key` and :option:`--mark-init` can also +be used with this subcommand. -Usage: ceph-disk activate-journal [DEV] +``--mark-init`` provides the init system to manage the OSD directory. -Here, [DEV] is the path to journal block device. +Usage:: -Others options can also be used with this subcommand like --activate-key and ---mark-init. + ceph-disk activate-journal [--activate-key PATH] [--mark-init INITSYSTEM] [DEV] -Usage: ceph-disk activate-journal [--activate-key PATH] [--mark-init INITSYSTEM] -[DEV] +activate-all +------------ -**activate-all**: Activate all tagged OSD partitions. activate-all relies on -/dev/disk/by-parttype-uuid/$typeuuid.$uuid to find all partitions. Special udev -rules are installed to create these links. It is triggered on ceph service start -or run directly. +Activate all tagged OSD partitions. ``activate-all`` relies on +``/dev/disk/by-parttype-uuid/$typeuuid.$uuid`` to find all partitions. Special +``udev`` rules are installed to create these links. It is triggered on ceph +service start or run directly. -Usage: ceph-disk activate-all +Usage:: -Others options can also be used with this subcommand like --activate-key and ---mark-init. + ceph-disk activate-all -Usage: ceph-disk activate-all [--activate-key PATH] [--mark-init INITSYSTEM] +Other options like :option:`--activate-key` and :option:`--mark-init` can +also be used with this subcommand. -**list**: List disk partitions and ceph OSDs. It is run directly or triggered -by **ceph-deploy**. +``--mark-init`` provides the init system to manage the OSD directory. -Usage: ceph-disk list +Usage:: -**suppress-activate**: Suppress activate on a device (prefix). -Mark devices that you want to suppress activate with a file like -/var/lib/ceph/tmp/suppress-activate.sdb where the last bit is -the sanitized device name (/dev/X without the /dev/ prefix). A -function is_suppressed() checks for and matches a prefix (/dev/). -It means suppressing sdb will stop activate on sdb1, sdb2, etc. + ceph-disk activate-all [--activate-key PATH] [--mark-init INITSYSTEM] -Usage: ceph-disk suppress-activate [PATH] +list +---- -Here, [PATH] is path to block device or directory. +List disk partitions and Ceph OSDs. It is run directly or triggered by +:program:`ceph-deploy`.
-**unsuppress-activate**: Stop suppressing activate on a device (prefix). +Usage:: -Usage: ceph-disk unsuppress-activate [PATH] + ceph-disk list -Here, [PATH] is path to block device or directory. +suppress-activate +----------------- -**zap**: Zap/erase/destroy a device's partition table and contents. -It actually uses 'sgdisk' and it's option '--zap-all' to destroy both -GPT and MBR data structures so that the disk becomes suitable for -repartitioning. 'sgdisk' then uses '--mbrtogpt' to convert the MBR or -BSD disklabel disk to a GPT disk. The **prepare** subcommand can now be -executed which will create a new GPT partition. It is also run directly -or triggered by **ceph-deploy**. +Suppress activate on a device (prefix). Mark devices that you don't want to +activate with a file like ``/var/lib/ceph/tmp/suppress-activate.sdb`` where the +last bit is the sanitized device name (/dev/X without the /dev/ prefix). A +function ``is_suppressed()`` checks for and matches a prefix (/dev/). It means +suppressing sdb will stop activate on sdb1, sdb2, etc. -Usage: ceph-disk zap [DEV] +Usage:: -Here, [DEV] is path to block device. + ceph-disk suppress-activate [PATH] + +Here, [PATH] is the path to a block device or a directory. + +unsuppress-activate +------------------- + +Stop suppressing activate on a device (prefix). It is used to activate a device +that was earlier kept deactivated using ``suppress-activate``. + +Usage:: + + ceph-disk unsuppress-activate [PATH] + +Here, [PATH] is the path to a block device or a directory. + +zap +--- + +Zap/erase/destroy a device's partition table and contents. It actually uses +``sgdisk`` and its option ``--zap-all`` to destroy both GPT and MBR data +structures so that the disk becomes suitable for repartitioning. ``sgdisk`` +then uses ``--mbrtogpt`` to convert the MBR or BSD disklabel disk to a GPT +disk. The ``prepare`` subcommand can then be executed, which will create a new +GPT partition. It is also run directly or triggered by :program:`ceph-deploy`. + +Usage:: + + ceph-disk zap [DEV] + +Here, [DEV] is the path to a block device. Options ======= .. option:: --prepend-to-path PATH - Prepend PATH to $PATH for backward compatibility (default /usr/bin). + Prepend PATH to $PATH for backward compatibility (default ``/usr/bin``). .. option:: --statedir PATH - Directory in which ceph configuration is preserved (default /usr/lib/ceph). + Directory in which ceph configuration is preserved (default ``/usr/lib/ceph``). .. option:: --sysconfdir PATH - Directory in which ceph configuration files are found (default /etc/ceph). + Directory in which ceph configuration files are found (default ``/etc/ceph``). .. option:: --cluster @@ -145,7 +217,7 @@ Options .. option:: --fs-type - Provide the filesytem type for the OSD. e.g. 'xfs/ext4/btrfs'. + Provide the filesystem type for the OSD, e.g. ``xfs/ext4/btrfs``. .. option:: --osd-uuid @@ -161,11 +233,11 @@ Options .. option:: --data-dir - Verify that [data-path] is of a directory. + Verify that ``[data-path]`` is a directory. .. option:: --data-dev - Verify that [data-path] is of a block device. + Verify that ``[data-path]`` is a block device. .. option:: --journal-file @@ -177,15 +249,15 @@ Options .. option:: --dmcrypt - Encrypt [data-path] and/or journal devices with dm-crypt. + Encrypt ``[data-path]`` and/or journal devices with ``dm-crypt``. .. option:: --dmcrypt-key-dir - Directory where dm-crypt keys are stored. + Directory where ``dm-crypt`` keys are stored. ..
option:: --activate-key - Use when a copy of /var/lib/ceph/bootstrap-osd/{cluster}.keyring isn't + Use when a copy of ``/var/lib/ceph/bootstrap-osd/{cluster}.keyring`` isn't present in the OSD node. Suffix the option by the path to the keyring. .. option:: --mark-init @@ -195,5 +267,11 @@ Options Availability ============ -**ceph-disk** is a part of the Ceph distributed storage system. Please refer to +:program:`ceph-disk` is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to the Ceph documentation at http://ceph.com/docs for more information. + +See also +======== + +:doc:`ceph-osd `\(8), +:doc:`ceph-deploy `\(8) diff --git a/doc/man/8/ceph-fuse.rst b/doc/man/8/ceph-fuse.rst index c33cd29b44dea..cede60e5f4176 100644 --- a/doc/man/8/ceph-fuse.rst +++ b/doc/man/8/ceph-fuse.rst @@ -1,3 +1,5 @@ +:orphan: + ========================================= ceph-fuse -- FUSE-based client for ceph ========================================= @@ -51,7 +53,7 @@ Any options not recognized by ceph-fuse will be passed on to libfuse. Availability ============ -**ceph-fuse** is part of the Ceph distributed storage system. Please refer to +**ceph-fuse** is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to the Ceph documentation at http://ceph.com/docs for more information. diff --git a/doc/man/8/ceph-mds.rst b/doc/man/8/ceph-mds.rst index 964dbdcd972ce..d2ae92292256b 100644 --- a/doc/man/8/ceph-mds.rst +++ b/doc/man/8/ceph-mds.rst @@ -1,3 +1,5 @@ +:orphan: + ========================================= ceph-mds -- ceph metadata server daemon ========================================= @@ -67,7 +69,7 @@ Options Availability ============ -**ceph-mds** is part of the Ceph distributed storage system. Please refer to the Ceph documentation at +**ceph-mds** is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to the Ceph documentation at http://ceph.com/docs for more information. diff --git a/doc/man/8/ceph-mon.rst b/doc/man/8/ceph-mon.rst index fdb33599912f6..287c668349bb9 100644 --- a/doc/man/8/ceph-mon.rst +++ b/doc/man/8/ceph-mon.rst @@ -1,3 +1,5 @@ +:orphan: + ================================= ceph-mon -- ceph monitor daemon ================================= @@ -68,7 +70,7 @@ Options Availability ============ -**ceph-mon** is part of the Ceph distributed storage system. Please refer +**ceph-mon** is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to the Ceph documentation at http://ceph.com/docs for more information. diff --git a/doc/man/8/ceph-osd.rst b/doc/man/8/ceph-osd.rst index 6398bd6e8b44b..9a1e6afdd88d0 100644 --- a/doc/man/8/ceph-osd.rst +++ b/doc/man/8/ceph-osd.rst @@ -1,3 +1,5 @@ +:orphan: + ======================================== ceph-osd -- ceph object storage daemon ======================================== @@ -97,7 +99,7 @@ Options Availability ============ -**ceph-osd** is part of the Ceph distributed storage system. Please refer to +**ceph-osd** is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to the Ceph documentation at http://ceph.com/docs for more information. 
See also diff --git a/doc/man/8/ceph-post-file.rst b/doc/man/8/ceph-post-file.rst index c4f0e7f7ae321..7e4899f5a1ad9 100644 --- a/doc/man/8/ceph-post-file.rst +++ b/doc/man/8/ceph-post-file.rst @@ -1,3 +1,5 @@ +:orphan: + ================================================== ceph-post-file -- post files for ceph developers ================================================== @@ -59,7 +61,7 @@ To upload several directories:: Availability ============ -**ceph-post-file** is part of the Ceph distributed storage system. Please refer to +**ceph-post-file** is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to the Ceph documentation at http://ceph.com/docs for more information. See also diff --git a/doc/man/8/ceph-rbdnamer.rst b/doc/man/8/ceph-rbdnamer.rst index 4e5b893e2aff4..123c6e2847005 100644 --- a/doc/man/8/ceph-rbdnamer.rst +++ b/doc/man/8/ceph-rbdnamer.rst @@ -1,3 +1,5 @@ +:orphan: + ================================================== ceph-rbdnamer -- udev helper to name RBD devices ================================================== @@ -27,7 +29,7 @@ set up a device symlink. Availability ============ -**ceph-rbdnamer** is part of the Ceph distributed storage system. Please +**ceph-rbdnamer** is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to the Ceph documentation at http://ceph.com/docs for more information. diff --git a/doc/man/8/ceph-rest-api.rst b/doc/man/8/ceph-rest-api.rst index 786833b2952de..f9eb3d4323dbe 100644 --- a/doc/man/8/ceph-rest-api.rst +++ b/doc/man/8/ceph-rest-api.rst @@ -1,3 +1,5 @@ +:orphan: + ===================================================== ceph-rest-api -- ceph RESTlike administration server ===================================================== @@ -138,7 +140,7 @@ see those messages in case of problem. Availability ============ -**ceph-rest-api** is part of the Ceph distributed storage system. Please refer to the Ceph documentation at +**ceph-rest-api** is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to the Ceph documentation at http://ceph.com/docs for more information. diff --git a/doc/man/8/ceph-run.rst b/doc/man/8/ceph-run.rst index 6984564caf306..ed76c2848237d 100644 --- a/doc/man/8/ceph-run.rst +++ b/doc/man/8/ceph-run.rst @@ -1,3 +1,5 @@ +:orphan: + ========================================= ceph-run -- restart daemon on core dump ========================================= @@ -30,7 +32,7 @@ None Availability ============ -**ceph-run** is part of the Ceph distributed storage system. Please refer to +**ceph-run** is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to the Ceph documentation at http://ceph.com/docs for more information. diff --git a/doc/man/8/ceph-syn.rst b/doc/man/8/ceph-syn.rst index fb00ca71b76ff..a30c460cb6f8d 100644 --- a/doc/man/8/ceph-syn.rst +++ b/doc/man/8/ceph-syn.rst @@ -1,3 +1,5 @@ +:orphan: + =============================================== ceph-syn -- ceph synthetic workload generator =============================================== @@ -87,7 +89,7 @@ line. This is not a complete list. Availability ============ -**ceph-syn** is part of the Ceph distributed storage system. Please refer to +**ceph-syn** is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to the Ceph documentation at http://ceph.com/docs for more information. 
See also diff --git a/doc/man/8/ceph.rst b/doc/man/8/ceph.rst index 65f4bed00012e..a1d31884f7928 100644 --- a/doc/man/8/ceph.rst +++ b/doc/man/8/ceph.rst @@ -1,42 +1,1337 @@ -========================================== - ceph -- ceph file system control utility -========================================== +:orphan: + +================================== + ceph -- ceph administration tool +================================== .. program:: ceph -Synopsis -======== +Synopsis +======== + +| **ceph** **auth** [ *add* \| *caps* \| *del* \| *export* \| *get* \| *get-key* \| *get-or-create* \| *get-or-create-key* \| *import* \| *list* \| *print-key* \| *print_key* ] ... + +| **ceph** **compact** + +| **ceph** **config-key** [ *del* | *exists* | *get* | *list* | *put* ] ... + +| **ceph** **daemon** ** \| ** ** ... + +| **ceph** **daemonperf** ** \| ** [ *interval* [ *count* ] ] + +| **ceph** **df** *{detail}* + +| **ceph** **fs** [ *ls* \| *new* \| *reset* \| *rm* ] ... + +| **ceph** **fsid** + +| **ceph** **health** *{detail}* + +| **ceph** **heap** [ *dump* \| *start_profiler* \| *stop_profiler* \| *release* \| *stats* ] ... + +| **ceph** **injectargs** ** [ **... ] + +| **ceph** **log** ** [ **... ] + +| **ceph** **mds** [ *add_data_pool* \| *cluster_down* \| *cluster_up* \| *compat* \| *deactivate* \| *dump* \| *fail* \| *getmap* \| *newfs* \| *remove_data_pool* \| *rm* \| *rmfailed* \| *set* \| *set_max_mds* \| *set_state* \| *setmap* \| *stat* \| *stop* \| *tell* ] ... + +| **ceph** **mon** [ *add* \| *dump* \| *getmap* \| *remove* \| *stat* ] ... + +| **ceph** **mon_status** + +| **ceph** **osd** [ *blacklist* \| *blocked-by* \| *create* \| *deep-scrub* \| *df* \| *down* \| *dump* \| *erasure-code-profile* \| *find* \| *getcrushmap* \| *getmap* \| *getmaxosd* \| *in* \| *lspools* \| *map* \| *metadata* \| *out* \| *pause* \| *perf* \| *pg-temp* \| *primary-affinity* \| *primary-temp* \| *repair* \| *reweight* \| *reweight-by-pg* \| *rm* \| *scrub* \| *set* \| *setcrushmap* \| *setmaxosd* \| *stat* \| *thrash* \| *tree* \| *unpause* \| *unset* ] ... + +| **ceph** **osd** **crush** [ *add* \| *add-bucket* \| *create-or-move* \| *dump* \| *get-tunable* \| *link* \| *move* \| *remove* \| *rename-bucket* \| *reweight* \| *reweight-all* \| *reweight-subtree* \| *rm* \| *rule* \| *set* \| *set-tunable* \| *show-tunables* \| *tunables* \| *unlink* ] ... + +| **ceph** **osd** **pool** [ *create* \| *delete* \| *get* \| *get-quota* \| *ls* \| *mksnap* \| *rename* \| *rmsnap* \| *set* \| *set-quota* \| *stats* ] ... + +| **ceph** **osd** **tier** [ *add* \| *add-cache* \| *cache-mode* \| *remove* \| *remove-overlay* \| *set-overlay* ] ... + +| **ceph** **pg** [ *debug* \| *deep-scrub* \| *dump* \| *dump_json* \| *dump_pools_json* \| *dump_stuck* \| *force_create_pg* \| *getmap* \| *ls* \| *ls-by-osd* \| *ls-by-pool* \| *ls-by-primary* \| *map* \| *repair* \| *scrub* \| *send_pg_creates* \| *set_full_ratio* \| *set_nearfull_ratio* \| *stat* ] ... + +| **ceph** **quorum** [ *enter* \| *exit* ] + +| **ceph** **quorum_status** + +| **ceph** **report** { ** [ *...* ] } + +| **ceph** **scrub** + +| **ceph** **status** + +| **ceph** **sync** **force** {--yes-i-really-mean-it} {--i-know-what-i-am-doing} + +| **ceph** **tell** * [...]* + +| **ceph** **version** + +Description +=========== + +:program:`ceph` is a control utility which is used for manual deployment and maintenance +of a Ceph cluster. 
It provides a diverse set of commands that allow deployment of +monitors, OSDs, placement groups and MDS, as well as overall maintenance and administration +of the cluster. + +Commands +======== + +auth +---- + +Manage authentication keys. It is used for adding, removing, exporting +or updating authentication keys for a particular entity such as a monitor or +OSD. It uses some additional subcommands. + +Subcommand ``add`` adds authentication info for a particular entity from input +file, or a random key if no input is given, and/or any caps specified in the command. + +Usage:: + + ceph auth add { [...]} + +Subcommand ``caps`` updates caps for **name** from caps specified in the command. + +Usage:: + + ceph auth caps [...] + +Subcommand ``del`` deletes all caps for ``name``. + +Usage:: + + ceph auth del + +Subcommand ``export`` writes the keyring for the requested entity, or the master keyring if +none is given. + +Usage:: + + ceph auth export {} + +Subcommand ``get`` writes a keyring file with the requested key. + +Usage:: + + ceph auth get + +Subcommand ``get-key`` displays the requested key. + +Usage:: + + ceph auth get-key + +Subcommand ``get-or-create`` adds authentication info for a particular entity +from input file, or a random key if no input is given, and/or any caps specified in the +command. + +Usage:: + + ceph auth get-or-create { [...]} + +Subcommand ``get-or-create-key`` gets or adds a key for ``name`` from system/caps +pairs specified in the command. If the key already exists, any given caps must match +the existing caps for that key. + +Usage:: + + ceph auth get-or-create-key { [...]} + +Subcommand ``import`` reads a keyring from the input file. + +Usage:: + + ceph auth import + +Subcommand ``list`` lists authentication state. + +Usage:: + + ceph auth list + +Subcommand ``print-key`` displays the requested key. + +Usage:: + + ceph auth print-key + +Subcommand ``print_key`` displays the requested key. + +Usage:: + + ceph auth print_key + + +compact +------- + +Causes compaction of the monitor's leveldb storage. + +Usage:: + + ceph compact + + +config-key +---------- + +Manage configuration keys. It uses some additional subcommands. + +Subcommand ``del`` deletes a configuration key. + +Usage:: + + ceph config-key del + +Subcommand ``exists`` checks for a configuration key's existence. + +Usage:: + + ceph config-key exists + +Subcommand ``get`` gets the configuration key. + +Usage:: + + ceph config-key get + +Subcommand ``list`` lists configuration keys. + +Usage:: + + ceph config-key list + +Subcommand ``put`` puts a configuration key and value. + +Usage:: + + ceph config-key put {} + + +daemon +------ + +Submit admin-socket commands. + +Usage:: + + ceph daemon {daemon_name|socket_path} {command} ... + +Example:: + + ceph daemon osd.0 help + + +daemonperf +---------- + +Watch performance counters from a Ceph daemon. + +Usage:: + + ceph daemonperf {daemon_name|socket_path} [{interval} [{count}]] + + +df +-- + +Show the cluster's free space status. + +Usage:: + + ceph df {detail} + + +fs +-- + +Manage cephfs filesystems. It uses some additional subcommands. + +Subcommand ``ls`` lists filesystems. + +Usage:: + + ceph fs ls + +Subcommand ``new`` makes a new filesystem using the named metadata and data pools. + +Usage:: + + ceph fs new + +Subcommand ``reset`` is used for disaster recovery only: reset to a single-MDS map. + +Usage:: + + ceph fs reset {--yes-i-really-mean-it} + +Subcommand ``rm`` disables the named filesystem. + +Usage:: + + ceph fs rm {--yes-i-really-mean-it} + + +fsid +---- + +Show the cluster's FSID/UUID.
+ +Usage:: + + ceph fsid + + +health +------ + +Show the cluster's health. + +Usage:: + + ceph health {detail} + + +heap +---- + +Show heap usage info (available only if compiled with tcmalloc). + +Usage:: + + ceph heap dump|start_profiler|stop_profiler|release|stats + + +injectargs +---------- + +Inject configuration arguments into the monitor. + +Usage:: + + ceph injectargs [...] + + +log +--- + +Log supplied text to the monitor log. + +Usage:: + + ceph log [...] + + +mds +--- + +Manage metadata server configuration and administration. It uses some +additional subcommands. + +Subcommand ``add_data_pool`` adds a data pool. + +Usage:: + + ceph mds add_data_pool + +Subcommand ``cluster_down`` takes the mds cluster down. + +Usage:: + + ceph mds cluster_down + +Subcommand ``cluster_up`` brings the mds cluster up. + +Usage:: + + ceph mds cluster_up + +Subcommand ``compat`` manages compatible features. It uses some additional +subcommands. + +Subcommand ``rm_compat`` removes a compatible feature. + +Usage:: + + ceph mds compat rm_compat + +Subcommand ``rm_incompat`` removes an incompatible feature. + +Usage:: + + ceph mds compat rm_incompat + +Subcommand ``show`` shows mds compatibility settings. + +Usage:: + + ceph mds compat show + +Subcommand ``deactivate`` stops the mds. + +Usage:: + + ceph mds deactivate + +Subcommand ``dump`` dumps information, optionally from an epoch. + +Usage:: + + ceph mds dump {} + +Subcommand ``fail`` forces an mds to status fail. + +Usage:: + + ceph mds fail + +Subcommand ``getmap`` gets the MDS map, optionally from an epoch. + +Usage:: + + ceph mds getmap {} + +Subcommand ``newfs`` makes a new filesystem using pools and . + +Usage:: + + ceph mds newfs {--yes-i-really-mean-it} + +Subcommand ``remove_data_pool`` removes a data pool. + +Usage:: + + ceph mds remove_data_pool + +Subcommand ``rm`` removes an inactive mds. + +Usage:: + + ceph mds rm (type.id)> + +Subcommand ``rmfailed`` removes a failed mds. + +Usage:: + + ceph mds rmfailed + +Subcommand ``set`` sets an mds parameter to a given value. + +Usage:: + + ceph mds set max_mds|max_file_size|allow_new_snaps|inline_data {} + +Subcommand ``set_max_mds`` sets the max MDS index. + +Usage:: + + ceph mds set_max_mds + +Subcommand ``set_state`` sets mds state of to . + +Usage:: + + ceph mds set_state + +Subcommand ``setmap`` sets the mds map; must supply the correct epoch number. + +Usage:: + + ceph mds setmap + +Subcommand ``stat`` shows MDS status. + +Usage:: + + ceph mds stat + +Subcommand ``stop`` stops the mds. + +Usage:: + + ceph mds stop + +Subcommand ``tell`` sends a command to a particular mds. + +Usage:: + + ceph mds tell [...] + +mon +--- + +Manage monitor configuration and administration. It uses some additional +subcommands. + +Subcommand ``add`` adds a new monitor named at . + +Usage:: + + ceph mon add + +Subcommand ``dump`` dumps the formatted monmap (optionally from an epoch). + +Usage:: + + ceph mon dump {} + +Subcommand ``getmap`` gets the monmap. + +Usage:: + + ceph mon getmap {} + +Subcommand ``remove`` removes the monitor named . + +Usage:: + + ceph mon remove + +Subcommand ``stat`` summarizes monitor status. + +Usage:: + + ceph mon stat + +mon_status +---------- + +Reports the status of the monitors. + +Usage:: + + ceph mon_status + +osd +--- + +Manage OSD configuration and administration. It uses some additional +subcommands. + +Subcommand ``blacklist`` manages blacklisted clients. It uses some additional +subcommands.
+ +Subcommand ``add`` adds to the blacklist (optionally until seconds +from now). + +Usage:: + + ceph osd blacklist add {} + +Subcommand ``ls`` shows blacklisted clients. + +Usage:: + + ceph osd blacklist ls + +Subcommand ``rm`` removes from the blacklist. + +Usage:: + + ceph osd blacklist rm + +Subcommand ``blocked-by`` prints a histogram of which OSDs are blocking their peers. + +Usage:: + + ceph osd blocked-by + +Subcommand ``create`` creates a new osd (with optional UUID and ID). + +Usage:: + + ceph osd create {} {} + +Subcommand ``crush`` is used for CRUSH management. It uses some additional +subcommands. + +Subcommand ``add`` adds or updates the crushmap position and weight for with + and location . + +Usage:: + + ceph osd crush add [...] + +Subcommand ``add-bucket`` adds a no-parent (probably root) crush bucket of +type . + +Usage:: + + ceph osd crush add-bucket + +Subcommand ``create-or-move`` creates an entry or moves an existing entry for + at/to location . + +Usage:: + + ceph osd crush create-or-move + [...] + +Subcommand ``dump`` dumps the crush map. + +Usage:: + + ceph osd crush dump + +Subcommand ``get-tunable`` gets the crush tunable straw_calc_version. + +Usage:: + + ceph osd crush get-tunable straw_calc_version + +Subcommand ``link`` links an existing entry for under location . + +Usage:: + + ceph osd crush link [...] + +Subcommand ``move`` moves an existing entry for to location . + +Usage:: + + ceph osd crush move [...] + +Subcommand ``remove`` removes from the crush map (everywhere, or just at +). + +Usage:: + + ceph osd crush remove {} + +Subcommand ``rename-bucket`` renames bucket to . + +Usage:: + + ceph osd crush rename-bucket + +Subcommand ``reweight`` changes 's weight to in the crush map. + +Usage:: + + ceph osd crush reweight + +Subcommand ``reweight-all`` recalculates the weights for the tree to +ensure they sum correctly. + +Usage:: + + ceph osd crush reweight-all + +Subcommand ``reweight-subtree`` changes all leaf items beneath +to in the crush map. + +Usage:: + + ceph osd crush reweight-subtree + +Subcommand ``rm`` removes from the crush map (everywhere, or just at +). + +Usage:: + + ceph osd crush rm {} + +Subcommand ``rule`` is used for creating crush rules. It uses some additional +subcommands. + +Subcommand ``create-erasure`` creates a crush rule for an erasure coded pool +created with (default default). + +Usage:: + + ceph osd crush rule create-erasure {} + +Subcommand ``create-simple`` creates a crush rule to start from , +replicate across buckets of type , using a choose mode of +(default firstn; indep best for erasure pools). + +Usage:: + + ceph osd crush rule create-simple {firstn|indep} + +Subcommand ``dump`` dumps the crush rule (default all). + +Usage:: + + ceph osd crush rule dump {} + +Subcommand ``list`` lists crush rules. + +Usage:: + + ceph osd crush rule list + +Subcommand ``ls`` lists crush rules. + +Usage:: + + ceph osd crush rule ls + +Subcommand ``rm`` removes crush rule . + +Usage:: + + ceph osd crush rule rm + +Subcommand ``set``, used alone, sets the crush map from the input file. + +Usage:: + + ceph osd crush set + +Subcommand ``set`` with osdname/osd.id updates the crushmap position and weight +for to with location . + +Usage:: + + ceph osd crush set [...] + +Subcommand ``set-tunable`` sets the crush tunable to . The only +tunable that can be set is straw_calc_version. + +Usage:: + + ceph osd crush set-tunable straw_calc_version + +Subcommand ``show-tunables`` shows the current crush tunables. + +Usage:: + + ceph osd crush show-tunables + +Subcommand ``tree`` shows the crush buckets and items in a tree view.
+ +Usage:: + + ceph osd crush tree + +Subcommand ``tunables`` sets crush tunables values to . + +Usage:: + + ceph osd crush tunables legacy|argonaut|bobtail|firefly|hammer|optimal|default + +Subcommand ``unlink`` unlinks from the crush map (everywhere, or just at +). + +Usage:: + + ceph osd crush unlink {} + +Subcommand ``df`` shows OSD utilization. + +Usage:: + + ceph osd df {plain|tree} + +Subcommand ``deep-scrub`` initiates a deep scrub on the specified osd. + +Usage:: + + ceph osd deep-scrub + +Subcommand ``down`` sets osd(s) [...] down. + +Usage:: + + ceph osd down [...] + +Subcommand ``dump`` prints a summary of the OSD map. + +Usage:: + + ceph osd dump {} + +Subcommand ``erasure-code-profile`` is used for managing the erasure code +profiles. It uses some additional subcommands. + +Subcommand ``get`` gets the erasure code profile . + +Usage:: + + ceph osd erasure-code-profile get + +Subcommand ``ls`` lists all erasure code profiles. + +Usage:: + + ceph osd erasure-code-profile ls + +Subcommand ``rm`` removes the erasure code profile . + +Usage:: + + ceph osd erasure-code-profile rm + +Subcommand ``set`` creates an erasure code profile with [ ...] +pairs. Add a --force at the end to override an existing profile (IT IS RISKY). + +Usage:: + + ceph osd erasure-code-profile set { [...]} + +Subcommand ``find`` finds an osd in the CRUSH map and shows its location. + +Usage:: + + ceph osd find + +Subcommand ``getcrushmap`` gets the CRUSH map. + +Usage:: + + ceph osd getcrushmap {} -| **ceph** [ -m *monaddr* ] [ -w | *command* ... ] +Subcommand ``getmap`` gets the OSD map. +Usage:: -Description -=========== + ceph osd getmap {} + +Subcommand ``getmaxosd`` shows the largest OSD id. + +Usage:: + + ceph osd getmaxosd + +Subcommand ``in`` sets osd(s) [...] in. + +Usage:: + + ceph osd in [...] + +Subcommand ``lost`` marks an osd as permanently lost. THIS DESTROYS DATA IF NO +MORE REPLICAS EXIST, BE CAREFUL. + +Usage:: + + ceph osd lost {--yes-i-really-mean-it} + +Subcommand ``ls`` shows all OSD ids. + +Usage:: + + ceph osd ls {} + +Subcommand ``lspools`` lists pools. + +Usage:: + + ceph osd lspools {} + +Subcommand ``map`` finds the pg for in . + +Usage:: + + ceph osd map + +Subcommand ``metadata`` fetches metadata for osd . + +Usage:: + + ceph osd metadata + +Subcommand ``out`` sets osd(s) [...] out. + +Usage:: + + ceph osd out [...] + +Subcommand ``pause`` pauses the osd. + +Usage:: + + ceph osd pause + +Subcommand ``perf`` prints a dump of OSD perf summary stats. + +Usage:: + + ceph osd perf + +Subcommand ``pg-temp`` sets the pg_temp mapping pgid:[ [...]] (developers +only). + +Usage:: + + ceph osd pg-temp { [...]} + +Subcommand ``pool`` is used for managing data pools. It uses some additional +subcommands. + +Subcommand ``create`` creates a pool. + +Usage:: + + ceph osd pool create {} {replicated|erasure} + {} {} {} + +Subcommand ``delete`` deletes a pool. + +Usage:: + + ceph osd pool delete {} {--yes-i-really-really-mean-it} + +Subcommand ``get`` gets the pool parameter .
+ +Usage:: + + ceph osd pool get size|min_size|crash_replay_interval|pg_num| + pgp_num|crush_ruleset|auid|write_fadvise_dontneed + +Only for tiered pools:: + + ceph osd pool get hit_set_type|hit_set_period|hit_set_count|hit_set_fpp| + target_max_objects|target_max_bytes|cache_target_dirty_ratio|cache_target_dirty_high_ratio| + cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age| + min_read_recency_for_promote + +Only for erasure coded pools:: + + ceph osd pool get erasure_code_profile + +Use ``all`` to get all pool parameters that apply to the pool's type:: + + ceph osd pool get all + +Subcommand ``get-quota`` obtains object or byte limits for a pool. + +Usage:: + + ceph osd pool get-quota + +Subcommand ``ls`` lists pools. + +Usage:: + + ceph osd pool ls {detail} + +Subcommand ``mksnap`` makes a snapshot in . + +Usage:: + + ceph osd pool mksnap + +Subcommand ``rename`` renames to . + +Usage:: + + ceph osd pool rename + +Subcommand ``rmsnap`` removes snapshot from . + +Usage:: + + ceph osd pool rmsnap + +Subcommand ``set`` sets pool parameter to . + +Usage:: + + ceph osd pool set size|min_size|crash_replay_interval|pg_num| + pgp_num|crush_ruleset|hashpspool|nodelete|nopgchange|nosizechange| + hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|debug_fake_ec_pool| + target_max_bytes|target_max_objects|cache_target_dirty_ratio| + cache_target_dirty_high_ratio| + cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|auid| + min_read_recency_for_promote|write_fadvise_dontneed + {--yes-i-really-mean-it} + +Subcommand ``set-quota`` sets object or byte limits on a pool. + +Usage:: + + ceph osd pool set-quota max_objects|max_bytes + +Subcommand ``stats`` obtains stats from all pools, or from the specified pool. + +Usage:: + + ceph osd pool stats {} + +Subcommand ``primary-affinity`` adjusts osd primary-affinity from 0.0 <= +<= 1.0. + +Usage:: + + ceph osd primary-affinity + +Subcommand ``primary-temp`` sets the primary_temp mapping pgid:|-1 (developers +only). + +Usage:: + + ceph osd primary-temp + +Subcommand ``repair`` initiates repair on a specified osd. + +Usage:: + + ceph osd repair + +Subcommand ``reweight`` reweights an osd to 0.0 < < 1.0. + +Usage:: + + ceph osd reweight + +Subcommand ``reweight-by-pg`` reweights OSDs by PG distribution +[overload-percentage-for-consideration, default 120]. + +Usage:: + + ceph osd reweight-by-pg {} { [} + +Subcommand ``rm`` removes osd(s) [...] from the cluster. + +Usage:: + + ceph osd rm [...] + +Subcommand ``scrub`` initiates a scrub on the specified osd. + +Usage:: + + ceph osd scrub + +Subcommand ``set`` sets . + +Usage:: + + ceph osd set full|pause|noup|nodown|noout|noin|nobackfill| + norebalance|norecover|noscrub|nodeep-scrub|notieragent + +Subcommand ``setcrushmap`` sets the crush map from the input file. + +Usage:: + + ceph osd setcrushmap + +Subcommand ``setmaxosd`` sets a new maximum osd value. + +Usage:: + + ceph osd setmaxosd + +Subcommand ``stat`` prints a summary of the OSD map. + +Usage:: + + ceph osd stat + +Subcommand ``thrash`` thrashes OSDs for . + +Usage:: + + ceph osd thrash + +Subcommand ``tier`` is used for managing tiers. It uses some additional +subcommands. + +Subcommand ``add`` adds the tier (the second one) to base pool +(the first one). + +Usage:: + + ceph osd tier add {--force-nonempty} + +Subcommand ``add-cache`` adds a cache (the second one) of size +to existing pool (the first one). + +Usage:: + + ceph osd tier add-cache + +Subcommand ``cache-mode`` specifies the caching mode for cache tier .
+ +Usage:: + + ceph osd tier cache-mode none|writeback|forward|readonly| + readforward|readproxy + +Subcommand ``remove`` removes the tier (the second one) from base pool + (the first one). + +Usage:: -**ceph** is a control utility for communicating with the monitor -cluster of a running Ceph distributed storage system. + ceph osd tier remove -There are three basic modes of operation. +Subcommand ``remove-overlay`` removes the overlay pool for base pool . -Interactive mode ----------------- +Usage:: -To start in interactive mode, no arguments are necessary. Control-d or -'quit' will exit. + ceph osd tier remove-overlay -Watch mode +Subcommand ``set-overlay`` set the overlay pool for base pool to be +. + +Usage:: + + ceph osd tier set-overlay + +Subcommand ``tree`` prints OSD tree. + +Usage:: + + ceph osd tree {} + +Subcommand ``unpause`` unpauses osd. + +Usage:: + + ceph osd unpause + +Subcommand ``unset`` unsets . + +Usage:: + + ceph osd unset full|pause|noup|nodown|noout|noin|nobackfill| + norebalance|norecover|noscrub|nodeep-scrub|notieragent + + +pg +-- + +It is used for managing the placement groups in OSDs. It uses some +additional subcommands. + +Subcommand ``debug`` shows debug info about pgs. + +Usage:: + + ceph pg debug unfound_objects_exist|degraded_pgs_exist + +Subcommand ``deep-scrub`` starts deep-scrub on . + +Usage:: + + ceph pg deep-scrub + +Subcommand ``dump`` shows human-readable versions of pg map (only 'all' valid +with plain). + +Usage:: + + ceph pg dump {all|summary|sum|delta|pools|osds|pgs|pgs_brief} [{all|summary|sum|delta|pools|osds|pgs|pgs_brief...]} + +Subcommand ``dump_json`` shows human-readable version of pg map in json only. + +Usage:: + + ceph pg dump_json {all|summary|sum|delta|pools|osds|pgs|pgs_brief} [{all|summary|sum|delta|pools|osds|pgs|pgs_brief...]} + +Subcommand ``dump_pools_json`` shows pg pools info in json only. + +Usage:: + + ceph pg dump_pools_json + +Subcommand ``dump_stuck`` shows information about stuck pgs. + +Usage:: + + ceph pg dump_stuck {inactive|unclean|stale|undersized|degraded [inactive|unclean|stale|undersized|degraded...]} + {} + +Subcommand ``force_create_pg`` forces creation of pg . + +Usage:: + + ceph pg force_create_pg + +Subcommand ``getmap`` gets binary pg map to -o/stdout. 
+ +Usage:: + + ceph pg getmap + +Subcommand ``ls`` lists pg with specific pool, osd, state + +Usage:: + + ceph pg ls {} {active|clean|down|replay|splitting| + scrubbing|scrubq|degraded|inconsistent|peering|repair| + recovery|backfill_wait|incomplete|stale| remapped| + deep_scrub|backfill|backfill_toofull|recovery_wait| + undersized [active|clean|down|replay|splitting| + scrubbing|scrubq|degraded|inconsistent|peering|repair| + recovery|backfill_wait|incomplete|stale|remapped| + deep_scrub|backfill|backfill_toofull|recovery_wait| + undersized...]} + +Subcommand ``ls-by-osd`` lists pg on osd [osd] + +Usage:: + + ceph pg ls-by-osd {} + {active|clean|down|replay|splitting| + scrubbing|scrubq|degraded|inconsistent|peering|repair| + recovery|backfill_wait|incomplete|stale| remapped| + deep_scrub|backfill|backfill_toofull|recovery_wait| + undersized [active|clean|down|replay|splitting| + scrubbing|scrubq|degraded|inconsistent|peering|repair| + recovery|backfill_wait|incomplete|stale|remapped| + deep_scrub|backfill|backfill_toofull|recovery_wait| + undersized...]} + +Subcommand ``ls-by-pool`` lists pg with pool = [poolname | poolid] + +Usage:: + + ceph pg ls-by-pool {} {active| + clean|down|replay|splitting| + scrubbing|scrubq|degraded|inconsistent|peering|repair| + recovery|backfill_wait|incomplete|stale| remapped| + deep_scrub|backfill|backfill_toofull|recovery_wait| + undersized [active|clean|down|replay|splitting| + scrubbing|scrubq|degraded|inconsistent|peering|repair| + recovery|backfill_wait|incomplete|stale|remapped| + deep_scrub|backfill|backfill_toofull|recovery_wait| + undersized...]} + +Subcommand ``ls-by-primary`` lists pg with primary = [osd] + +Usage:: + + ceph pg ls-by-primary {} + {active|clean|down|replay|splitting| + scrubbing|scrubq|degraded|inconsistent|peering|repair| + recovery|backfill_wait|incomplete|stale| remapped| + deep_scrub|backfill|backfill_toofull|recovery_wait| + undersized [active|clean|down|replay|splitting| + scrubbing|scrubq|degraded|inconsistent|peering|repair| + recovery|backfill_wait|incomplete|stale|remapped| + deep_scrub|backfill|backfill_toofull|recovery_wait| + undersized...]} + +Subcommand ``map`` shows mapping of pg to osds. + +Usage:: + + ceph pg map + +Subcommand ``repair`` starts repair on . + +Usage:: + + ceph pg repair + +Subcommand ``scrub`` starts scrub on . + +Usage:: + + ceph pg scrub + +Subcommand ``send_pg_creates`` triggers pg creates to be issued. + +Usage:: + + ceph pg send_pg_creates + +Subcommand ``set_full_ratio`` sets ratio at which pgs are considered full. + +Usage:: + + ceph pg set_full_ratio + +Subcommand ``set_nearfull_ratio`` sets ratio at which pgs are considered nearly +full. + +Usage:: + + ceph pg set_nearfull_ratio + +Subcommand ``stat`` shows placement group status. + +Usage:: + + ceph pg stat + + +quorum +------ + +Enter or exit quorum. + +Usage:: + + ceph quorum enter|exit + + +quorum_status +------------- + +Reports status of monitor quorum. + +Usage:: + + ceph quorum_status + + +report +------ + +Reports full status of cluster, optional title tag strings. + +Usage:: + + ceph report { [...]} + + +scrub +----- + +Scrubs the monitor stores. + +Usage:: + + ceph scrub + + +status +------ + +Shows cluster status. + +Usage:: + + ceph status + + +sync force ---------- -Watch mode shows cluster state changes as they occur. For example:: +Forces sync of and clear monitor store. + +Usage:: + + ceph sync force {--yes-i-really-mean-it} {--i-know-what-i-am-doing} + + +tell +---- + +Sends a command to a specific daemon. 
+ +Usage:: - ceph -w + ceph tell [...] -Command line mode ------------------ +version +------- -Finally, to send a single instruction to the monitor cluster (and wait -for a response), the command can be specified on the command line. +Show mon daemon version +Usage:: + + ceph version Options ======= @@ -56,42 +1351,87 @@ Options .. option:: -c ceph.conf, --conf=ceph.conf Use ceph.conf configuration file instead of the default - /etc/ceph/ceph.conf to determine monitor addresses during startup. + ``/etc/ceph/ceph.conf`` to determine monitor addresses during startup. -.. option:: -m monaddress[:port] +.. option:: --id CLIENT_ID, --user CLIENT_ID - Connect to specified monitor (instead of looking through ceph.conf). + Client id for authentication. +.. option:: --name CLIENT_NAME, -n CLIENT_NAME -Examples -======== + Client name for authentication. + +.. option:: --cluster CLUSTER + + Name of the Ceph cluster. + +.. option:: --admin-daemon ADMIN_SOCKET + + Submit admin-socket commands. + +.. option:: --admin-socket ADMIN_SOCKET_NOPE + + You probably mean --admin-daemon + +.. option:: -s, --status + + Show cluster status. + +.. option:: -w, --watch + + Watch live cluster changes. + +.. option:: --watch-debug + + Watch debug events. + +.. option:: --watch-info + + Watch info events. + +.. option:: --watch-sec + + Watch security events. + +.. option:: --watch-warn + + Watch warning events. + +.. option:: --watch-error + + Watch error events. + +.. option:: --version, -v + + Display version. -To grab a copy of the current OSD map:: +.. option:: --verbose - ceph -m 1.2.3.4:6789 osd getmap -o osdmap + Make verbose. -To get a dump of placement group (PG) state:: +.. option:: --concise - ceph pg dump -o pg.txt + Make less verbose. +.. option:: -f {json,json-pretty,xml,xml-pretty,plain}, --format -Monitor commands -================ + Format of output. -A more complete summary of commands understood by the monitor cluster can be found in the -online documentation, at +.. option:: --connect-timeout CLUSTER_TIMEOUT - http://ceph.com/docs/master/rados/operations/control + Set a timeout for connecting to the cluster. Availability ============ -**ceph** is part of the Ceph distributed storage system. Please refer to the Ceph documentation at -http://ceph.com/docs for more information. +:program:`ceph` is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to +the Ceph documentation at http://ceph.com/docs for more information. See also ======== -:doc:`ceph `\(8), +:doc:`ceph-mon `\(8), +:doc:`ceph-osd `\(8), +:doc:`ceph-mds `\(8) diff --git a/doc/man/8/cephfs.rst b/doc/man/8/cephfs.rst index 39f105b4386f4..0ad91d045df66 100644 --- a/doc/man/8/cephfs.rst +++ b/doc/man/8/cephfs.rst @@ -1,3 +1,5 @@ +:orphan: + ============================================ cephfs -- ceph file system options utility ============================================ @@ -58,7 +60,7 @@ Setting options: .. option:: -o --osd - Set the preferred OSD to use as the primary + Set the preferred OSD to use as the primary (deprecated and ignored) Limitations @@ -86,7 +88,7 @@ in modern versions of the Ceph servers; do not use it. Availability ============ -**cephfs** is part of the Ceph distributed storage system. Please refer +**cephfs** is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to the Ceph documentation at http://ceph.com/docs for more information. 
diff --git a/doc/man/8/crushtool.rst b/doc/man/8/crushtool.rst index 27f1de757a322..3c449476b30ef 100644 --- a/doc/man/8/crushtool.rst +++ b/doc/man/8/crushtool.rst @@ -1,3 +1,5 @@ +:orphan: + ========================================== crushtool -- CRUSH map manipulation tool ========================================== @@ -47,7 +49,7 @@ The tool has four modes of operation. object names. See below for a detailed explanation. Unlike other Ceph tools, **crushtool** does not accept generic options -such as **--debug-crush** from the command line. They can however be +such as **--debug-crush** from the command line. They can, however, be provided via the CEPH_ARGS environment variable. For instance, to silence all output from the CRUSH subsystem:: @@ -60,13 +62,15 @@ Running tests with --test The test mode will use the input crush map ( as specified with **-i map** ) and perform a dry run of CRUSH mapping or random placement ( if **--simulate** is set ). On completion, two kinds of reports can be -created. The **--show-...** options output human readable information -on stderr. The **--output-csv** option creates CSV files that are +created. +1) The **--show-...** option outputs human readable information +on stderr. +2) The **--output-csv** option creates CSV files that are documented by the **--help-output** option. .. option:: --show-statistics - for each rule display the mapping of each object. For instance:: + For each rule, displays the mapping of each object. For instance:: CRUSH rule 1 x 24 [11,6] @@ -80,7 +84,7 @@ documented by the **--help-output** option. mapped **1024** objects to **result size == 5** devices when trying to map them to **num_rep 5** replicas. When it fails to provide the required mapping, presumably because the number of **tries** must - be increased, a breakdown of the failures is displays. For instance:: + be increased, a breakdown of the failures is displayed. For instance:: rule 1 (metadata) num_rep 10 result size == 8: 4/1024 rule 1 (metadata) num_rep 10 result size == 9: 93/1024 @@ -92,17 +96,17 @@ documented by the **--help-output** option. .. option:: --show-bad-mappings - display which object failed to be mapped to the required number of + Displays which object failed to be mapped to the required number of devices. For instance:: bad mapping rule 1 x 781 num_rep 7 result [8,10,2,11,6,9] shows that when rule **1** was required to map **7** devices, it - could only map six : **[8,10,2,11,6,9]**. + could map only six : **[8,10,2,11,6,9]**. .. option:: --show-utilization - display the expected and actual utilisation for each device, for + Displays the expected and actual utilisation for each device, for each number of replicas. For instance:: device 0: stored : 951 expected : 853.333 @@ -114,13 +118,13 @@ documented by the **--help-output** option. .. option:: --show-utilization-all - displays the same as **--show-utilization** but does not suppress + Displays the same as **--show-utilization** but does not suppress output when the weight of a device is zero. Implies **--show-statistics**. .. option:: --show-choose-tries - display how many attempts were needed to find a device mapping. + Displays how many attempts were needed to find a device mapping. For instance:: 0: 95224 @@ -134,10 +138,10 @@ documented by the **--help-output** option. .. option:: --output-csv - create CSV files (in the current directory) containing information + Creates CSV files (in the current directory) containing information documented by **--help-output**. 
The files are named after the rule used when collecting the statistics. For instance, if the rule - metadata is used, the CSV files will be:: + : 'metadata' is used, the CSV files will be:: metadata-absolute_weights.csv metadata-device_utilization.csv @@ -153,7 +157,7 @@ documented by the **--help-output** option. .. option:: --output-name NAME - prepend **NAME** to the file names generated when **--output-csv** + Prepend **NAME** to the file names generated when **--output-csv** is specified. For instance **--output-name FOO** will create files:: @@ -244,7 +248,7 @@ creating a new Ceph cluster. They can be further edited with:: Availability ============ -**crushtool** is part of the Ceph distributed storage system. Please +**crushtool** is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to the Ceph documentation at http://ceph.com/docs for more information. diff --git a/doc/man/8/librados-config.rst b/doc/man/8/librados-config.rst index 37e6760456663..940e8c2e4fb5a 100644 --- a/doc/man/8/librados-config.rst +++ b/doc/man/8/librados-config.rst @@ -1,3 +1,5 @@ +:orphan: + ======================================================= librados-config -- display information about librados ======================================================= @@ -32,7 +34,7 @@ Options Availability ============ -**librados-config** is part of the Ceph distributed storage system. +**librados-config** is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to the Ceph documentation at http://ceph.com/docs for more information. diff --git a/doc/man/8/monmaptool.rst b/doc/man/8/monmaptool.rst index 1680e893e107e..97d5d40324bdc 100644 --- a/doc/man/8/monmaptool.rst +++ b/doc/man/8/monmaptool.rst @@ -1,3 +1,5 @@ +:orphan: + ========================================================== monmaptool -- ceph monitor cluster map manipulation tool ========================================================== @@ -93,7 +95,7 @@ To replace one monitor:: Availability ============ -**monmaptool** is part of the Ceph distributed storage system. Please +**monmaptool** is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to the Ceph documentation at http://ceph.com/docs for more information. diff --git a/doc/man/8/mount.ceph.rst b/doc/man/8/mount.ceph.rst index 89995f58716cd..c257a70f71647 100644 --- a/doc/man/8/mount.ceph.rst +++ b/doc/man/8/mount.ceph.rst @@ -1,3 +1,5 @@ +:orphan: + ======================================== mount.ceph -- mount a ceph file system ======================================== @@ -134,7 +136,7 @@ If there are multiple monitors:: mount.ceph monhost1,monhost2,monhost3:/ /mnt/foo -If :doc:`ceph-mon `\(8) is running on a non-standard +If :doc:`ceph-mon `\(8) is running on a non-standard port:: mount.ceph monhost1:7000,monhost2:7000,monhost3:7000:/ /mnt/foo @@ -152,7 +154,7 @@ automatically invoked by mount(8) like so:: Availability ============ -**mount.ceph** is part of the Ceph distributed storage system. Please +**mount.ceph** is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to the Ceph documentation at http://ceph.com/docs for more information. 
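A common variation on the mount examples above is mounting with cephx authentication enabled. A minimal sketch, assuming the standard ``name`` and ``secretfile`` mount options and a placeholder secret-file path::

    # mount as client.admin, reading the secret key from a root-only file
    mount.ceph monhost1:/ /mnt/foo -o name=admin,secretfile=/etc/ceph/admin.secret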
diff --git a/doc/man/8/osdmaptool.rst b/doc/man/8/osdmaptool.rst index 120896bd583b2..cf274244fdc1a 100644 --- a/doc/man/8/osdmaptool.rst +++ b/doc/man/8/osdmaptool.rst @@ -1,3 +1,5 @@ +:orphan: + ====================================================== osdmaptool -- ceph osd cluster map manipulation tool ====================================================== @@ -63,7 +65,7 @@ To view the result:: Availability ============ -**osdmaptool** is part of the Ceph distributed storage system. Please +**osdmaptool** is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to the Ceph documentation at http://ceph.com/docs for more information. diff --git a/doc/man/8/rados.rst b/doc/man/8/rados.rst index e9088d23a0d6a..ce8a8032a95a3 100644 --- a/doc/man/8/rados.rst +++ b/doc/man/8/rados.rst @@ -1,3 +1,5 @@ +:orphan: + ======================================= rados -- rados object storage utility ======================================= @@ -51,6 +53,15 @@ Options Connect to specified monitor (instead of looking through ceph.conf). +.. option:: -b block_size + + Set the block size for put/get ops and for write benchmarking. + +.. option:: --striper + + Uses the striping API of rados rather than the default one. + Available for stat, get, put, truncate, rm, ls and all xattr related operations. + Global commands =============== @@ -102,7 +113,8 @@ Pool specific commands sequential or random. Before running one of the reading benchmarks, run a write benchmark with the *--no-cleanup* option. The default object size is 4 MB, and the default number of simulated threads - (parallel writes) is 16. + (parallel writes) is 16. + Note: the -b *objsize* option is valid only in *write* mode. :command:`cleanup` @@ -159,7 +171,7 @@ To read a previously snapshotted version of an object:: Availability ============ -**rados** is part of the Ceph distributed storage system. Please refer to +**rados** is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to the Ceph documentation at http://ceph.com/docs for more information. diff --git a/doc/man/8/radosgw-admin.rst b/doc/man/8/radosgw-admin.rst index cc7b626cab74d..6f987d5c818c3 100644 --- a/doc/man/8/radosgw-admin.rst +++ b/doc/man/8/radosgw-admin.rst @@ -1,3 +1,5 @@ +:orphan: + ================================================================= radosgw-admin -- rados REST gateway user administration utility ================================================================= @@ -13,75 +15,207 @@ Synopsis Description =========== -**radosgw-admin** is a RADOS gateway user administration utility. It +:program:`radosgw-admin` is a RADOS gateway user administration utility. It allows creating and modifying users. Commands ======== -*command* can be one of the following options: +The :program:`radosgw-admin` utility provides many commands for administrative +purposes, which are as follows: :command:`user create` - Create a new user + Create a new user. :command:`user modify` - Modify a user + Modify a user. :command:`user info` Display information of a user, and any potentially available - subusers and keys + subusers and keys. :command:`user rm` - Remove a user + Remove a user. + +:command:`user suspend` + Suspend a user. + +:command:`user enable` + Re-enable user after suspension. + +:command:`user check` + Check user info. + +:command:`user stats` + Show user stats as accounted by quota subsystem. + +:command:`caps add` + Add user capabilities. + +:command:`caps rm` + Remove user capabilities.
:command:`subuser create` - Create a new subuser (primarily useful for clients using the Swift API) + Create a new subuser (primarily useful for clients using the Swift API). :command:`subuser modify` - Modify a subuser + Modify a subuser. :command:`subuser rm` - Remove a subuser + Remove a subuser. + +:command:`key create` + Create access key. + +:command:`key rm` + Remove access key. :command:`bucket list` - List all buckets + List all buckets. + +:command:`bucket link` + Link bucket to specified user. :command:`bucket unlink` - Remove a bucket + Unlink bucket from specified user. + +:command:`bucket stats` + Returns bucket statistics. :command:`bucket rm` - Remove a bucket + Remove a bucket. + +:command:`bucket check` + Check bucket index. :command:`object rm` - Remove an object + Remove an object. -:command:`key create` - Create an access key +:command:`object unlink` + Unlink object from bucket index. -:command:`key rm` - Remove an access key +:command:`quota set` + Set quota params. + +:command:`quota enable` + Enable quota. + +:command:`quota disable` + Disable quota. + +:command:`region get` + Show region info. + +:command:`regions list` + List all regions set on this cluster. + +:command:`region set` + Set region info (requires infile). + +:command:`region default` + Set default region. + +:command:`region-map get` + Show region-map. + +:command:`region-map set` + Set region-map (requires infile). + +:command:`zone get` + Show zone cluster params. + +:command:`zone set` + Set zone cluster params (requires infile). + +:command:`zone list` + List all zones set on this cluster. :command:`pool add` - Add an existing pool for data placement + Add an existing pool for data placement. :command:`pool rm` - Remove an existing pool from data placement set + Remove an existing pool from data placement set. :command:`pools list` - List placement active set + List placement active set. :command:`policy` - Display bucket/object policy + Display bucket/object policy. + +:command:`log list` + List log objects. :command:`log show` - Show the log of a bucket (with a specified date) + Dump a log from specific object or (bucket + date + bucket-id). + +:command:`log rm` + Remove log object. :command:`usage show` - Show the usage information (with optional user and date range) + Show the usage information (with optional user and date range). :command:`usage trim` - Trim usage information (with optional user and date range) + Trim usage information (with optional user and date range). + +:command:`temp remove` + Remove temporary objects that were created up to specified date + (and optional time). + +:command:`gc list` + Dump expired garbage collection objects (specify --include-all to list all + entries, including unexpired). + +:command:`gc process` + Manually process garbage. + +:command:`metadata get` + Get metadata info. + +:command:`metadata put` + Put metadata info. + +:command:`metadata rm` + Remove metadata info. + +:command:`metadata list` + List metadata info. + +:command:`mdlog list` + List metadata log. + +:command:`mdlog trim` + Trim metadata log. + +:command:`bilog list` + List bucket index log. + +:command:`bilog trim` + Trim bucket index log (use start-marker, end-marker). + +:command:`datalog list` + List data log. + +:command:`datalog trim` + Trim data log. + +:command:`opstate list` + List stateful operations entries (use client_id, op_id, object). + +:command:`opstate set` + Set state on an entry (use client_id, op_id, object, state). 
+ +:command:`opstate renew` + Renew state on an entry (use client_id, op_id, object). + +:command:`opstate rm` + Remove entry (use client_id, op_id, object). + +:command:`replicalog get` + Get replica metadata log entry. + +:command:`replicalog delete` + Delete replica metadata log entry. Options @@ -89,7 +223,7 @@ Options .. option:: -c ceph.conf, --conf=ceph.conf - Use *ceph.conf* configuration file instead of the default + Use ``ceph.conf`` configuration file instead of the default ``/etc/ceph/ceph.conf`` to determine monitor addresses during startup. @@ -101,17 +235,45 @@ Options The radosgw user ID. -.. option:: --secret=secret +.. option:: --subuser= - The secret associated with a given key. + Name of the subuser. + +.. option:: --email=email + + The e-mail address of the user. .. option:: --display-name=name Configure the display name of the user. -.. option:: --email=email +.. option:: --access-key= + + S3 access key. + +.. option:: --gen-access-key + + Generate random access key (for S3). + +.. option:: --secret=secret + + The secret associated with a given key. + +.. option:: --gen-secret + + Generate random secret key. + +.. option:: --key-type= + + key type, options are: swift, S3. + +.. option:: --temp-url-key[-2]= - The e-mail address of the user + Temporary url key. + +.. option:: --system + + Set the system flag on the user. .. option:: --bucket=bucket @@ -123,32 +285,124 @@ Options .. option:: --date=yyyy-mm-dd - The date needed for some commands + The date needed for some commands. .. option:: --start-date=yyyy-mm-dd - The start date needed for some commands + The start date needed for some commands. .. option:: --end-date=yyyy-mm-dd - The end date needed for some commands + The end date needed for some commands. + +.. option:: --shard-id= + + Optional for mdlog list. Required for ``mdlog trim``, + ``replica mdlog get/delete``, ``replica datalog get/delete``. .. option:: --auth-uid=auid - The librados auid + The librados auid. .. option:: --purge-data - Remove user data before user removal + Remove user data before user removal. + +.. option:: --purge-keys + + When specified, subuser removal will also purge all the subuser keys. .. option:: --purge-objects - Remove all objects before bucket removal + Remove all objects before bucket removal. .. option:: --lazy-remove - Defer removal of object tail - + Defer removal of object tail. + +.. option:: --metadata-key= + + Key to retrieve metadata from with ``metadata get``. + +.. option:: --rgw-region= + + Region in which radosgw is running. + +.. option:: --rgw-zone= + + Zone in which radosgw is running. + +.. option:: --fix + + Besides checking bucket index, will also fix it. + +.. option:: --check-objects + + bucket check: Rebuilds bucket index according to actual objects state. + +.. option:: --format= + + Specify output format for certain operations: xml, json. + +.. option:: --sync-stats + + Option to 'user stats', update user stats with current stats reported by + user's buckets indexes. + +.. option:: --show-log-entries= + + Enable/disable dump of log entries on log show. + +.. option:: --show-log-sum= + + Enable/disable dump of log summation on log show. + +.. option:: --skip-zero-entries + + Log show only dumps entries that don't have zero value in one of the numeric + field. + +.. option:: --infile + + Specify a file to read in when setting data. + +.. option:: --state= + + Specify a state for the opstate set command. + +.. 
option:: --replica-log-type + + Replica log type (metadata, data, bucket), required for replica log + operations. + +.. option:: --categories= + + Comma-separated list of categories, used in usage show. + +.. option:: --caps= + + List of caps (e.g., "usage=read, write; user=read"). + +.. option:: --yes-i-really-mean-it + + Required for certain operations. + + +Quota Options +============= + +.. option:: --max-objects + + Specify max objects (negative value to disable). + +.. option:: --max-size + + Specify max size (in bytes, negative value to disable). + +.. option:: --quota-scope + + Scope of quota (bucket, user). + Examples ======== @@ -182,7 +436,7 @@ Remove a bucket:: Show the logs of a bucket from April 1st, 2012:: - $ radosgw-admin log show --bucket=foo --date=2012=04-01 + $ radosgw-admin log show --bucket=foo --date=2012-04-01 Show usage information for user from March 1st to (but not including) April 1st, 2012:: @@ -197,14 +451,17 @@ Trim usage information for user until March 1st, 2012:: $ radosgw-admin usage trim --uid=johnny --end-date=2012-04-01 + Availability ============ -**radosgw-admin** is part of the Ceph distributed storage system. Please -refer to the Ceph documentation at http://ceph.com/docs for more -information. +:program:`radosgw-admin` is part of Ceph, a massively scalable, open-source, +distributed storage system. Please refer to the Ceph documentation at +http://ceph.com/docs for more information. + See also ======== :doc:`ceph `\(8) +:doc:`radosgw `\(8) diff --git a/doc/man/8/radosgw.rst b/doc/man/8/radosgw.rst index b9cdea304bdb9..1f74dec6d0b7b 100644 --- a/doc/man/8/radosgw.rst +++ b/doc/man/8/radosgw.rst @@ -1,3 +1,5 @@ +:orphan: + =============================== radosgw -- rados REST gateway =============================== @@ -13,7 +15,7 @@ Synopsis Description =========== -**radosgw** is an HTTP REST gateway for the RADOS object store, a part +:program:`radosgw` is an HTTP REST gateway for the RADOS object store, a part of the Ceph distributed storage system. It is implemented as a FastCGI module using libfcgi, and can be used in conjunction with any FastCGI capable web server. @@ -24,13 +26,12 @@ Options .. option:: -c ceph.conf, --conf=ceph.conf - Use *ceph.conf* configuration file instead of the default + Use ``ceph.conf`` configuration file instead of the default ``/etc/ceph/ceph.conf`` to determine monitor addresses during startup. .. option:: -m monaddress[:port] - Connect to specified monitor (instead of looking through - ``ceph.conf``). + Connect to specified monitor (instead of looking through ``ceph.conf``). .. option:: -i ID, --id ID @@ -68,69 +69,153 @@ Options Configuration ============= -Currently it's the easiest to use the RADOS Gateway with Apache and mod_fastcgi:: +Earlier, RADOS Gateway had to be configured with ``Apache`` and ``mod_fastcgi``. +Now, the ``mod_proxy_fcgi`` module is used instead of ``mod_fastcgi``. +``mod_proxy_fcgi`` works differently from a traditional FastCGI module. This +module requires the service of ``mod_proxy``, which provides support for the +FastCGI protocol. So, to be able to handle the FastCGI protocol, both ``mod_proxy`` +and ``mod_proxy_fcgi`` have to be present in the server. Unlike ``mod_fastcgi``, +``mod_proxy_fcgi`` cannot start the application process. Some platforms have +``fcgistarter`` for that purpose. However, external launching of the application +or process management may be available in the FastCGI application framework +in use.
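As a minimal sketch of the module prerequisite just described (Debian-based systems only; RPM-based distros typically load the modules through ``LoadModule`` lines in the httpd configuration instead)::

    # enabling mod_proxy_fcgi pulls in mod_proxy as a dependency
    sudo a2enmod proxy_fcgi
    sudo service apache2 restart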
- FastCgiExternalServer /var/www/s3gw.fcgi -socket /tmp/radosgw.sock +``Apache`` can be configured in a way that enables ``mod_proxy_fcgi`` to be used +with localhost tcp or through unix domain socket. ``mod_proxy_fcgi`` that doesn't +support unix domain socket such as the ones in Apache 2.2 and earlier versions of +Apache 2.4, needs to be configured for use with localhost tcp. Later versions of +Apache like Apache 2.4.9 or later support unix domain socket and as such they +allow for the configuration with unix domain socket instead of localhost tcp. - - ServerName rgw.example1.com - ServerAlias rgw - ServerAdmin webmaster@example1.com - DocumentRoot /var/www +The following steps show the configuration in Ceph's configuration file i.e, +``/etc/ceph/ceph.conf`` and the gateway configuration file i.e, +``/etc/httpd/conf.d/rgw.conf`` (RPM-based distros) or +``/etc/apache2/conf-available/rgw.conf`` (Debian-based distros) with localhost +tcp and through unix domain socket: - RewriteEngine On - RewriteRule ^/([a-zA-Z0-9-_.]*)([/]?.*) /s3gw.fcgi?page=$1¶ms=$2&%{QUERY_STRING} [E=HTTP_AUTHORIZATION:%{HTTP:Authorization},L] +#. For distros with Apache 2.2 and early versions of Apache 2.4 that use + localhost TCP and do not support Unix Domain Socket, append the following + contents to ``/etc/ceph/ceph.conf``:: - - - Options +ExecCGI - AllowOverride All - SetHandler fastcgi-script - Order allow,deny - Allow from all - AuthBasicAuthoritative Off - - + [client.radosgw.gateway] + host = {hostname} + keyring = /etc/ceph/ceph.client.radosgw.keyring + rgw socket path = "" + log file = /var/log/radosgw/client.radosgw.gateway.log + rgw frontends = fastcgi socket_port=9000 socket_host=0.0.0.0 + rgw print continue = false - AllowEncodedSlashes On - ServerSignature Off - +#. Add the following content in the gateway configuration file: -And the corresponding radosgw script (/var/www/s3gw.fcgi):: + For Debian/Ubuntu add in ``/etc/apache2/conf-available/rgw.conf``:: - #!/bin/sh - exec /usr/bin/radosgw -c /etc/ceph/ceph.conf -n client.radosgw.gateway + + ServerName localhost + DocumentRoot /var/www/html -The radosgw daemon is a standalone process which needs a configuration -section in the ceph.conf The section name should start with -'client.radosgw.' as specified in /etc/init.d/radosgw:: + ErrorLog /var/log/apache2/rgw_error.log + CustomLog /var/log/apache2/rgw_access.log combined - [client.radosgw.gateway] - host = gateway - keyring = /etc/ceph/keyring.radosgw.gateway - rgw socket path = /tmp/radosgw.sock + # LogLevel debug + + RewriteEngine On + + RewriteRule .* - [E=HTTP_AUTHORIZATION:%{HTTP:Authorization},L] + + SetEnv proxy-nokeepalive 1 + + ProxyPass / fcgi://localhost:9000/ + + + + For CentOS/RHEL add in ``/etc/httpd/conf.d/rgw.conf``:: + + + ServerName localhost + DocumentRoot /var/www/html + + ErrorLog /var/log/httpd/rgw_error.log + CustomLog /var/log/httpd/rgw_access.log combined + + # LogLevel debug + + RewriteEngine On + + RewriteRule .* - [E=HTTP_AUTHORIZATION:%{HTTP:Authorization},L] + + SetEnv proxy-nokeepalive 1 + + ProxyPass / fcgi://localhost:9000/ + + + +#. For distros with Apache 2.4.9 or later that support Unix Domain Socket, + append the following configuration to ``/etc/ceph/ceph.conf``:: + + [client.radosgw.gateway] + host = {hostname} + keyring = /etc/ceph/ceph.client.radosgw.keyring + rgw socket path = /var/run/ceph/ceph.radosgw.gateway.fastcgi.sock + log file = /var/log/radosgw/client.radosgw.gateway.log + rgw print continue = false + +#. 
Add the following content in the gateway configuration file: + + For CentOS/RHEL add in ``/etc/httpd/conf.d/rgw.conf``:: + + + ServerName localhost + DocumentRoot /var/www/html + + ErrorLog /var/log/httpd/rgw_error.log + CustomLog /var/log/httpd/rgw_access.log combined + + # LogLevel debug + + RewriteEngine On + + RewriteRule .* - [E=HTTP_AUTHORIZATION:%{HTTP:Authorization},L] + + SetEnv proxy-nokeepalive 1 + + ProxyPass / unix:///var/run/ceph/ceph.radosgw.gateway.fastcgi.sock|fcgi://localhost:9000/ + + + + The latest version of Ubuntu i.e, 14.04 ships with ``Apache 2.4.7`` that + does not have Unix Domain Socket support in it and as such it has to be + configured with localhost tcp. The Unix Domain Socket support is available in + ``Apache 2.4.9`` and later versions. A bug has been filed to backport the UDS + support to ``Apache 2.4.7`` for ``Ubuntu 14.04``. + See: https://bugs.launchpad.net/ubuntu/+source/apache2/+bug/1411030 + +#. Generate a key for radosgw to use for authentication with the cluster. :: + + ceph-authtool -C -n client.radosgw.gateway --gen-key /etc/ceph/keyring.radosgw.gateway + ceph-authtool -n client.radosgw.gateway --cap mon 'allow rw' --cap osd 'allow rwx' /etc/ceph/keyring.radosgw.gateway + +#. Add the key to the auth entries. :: -You will also have to generate a key for the radosgw to use for -authentication with the cluster:: + ceph auth add client.radosgw.gateway --in-file=keyring.radosgw.gateway - ceph-authtool -C -n client.radosgw.gateway --gen-key /etc/ceph/keyring.radosgw.gateway - ceph-authtool -n client.radosgw.gateway --cap mon 'allow rw' --cap osd 'allow rwx' /etc/ceph/keyring.radosgw.gateway +#. Start Apache and radosgw. -And add the key to the auth entries:: + Debian/Ubuntu:: - ceph auth add client.radosgw.gateway --in-file=keyring.radosgw.gateway + sudo /etc/init.d/apache2 start + sudo /etc/init.d/radosgw start -Now you can start Apache and the radosgw daemon:: + CentOS/RHEL:: - /etc/init.d/apache2 start - /etc/init.d/radosgw start + sudo apachectl start + sudo /etc/init.d/ceph-radosgw start Usage Logging ============= -The **radosgw** maintains an asynchronous usage log. It accumulates +:program:`radosgw` maintains an asynchronous usage log. It accumulates statistics about user operations and flushes it periodically. The -logs can be accessed and managed through **radosgw-admin**. +logs can be accessed and managed through :program:`radosgw-admin`. The information that is being logged contains total data transfer, total operations, and total successful operations. The data is being @@ -159,9 +244,9 @@ synchronous flush. Availability ============ -**radosgw** is part of the Ceph distributed storage system. Please refer -to the Ceph documentation at http://ceph.com/docs for more -information. +:program:`radosgw` is part of Ceph, a massively scalable, open-source, distributed +storage system. Please refer to the Ceph documentation at http://ceph.com/docs for +more information. See also diff --git a/doc/man/8/rbd-fuse.rst b/doc/man/8/rbd-fuse.rst index 8ffdb4fb8d4d9..394bdba7af08a 100644 --- a/doc/man/8/rbd-fuse.rst +++ b/doc/man/8/rbd-fuse.rst @@ -1,3 +1,5 @@ +:orphan: + ======================================= rbd-fuse -- expose rbd images as files ======================================= @@ -43,7 +45,7 @@ Any options not recognized by rbd-fuse will be passed on to libfuse. Availability ============ -**rbd-fuse** is part of the Ceph distributed storage system. 
Please refer to +**rbd-fuse** is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to the Ceph documentation at http://ceph.com/docs for more information. diff --git a/doc/man/8/rbd-replay-many.rst b/doc/man/8/rbd-replay-many.rst index f397d45acdd32..5fb93498a2372 100644 --- a/doc/man/8/rbd-replay-many.rst +++ b/doc/man/8/rbd-replay-many.rst @@ -1,3 +1,5 @@ +:orphan: + ================================================================================== rbd-replay-many -- replay a rados block device (RBD) workload on several clients ================================================================================== @@ -60,7 +62,7 @@ This results in the following commands being executed:: Availability ============ -**rbd-replay-many** is part of the Ceph distributed storage system. Please refer to +**rbd-replay-many** is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to the Ceph documentation at http://ceph.com/docs for more information. diff --git a/doc/man/8/rbd-replay-prep.rst b/doc/man/8/rbd-replay-prep.rst index e71894af97f10..4b78ac3fcdbc0 100644 --- a/doc/man/8/rbd-replay-prep.rst +++ b/doc/man/8/rbd-replay-prep.rst @@ -1,3 +1,5 @@ +:orphan: + ==================================================================================== rbd-replay-prep -- prepare captured rados block device (RBD) workloads for replay ==================================================================================== @@ -39,7 +41,7 @@ To prepare workload1-trace for replay:: Availability ============ -**rbd-replay-prep** is part of the Ceph distributed storage system. Please refer to +**rbd-replay-prep** is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to the Ceph documentation at http://ceph.com/docs for more information. diff --git a/doc/man/8/rbd-replay.rst b/doc/man/8/rbd-replay.rst index e590659c8948a..74b8018f5674d 100644 --- a/doc/man/8/rbd-replay.rst +++ b/doc/man/8/rbd-replay.rst @@ -1,3 +1,5 @@ +:orphan: + ========================================================= rbd-replay -- replay rados block device (RBD) workloads ========================================================= @@ -65,7 +67,7 @@ To replay workload1 but use test_image instead of prod_image:: Availability ============ -**rbd-replay** is part of the Ceph distributed storage system. Please refer to +**rbd-replay** is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to the Ceph documentation at http://ceph.com/docs for more information. diff --git a/doc/man/8/rbd.rst b/doc/man/8/rbd.rst index 542454715fa2c..d7eb72d7c9d22 100644 --- a/doc/man/8/rbd.rst +++ b/doc/man/8/rbd.rst @@ -1,3 +1,5 @@ +:orphan: + =============================================== rbd -- manage rados block device (RBD) images =============================================== @@ -33,7 +35,7 @@ Options Connect to specified monitor (instead of looking through ceph.conf). -.. option:: -p pool, --pool pool +.. option:: -p pool-name, --pool pool-name Interact with the given pool. Required by most commands. @@ -46,7 +48,7 @@ Options Parameters ========== -.. option:: --image-format format +.. option:: --image-format format-id Specifies which object layout to use. The default is 1. @@ -59,18 +61,18 @@ Parameters support for cloning and is more easily extensible to allow more features in the future. -.. option:: --size size-in-mb +.. option:: --size size-in-M/G/T - Specifies the size (in megabytes) of the new rbd image. 
+ Specifies the size (in M/G/T) of the new rbd image. .. option:: --order bits Specifies the object size expressed as a number of bits, such that the object size is ``1 << order``. The default is 22 (4 MB). -.. option:: --stripe-unit size-in-bytes +.. option:: --stripe-unit size-in-B/K/M - Specifies the stripe unit size in bytes. See striping section (below) for more details. + Specifies the stripe unit size in B/K/M. See striping section (below) for more details. .. option:: --stripe-count num @@ -85,18 +87,18 @@ Parameters Specifies the username (without the ``client.`` prefix) to use with the map command. -.. option:: --keyfile filename - - Specifies a file containing the secret to use with the map command. - If not specified, ``client.admin`` will be used by default. - .. option:: --keyring filename Specifies a keyring file containing a secret for the specified user to use with the map command. If not specified, the default keyring locations will be searched. -.. option:: --shared tag +.. option:: --keyfile filename + + Specifies a file containing the secret key of ``--id user`` to use with the map command. + This option is overridden by ``--keyring`` if the latter is also specified. + +.. option:: --shared lock-tag Option for `lock add` that allows multiple clients to lock the same image if they use the same tag. The tag is an arbitrary @@ -123,35 +125,71 @@ Parameters Map the image read-only. Equivalent to -o ro. +.. option:: --image-feature feature-name + + Specifies which RBD format 2 feature should be enabled when creating + an image. Multiple features can be enabled by repeating this option + multiple times. The following features are supported: + + * layering: layering support + * striping: striping v2 support + * exclusive-lock: exclusive locking support + * object-map: object map support (requires exclusive-lock) + * fast-diff: fast diff calculations (requires object-map) + * deep-flatten: snapshot flatten support + +.. option:: --image-shared + + Specifies that the image will be used concurrently by multiple clients. + This will disable features that are dependent upon exclusive ownership + of the image. + +.. option:: --whole-object + + Specifies that the diff should be limited to the extents of a full object + instead of showing intra-object deltas. When the object map feature is + enabled on an image, limiting the diff to the object extents will + dramatically improve performance since the differences can be computed + by examining the in-memory object map instead of querying RADOS for each + object within the image. Commands ======== .. TODO rst "option" directive seems to require --foo style options, parsing breaks on subcommands.. the args show up as bold too -:command:`ls` [-l | --long] [pool-name] +:command:`ls` [-l | --long] [*pool-name*] Will list all rbd images listed in the rbd_directory object. With -l, also show snapshots, and use longer-format output including size, parent (if clone), format, etc. -:command:`info` [*image-name*] +:command:`du` [-p | --pool *pool-name*] [*image-spec* | *snap-spec*] + Will calculate the provisioned and actual disk usage of all images and + associated snapshots within the specified pool. It can also be used against + individual images and snapshots. + + If the RBD fast-diff feature isn't enabled on images, this operation will + require querying the OSDs for every potential object within the image. + +:command:`info` *image-spec* | *snap-spec* Will dump information (such as size and order) about a specific rbd image. 
If image is a clone, information about its parent is also displayed. If a snapshot is specified, whether it is protected is shown as well. -:command:`create` [*image-name*] +:command:`create` (-s | --size *size-in-M/G/T*) [--image-format *format-id*] [--order *bits*] [--stripe-unit *size-in-B/K/M* --stripe-count *num*] [--image-feature *feature-name*]... [--image-shared] *image-spec* Will create a new rbd image. You must also specify the size via --size. The --stripe-unit and --stripe-count arguments are optional, but must be used together. -:command:`clone` [*parent-snapname*] [*image-name*] +:command:`clone` [--order *bits*] [--stripe-unit *size-in-B/K/M* --stripe-count *num*] [--image-feature *feature-name*] [--image-shared] *parent-snap-spec* *child-image-spec* Will create a clone (copy-on-write child) of the parent snapshot. Object order will be identical to that of the parent image unless - specified. Size will be the same as the parent snapshot. + specified. Size will be the same as the parent snapshot. The --stripe-unit + and --stripe-count arguments are optional, but must be used together. The parent snapshot must be protected (see `rbd snap protect`). This requires image format 2. -:command:`flatten` [*image-name*] +:command:`flatten` *image-spec* If image is a clone, copy all shared blocks from the parent snapshot and make the child independent of the parent, severing the link between parent snap and child. The parent snapshot can be unprotected and @@ -159,30 +197,33 @@ Commands This requires image format 2. -:command:`children` [*image-name*] +:command:`children` *snap-spec* List the clones of the image at the given snapshot. This checks every pool, and outputs the resulting poolname/imagename. This requires image format 2. -:command:`resize` [*image-name*] [--allow-shrink] +:command:`resize` (-s | --size *size-in-M/G/T*) [--allow-shrink] *image-spec* Resizes rbd image. The size parameter also needs to be specified. The --allow-shrink option lets the size be reduced. -:command:`rm` [*image-name*] +:command:`rm` *image-spec* Deletes an rbd image (including all data blocks). If the image has snapshots, this fails and nothing is deleted. -:command:`export` [*image-name*] [*dest-path*] +:command:`export` (*image-spec* | *snap-spec*) [*dest-path*] Exports image to dest path (use - for stdout). -:command:`import` [*path*] [*dest-image*] +:command:`import` [--image-format *format-id*] [--order *bits*] [--stripe-unit *size-in-B/K/M* --stripe-count *num*] [--image-feature *feature-name*]... [--image-shared] *src-path* [*image-spec*] Creates a new image and imports its data from path (use - for stdin). The import operation will try to create sparse rbd images if possible. For import from stdin, the sparsification unit is the data block size of the destination image (1 << order). -:command:`export-diff` [*image-name*] [*dest-path*] [--from-snap *snapname*] + The --stripe-unit and --stripe-count arguments are optional, but must be + used together. + +:command:`export-diff` [--from-snap *snap-name*] [--whole-object] (*image-spec* | *snap-spec*) *dest-path* Exports an incremental diff for an image to dest path (use - for stdout). If an initial snapshot is specified, only changes since that snapshot are included; otherwise, any regions of the image that contain data are included. The end snapshot is specified @@ -190,42 +231,67 @@ Commands metadata about image size changes, and the start and end snapshots. It efficiently represents discarded or 'zero' regions of the image. 
-:command:`import-diff` [*src-path*] [*image-name*] +:command:`merge-diff` *first-diff-path* *second-diff-path* *merged-diff-path* + Merge two continuous incremental diffs of an image into one single diff. The + first diff's end snapshot must be equal to the second diff's start snapshot. + The first diff could be - for stdin, and merged diff could be - for stdout, which + enables multiple diff files to be merged using something like + 'rbd merge-diff first second - | rbd merge-diff - third result'. Note that this command + currently only supports the source incremental diff with stripe_count == 1. + +:command:`import-diff` *src-path* *image-spec* Imports an incremental diff of an image and applies it to the current image. If the diff was generated relative to a start snapshot, we verify that snapshot already exists before continuing. If there was an end snapshot we verify it does not already exist before applying the changes, and create the snapshot when we are done. -:command:`diff` [*image-name*] [--from-snap *snapname*] +:command:`diff` [--from-snap *snap-name*] [--whole-object] *image-spec* | *snap-spec* Dump a list of byte extents in the image that have changed since the specified start snapshot, or since the image was created. Each output line includes the starting offset (in bytes), the length of the region (in bytes), and either 'zero' or 'data' to indicate whether the region is known to be zeros or may contain other data. -:command:`cp` [*src-image*] [*dest-image*] +:command:`cp` (*src-image-spec* | *src-snap-spec*) *dest-image-spec* Copies the content of a src-image into the newly created dest-image. dest-image will have the same size, order, and image format as src-image. -:command:`mv` [*src-image*] [*dest-image*] +:command:`mv` *src-image-spec* *dest-image-spec* Renames an image. Note: rename across pools is not supported. -:command:`snap` ls [*image-name*] +:command:`image-meta list` *image-spec* + Show metadata held on the image. The first column is the key + and the second column is the value. + +:command:`image-meta get` *image-spec* *key* + Get metadata value with the key. + +:command:`image-meta set` *image-spec* *key* *value* + Set metadata key with the value. They will be displayed in `image-meta list`. + +:command:`image-meta remove` *image-spec* *key* + Remove metadata key with the value. + +:command:`object-map rebuild` *image-spec* | *snap-spec* + Rebuilds an invalid object map for the specified image. An image snapshot can be + specified to rebuild an invalid object map for a snapshot. + +:command:`snap ls` *image-spec* Dumps the list of snapshots inside a specific image. -:command:`snap` create [*image-name*] +:command:`snap create` *snap-spec* Creates a new snapshot. Requires the snapshot name parameter to be specified. -:command:`snap` rollback [*image-name*] +:command:`snap rollback` *snap-spec* Rollback image content to snapshot. This will iterate through the entire blocks array and update the data head content to the snapshotted version. -:command:`snap` rm [*image-name*] +:command:`snap rm` *snap-spec* Removes the specified snapshot. -:command:`snap` purge [*image-name*] +:command:`snap purge` *image-spec* Removes all snapshots from an image. -:command:`snap` protect [*image-name*] +:command:`snap protect` *snap-spec* Protect a snapshot from deletion, so that clones can be made of it (see `rbd clone`).
Snapshots must be protected before clones are made; protection implies that there exist dependent cloned children that @@ -234,27 +300,38 @@ Commands This requires image format 2. -:command:`snap` unprotect [*image-name*] +:command:`snap unprotect` *snap-spec* Unprotect a snapshot from deletion (undo `snap protect`). If cloned children remain, `snap unprotect` fails. (Note that clones may exist in different pools than the parent snapshot.) This requires image format 2. -:command:`map` [*image-name*] [-o | --options *map-options* ] [--read-only] +:command:`map` [-o | --options *map-options* ] [--read-only] *image-spec* | *snap-spec* Maps the specified image to a block device via the rbd kernel module. -:command:`unmap` [*device-path*] +:command:`unmap` *image-spec* | *snap-spec* | *device-path* Unmaps the block device that was mapped via the rbd kernel module. :command:`showmapped` Show the rbd images that are mapped via the rbd kernel module. -:command:`lock` list [*image-name*] +:command:`status` *image-spec* + Show the status of the image, including which clients have it open. + +:command:`feature disable` *image-spec* *feature-name*... + Disables the specified feature on the specified image. Multiple features can + be specified. + +:command:`feature enable` *image-spec* *feature-name*... + Enables the specified feature on the specified image. Multiple features can + be specified. + +:command:`lock list` *image-spec* Show locks held on the image. The first column is the locker to use with the `lock remove` command. -:command:`lock` add [*image-name*] [*lock-id*] +:command:`lock add` [--shared *lock-tag*] *image-spec* *lock-id* Lock an image. The lock-id is an arbitrary name for the user's convenience. By default, this is an exclusive lock, meaning it will fail if the image is already locked. The --shared option @@ -262,26 +339,26 @@ Commands any operation other than adding a lock. It does not protect an image from being deleted. -:command:`lock` remove [*image-name*] [*lock-id*] [*locker*] +:command:`lock remove` *image-spec* *lock-id* *locker* Release a lock on an image. The lock id and locker are as output by lock ls. -:command:`bench-write` [*image-name*] --io-size [*io-size-in-bytes*] --io-threads [*num-ios-in-flight*] --io-total [*total-bytes-to-write*] - Generate a series of sequential writes to the image and measure the - write throughput and latency. Defaults are: --io-size 4096, --io-threads 16, - --io-total 1GB +:command:`bench-write` [--io-size *size-in-B/K/M/G/T*] [--io-threads *num-ios-in-flight*] [--io-total *total-size-to-write-in-B/K/M/G/T*] [--io-pattern seq | rand] *image-spec* + Generate a series of writes to the image and measure the write throughput and + latency. Defaults are: --io-size 4096, --io-threads 16, --io-total 1G, + --io-pattern seq. -Image name -========== - -In addition to using the --pool and the --snap options, the image name can include both -the pool name and the snapshot name. The image name format is as follows:: +Image and snap specs +==================== - [pool/]image-name[@snap] +| *image-spec* is [*pool-name*]/*image-name* +| *snap-spec* is [*pool-name*]/*image-name*\ @\ *snap-name* -Thus an image name that contains a slash character ('/') requires specifying the pool -name explicitly. +The default for *pool-name* is "rbd". If an image name contains a slash +character ('/'), *pool-name* is required. +You may specify each name individually, using --pool, --image and --snap +options, but this is discouraged in favor of the above spec syntax. 
Striping ======== @@ -295,14 +372,17 @@ bottleneck when individual images get large or busy. The striping is controlled by three parameters: .. option:: order + The size of objects we stripe over is a power of two, specifically 2^[*order*] bytes. The default is 22, or 4 MB. .. option:: stripe_unit + Each [*stripe_unit*] contiguous bytes are stored adjacently in the same object, before we move on to the next object. .. option:: stripe_count + After we write [*stripe_unit*] bytes to [*stripe_count*] objects, we loop back to the initial object and write another stripe, until the object reaches its maximum size (as specified by [*order*]. At that point, we move on to the next [*stripe_count*] objects. @@ -319,6 +399,8 @@ Most of these options are useful mainly for debugging and benchmarking. The default values are set in the kernel and may therefore depend on the version of the running kernel. +libceph (per client instance) options: + * fsid=aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee - FSID that should be assumed by the client. @@ -332,24 +414,40 @@ the running kernel. * nocrc - Disable CRC32C checksumming for data writes. +* cephx_require_signatures - Require cephx message signing (since 3.19, + default). + +* nocephx_require_signatures - Don't require cephx message signing (since + 3.19). + +* tcp_nodelay - Disable Nagle's algorithm on client sockets (since 4.0, + default). + +* notcp_nodelay - Enable Nagle's algorithm on client sockets (since 4.0). + +* mount_timeout=x - A timeout on various steps in `rbd map` and `rbd unmap` + sequences (default is 60 seconds). In particular, since 4.2 this can be used + to ensure that `rbd unmap` eventually times out when there is no network + connection to a cluster. + * osdkeepalive=x - OSD keepalive timeout (default is 5 seconds). * osd_idle_ttl=x - OSD idle TTL (default is 60 seconds). +Mapping (per block device) options: + * rw - Map the image read-write (default). * ro - Map the image read-only. Equivalent to --read-only. +* queue_depth=x - queue depth (since 4.2, default is 128 requests). + Examples ======== To create a new rbd image that is 100 GB:: - rbd -p mypool create myimage --size 102400 - -or alternatively:: - rbd create mypool/myimage --size 102400 To use a non-default object size (8 MB):: @@ -387,13 +485,13 @@ To unmap an image:: To create an image and a clone from it:: rbd import --image-format 2 image mypool/parent - rbd snap create --snap snapname mypool/parent + rbd snap create mypool/parent@snap rbd snap protect mypool/parent@snap rbd clone mypool/parent@snap otherpool/child To create an image with a smaller stripe_unit (to better distribute small writes in some workloads):: - rbd -p mypool create myimage --size 102400 --stripe-unit 65536 --stripe-count 16 + rbd create mypool/myimage --size 102400 --stripe-unit 65536B --stripe-count 16 To change an image from one image format to another, export it and then import it as the desired image format:: @@ -413,7 +511,7 @@ To release a lock:: Availability ============ -**rbd** is part of the Ceph distributed storage system. Please refer to +**rbd** is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to the Ceph documentation at http://ceph.com/docs for more information. 
diff --git a/doc/rados/api/librados-intro.rst b/doc/rados/api/librados-intro.rst index c120ec9fe9072..e43e665d8e4b9 100644 --- a/doc/rados/api/librados-intro.rst +++ b/doc/rados/api/librados-intro.rst @@ -36,7 +36,7 @@ Step 1: Getting librados Your client application must bind with ``librados`` to connect to the Ceph Storage Cluster. You must install ``librados`` and any required packages to write applications that use ``librados``. The ``librados`` API is written in -C++, with additional bindings for C, Python and Java. +C++, with additional bindings for C, Python, Java and PHP. Getting librados for C/C++ @@ -50,7 +50,7 @@ distributions, execute the following:: To install ``librados`` development support files for C/C++ on RHEL/CentOS distributions, execute the following:: - sudo yum install ceph-devel + sudo yum install librados2-devel Once you install ``librados`` for developers, you can find the required headers for C/C++ under ``/usr/include/rados``. :: @@ -63,19 +63,19 @@ Getting librados for Python The ``rados.py`` modules provides ``librados`` support to Python applications. The ``librados-dev`` package for Debian/Ubuntu -and the ``ceph-devel`` package for RHEL/CentOS will install the -``python-ceph`` package for you. You may install ``python-ceph`` +and the ``librados2-devel`` package for RHEL/CentOS will install the +``python-rados`` package for you. You may install ``python-rados`` directly too. To install ``librados`` development support files for Python on Debian/Ubuntu distributions, execute the following:: - sudo apt-get install python-ceph + sudo apt-get install python-rados -To install ``librados`` development support files for C/C++ on RHEL/CentOS +To install ``librados`` development support files for Python on RHEL/CentOS distributions, execute the following:: - sudo yum install python-ceph + sudo yum install python-rados You can find the module under ``/usr/share/pyshared`` on Debian systems, or under ``/usr/lib/python*/site-packages`` on CentOS/RHEL systems. @@ -119,6 +119,36 @@ To build the documentation, execute the following:: ant docs +Getting librados for PHP +------------------------- + +To install the ``librados`` extension for PHP, you need to execute the following procedure: + +#. Install php-dev. For Debian/Ubuntu, execute:: + + sudo apt-get install php5-dev build-essential + + For CentOS/RHEL, execute:: + + sudo yum install php-devel + +#. Clone the ``phprados`` repository:: + + git clone https://github.com/ceph/phprados.git + +#. Build ``phprados``:: + + cd phprados + phpize + ./configure + make + sudo make install + +#. Enable ``phprados`` in php.ini by adding:: + + extension=rados.so + + Step 2: Configuring a Cluster Handle ==================================== @@ -456,6 +486,29 @@ to specify the classpath. For example:: java CephClient +PHP Example +------------ + +With the RADOS extension enabled in PHP you can start creating a new cluster handle very easily: + +.. 
code-block:: php + + { ruleset - type [ replicated | raid4 ] + type [ replicated | erasure ] min_size max_size step take @@ -538,7 +534,7 @@ A rule takes the following form:: :Type: String :Required: Yes :Default: ``replicated`` -:Valid Values: Currently only ``replicated`` +:Valid Values: Currently only ``replicated`` and ``erasure`` ``min_size`` diff --git a/doc/rados/operations/erasure-code-isa.rst b/doc/rados/operations/erasure-code-isa.rst index 241c7faeef9c3..9d4ff7f52f0aa 100644 --- a/doc/rados/operations/erasure-code-isa.rst +++ b/doc/rados/operations/erasure-code-isa.rst @@ -2,14 +2,14 @@ ISA erasure code plugin ======================= -The *isa* plugin is encapsulates the `ISA +The *isa* plugin encapsulates the `ISA `_ library. It only runs on Intel processors. Create an isa profile ===================== -To create a new *jerasure* erasure code profile:: +To create a new *isa* erasure code profile:: ceph osd erasure-code-profile set {name} \ plugin=isa \ diff --git a/doc/rados/operations/erasure-code-lrc.rst b/doc/rados/operations/erasure-code-lrc.rst index 112abfa868b81..3c9b269185a3a 100644 --- a/doc/rados/operations/erasure-code-lrc.rst +++ b/doc/rados/operations/erasure-code-lrc.rst @@ -168,7 +168,7 @@ For instance, when three coding steps are described as:: where *c* are coding chunks calculated from the data chunks *D*, the loss of chunk *7* can be recovered with the last four chunks. And the -loss of chun *2* chunk can be recovered with the first four +loss of chunk *2* chunk can be recovered with the first four chunks. Erasure code profile examples using low level configuration @@ -272,7 +272,7 @@ are applied in order. For instance, if a 4K object is encoded, it will first go thru *step 1* and be divided in four 1K chunks (the four uppercase D). They are stored in the chunks 2, 3, 6 and 7, in order. From these, two coding chunks are calculated (the two lowercase -c). The coding chunks are stored in the chunks 1 and 4, respectively. +c). The coding chunks are stored in the chunks 1 and 5, respectively. The *step 2* re-uses the content created by *step 1* in a similar fashion and stores a single coding chunk *c* at position 0. The last four @@ -300,7 +300,7 @@ The coding chunk from *step 2*, stored in chunk *0*, allows it to recover the content of chunk *2*. There are no more chunks to recover and the process stops, without considering *step 1*. -Recovering chunk *2* required reading chunks *0, 1, 3* and writing +Recovering chunk *2* requires reading chunks *0, 1, 3* and writing back chunk *2*. If chunk *2, 3, 6* are lost:: @@ -311,7 +311,7 @@ If chunk *2, 3, 6* are lost:: step 2 cD __ _ step 3 __ cD D -The *step 3* can recover the conten of chunk *6*:: +The *step 3* can recover the content of chunk *6*:: chunk nr 01234567 @@ -355,6 +355,6 @@ For instance:: will create a ruleset that will select two crush buckets of type *rack* and for each of them choose four OSDs, each of them located in -different bucket of type *host*. +different buckets of type *host*. The ruleset can also be manually crafted for finer control. diff --git a/doc/rados/operations/erasure-code-profile.rst b/doc/rados/operations/erasure-code-profile.rst index fdc6e57e97557..3262b3db8dcb0 100644 --- a/doc/rados/operations/erasure-code-profile.rst +++ b/doc/rados/operations/erasure-code-profile.rst @@ -29,6 +29,7 @@ same time. 
erasure-code-jerasure erasure-code-isa erasure-code-lrc + erasure-code-shec osd erasure-code-profile set ============================ diff --git a/doc/rados/operations/erasure-code-shec.rst b/doc/rados/operations/erasure-code-shec.rst new file mode 100644 index 0000000000000..2d71ad7a49ac3 --- /dev/null +++ b/doc/rados/operations/erasure-code-shec.rst @@ -0,0 +1,133 @@ +======================== +SHEC erasure code plugin +======================== + +The *shec* plugin encapsulates the `multiple SHEC +`_ +library. It allows Ceph to recover data more efficiently than Reed-Solomon codes. + +Create an SHEC profile +====================== + +To create a new *shec* erasure code profile:: + + ceph osd erasure-code-profile set {name} \ + plugin=shec \ + [k={data-chunks}] \ + [m={coding-chunks}] \ + [c={durability-estimator}] \ + [ruleset-root={root}] \ + [ruleset-failure-domain={bucket-type}] \ + [directory={directory}] \ + [--force] + +Where: + +``k={data-chunks}`` + +:Description: Each object is split into **data-chunks** parts, + each stored on a different OSD. + +:Type: Integer +:Required: No. +:Default: 4 + +``m={coding-chunks}`` + +:Description: Compute **coding-chunks** for each object and store them on + different OSDs. The number of **coding-chunks** does not necessarily + equal the number of OSDs that can be down without losing data. + +:Type: Integer +:Required: No. +:Default: 3 + +``c={durability-estimator}`` + +:Description: The number of parity chunks each of which includes each data chunk in its + calculation range. The number is used as a **durability estimator**. + For instance, if c=2, 2 OSDs can be down without losing data. + +:Type: Integer +:Required: No. +:Default: 2 + +``ruleset-root={root}`` + +:Description: The name of the crush bucket used for the first step of + the ruleset. For instance **step take default**. + +:Type: String +:Required: No. +:Default: default + +``ruleset-failure-domain={bucket-type}`` + +:Description: Ensure that no two chunks are in a bucket with the same + failure domain. For instance, if the failure domain is + **host** no two chunks will be stored on the same + host. It is used to create a ruleset step such as **step + chooseleaf host**. + +:Type: String +:Required: No. +:Default: host + +``directory={directory}`` + +:Description: Set the **directory** name from which the erasure code + plugin is loaded. + +:Type: String +:Required: No. +:Default: /usr/lib/ceph/erasure-code + +``--force`` + +:Description: Override an existing profile by the same name. + +:Type: String +:Required: No. + +Brief description of SHEC's layouts +=================================== + +Space Efficiency +---------------- + +Space efficiency is the ratio of data chunks to all chunks in an object and +is represented as k/(k+m). +In order to improve space efficiency, you should increase k or decrease m. + +:: + + space efficiency of SHEC(4,3,2) = 4/(4+3) = 0.57 + SHEC(5,3,2) or SHEC(4,2,2) improves SHEC(4,3,2)'s space efficiency + +Durability +---------- + +The third parameter of SHEC (=c) is a durability estimator, which approximates +the number of OSDs that can be down without losing data. + +``durability estimator of SHEC(4,3,2) = 2`` + +Recovery Efficiency +------------------- + +Describing the calculation of recovery efficiency is beyond the scope of this document, +but at least increasing m without increasing c improves recovery efficiency. +(However, we must pay attention to the sacrifice of space efficiency in this case.)
+ +``SHEC(4,2,2) -> SHEC(4,3,2) : achieves improvement of recovery efficiency`` + +Erasure code profile examples +============================= + +:: + + $ ceph osd erasure-code-profile set SHECprofile \ plugin=shec \ k=8 m=4 c=3 \ ruleset-failure-domain=host + $ ceph osd pool create shecpool 256 256 erasure SHECprofile diff --git a/doc/rados/operations/erasure-code.rst b/doc/rados/operations/erasure-code.rst index 512e1be1ea3d5..9925e21b2f1fb 100644 --- a/doc/rados/operations/erasure-code.rst +++ b/doc/rados/operations/erasure-code.rst @@ -55,7 +55,7 @@ the following profile can be defined:: k=3 \ m=2 \ ruleset-failure-domain=rack - $ ceph osd pool create ecpool 12 12 erasure *myprofile* + $ ceph osd pool create ecpool 12 12 erasure myprofile $ echo ABCDEFGHI | rados --pool ecpool put NYAN - $ rados --pool ecpool get NYAN - ABCDEFGHI @@ -137,7 +137,7 @@ because it requires partial writes. It is however possible to create an RBD image on an erasure coded pools when a replicated pool tier set a cache tier:: - $ rbd --pool ecpool create --size 10 myvolume + $ rbd create --size 10G ecpool/myvolume More information can be found in the `cache tiering <../cache-tiering>`_ documentation. @@ -171,3 +171,4 @@ Table of content erasure-code-jerasure erasure-code-isa erasure-code-lrc + erasure-code-shec diff --git a/doc/rados/operations/monitoring-osd-pg.rst b/doc/rados/operations/monitoring-osd-pg.rst index ccb9adea808f5..686b27bd6e80a 100644 --- a/doc/rados/operations/monitoring-osd-pg.rst +++ b/doc/rados/operations/monitoring-osd-pg.rst @@ -552,7 +552,7 @@ include: To identify stuck placement groups, execute the following:: - ceph pg dump_stuck [unclean|inactive|stale] + ceph pg dump_stuck [unclean|inactive|stale|undersized|degraded] See `Placement Group Subsystem`_ for additional details. To troubleshoot stuck placement groups, see `Troubleshooting PG Errors`_. diff --git a/doc/rados/operations/pg-states.rst b/doc/rados/operations/pg-states.rst index 06f67eb814782..8da73bc6ef7d1 100644 --- a/doc/rados/operations/pg-states.rst +++ b/doc/rados/operations/pg-states.rst @@ -57,9 +57,11 @@ map is ``active + clean``. full ratio. *Incomplete* - Ceph detects that a placement group is missing a necessary period of history - from its log. If you see this state, report a bug, and try to start any - failed OSDs that may contain the needed information. + Ceph detects that a placement group is missing information about + writes that may have occurred, or does not have any healthy + copies. If you see this state, try to start any failed OSDs that may + contain the needed information or temporarily adjust min_size to + allow recovery. *Stale* The placement group is in an unknown state - the monitors have not received @@ -68,3 +70,11 @@ map is ``active + clean``. *Remapped* The placement group is temporarily mapped to a different set of OSDs from what CRUSH specified. + +*Undersized* + The placement group has fewer copies than the configured pool replication level. + +*Peered* + The placement group has peered, but cannot serve client IO due to not having + enough copies to reach the pool's configured min_size parameter. Recovery + may occur in this state, so the pg may heal up to min_size eventually.
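As a usage sketch connecting these states to the monitoring command extended above (output will vary from cluster to cluster)::

    # list placement groups stuck in the newly documented states
    ceph pg dump_stuck undersized
    ceph pg dump_stuck degraded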
diff --git a/doc/rados/operations/placement-groups.rst b/doc/rados/operations/placement-groups.rst index 41ffc911390ac..b408f9c8bd974 100644 --- a/doc/rados/operations/placement-groups.rst +++ b/doc/rados/operations/placement-groups.rst @@ -9,7 +9,7 @@ A preselection of pg_num When creating a new pool with:: - ceph osd pool set {pool-name} pg_num + ceph osd pool create {pool-name} pg_num it is mandatory to choose the value of ``pg_num`` because it cannot be calculated automatically. Here are a few values commonly used: @@ -58,11 +58,10 @@ cannot realistically track placement on a per-object basis. | | +-----------------------+ -Placement groups are invisible to the Ceph user: the CRUSH algorithm -determines in which placement group the object will be -placed. Although CRUSH is a deterministic function using the object -name as a parameter, there is no way to force an object into a given -placement group. +The Ceph client will calculate which placement group an object should +be in. It does this by hashing the object ID and applying an operation +based on the number of PGs in the defined pool and the ID of the pool. +See `Mapping PGs to OSDs`_ for details. The object's contents within a placement group are stored in a set of OSDs. For instance, in a replicated pool of size two, each placement @@ -178,19 +177,19 @@ increased. No matter how short the recovery time is, there is a chance for a second OSD to fail while it is in progress. In the 10 OSDs cluster -described above, if any of them fail, then ~8 placement groups -(i.e. ~75 / 9 placement groups being recovered) will only have one +described above, if any of them fail, then ~17 placement groups +(i.e. ~150 / 9 placement groups being recovered) will only have one surviving copy. And if any of the 8 remaining OSD fail, the last -objects of one placement group are likely to be lost (i.e. ~8 / 8 +objects of two placement groups are likely to be lost (i.e. ~17 / 8 placement groups with only one remaining copy being recovered). When the size of the cluster grows to 20 OSDs, the number of Placement Groups damaged by the loss of three OSDs drops. The second OSD lost -will degrade ~2 (i.e. ~35 / 19 placement groups being recovered) -instead of ~8 and the third OSD lost will only lose data if it is one -of the two OSDs containing the surviving copy. In other words, if the +will degrade ~4 (i.e. ~75 / 19 placement groups being recovered) +instead of ~17 and the third OSD lost will only lose data if it is one +of the four OSDs containing the surviving copy. In other words, if the probability of losing one OSD is 0.0001% during the recovery time -frame, it goes from 8 * 0.0001% in the cluster with 10 OSDs to 2 * +frame, it goes from 17 * 10 * 0.0001% in the cluster with 10 OSDs to 4 * 20 * 0.0001% in the cluster with 20 OSDs. In a nutshell, more OSDs mean faster recovery and a lower risk of @@ -310,10 +309,14 @@ placement groups, execute the following:: ceph osd pool set {pool-name} pg_num {pg_num} Once you increase the number of placement groups, you must also -increase the number of placement groups for placement (``pgp_num``) before your -cluster will rebalance. The ``pgp_num`` should be equal to the ``pg_num``. -To increase the number of placement groups for placement, execute the -following:: +increase the number of placement groups for placement (``pgp_num``) +before your cluster will rebalance. The ``pgp_num`` will be the number of +placement groups that will be considered for placement by the CRUSH +algorithm. 
Increasing ``pg_num`` splits the placement groups but data +will not be migrated to the newer placement groups until placement +groups for placement, i.e. ``pgp_num`` is increased. The ``pgp_num`` +should be equal to the ``pg_num``. To increase the number of +placement groups for placement, execute the following:: ceph osd pool set {pool-name} pgp_num {pgp_num} @@ -342,7 +345,7 @@ Get Statistics for Stuck PGs To get the statistics for all placement groups stuck in a specified state, execute the following:: - ceph pg dump_stuck inactive|unclean|stale [--format ] [-t|--threshold ] + ceph pg dump_stuck inactive|unclean|stale|undersized|degraded [--format ] [-t|--threshold ] **Inactive** Placement groups cannot process reads or writes because they are waiting for an OSD with the most up-to-date data to come up and in. @@ -427,3 +430,4 @@ entirely. To mark the "unfound" objects as "lost", execute the following:: .. _Create a Pool: ../pools#createpool +.. _Mapping PGs to OSDs: ../../../architecture#mapping-pgs-to-osds diff --git a/doc/rados/operations/pools.rst b/doc/rados/operations/pools.rst index efa75e95fadf1..6f021b7f99e4a 100644 --- a/doc/rados/operations/pools.rst +++ b/doc/rados/operations/pools.rst @@ -52,7 +52,9 @@ Create a Pool Before creating pools, refer to the `Pool, PG and CRUSH Config Reference`_. Ideally, you should override the default value for the number of placement -groups in you Ceph configuration file, as the default is NOT ideal. +groups in your Ceph configuration file, as the default is NOT ideal. +For details on placement group numbers refer to `setting the number of placement groups`_ + For example:: osd pool default pg num = 100 @@ -71,7 +73,7 @@ Where: :Description: The name of the pool. It must be unique. :Type: String -:Required: Yes. Picks up default or Ceph configuration value if not specified. +:Required: Yes. ``{pg-num}`` @@ -110,15 +112,17 @@ Where: ``[crush-ruleset-name]`` -:Description: The name of the crush ruleset for this pool. If specified ruleset - doesn't exist, the creation of **replicated** pool will fail with - -ENOENT. But **replicated** pool will create a new erasure - ruleset with specified name. +:Description: The name of a CRUSH ruleset to use for this pool. The specified + ruleset must exist. :Type: String :Required: No. -:Default: "erasure-code" for **erasure pool**. Pick up Ceph configuraion variable - **osd_pool_default_crush_replicated_ruleset** for **replicated** pool. +:Default: For **replicated** pools it is the ruleset specified by the ``osd + pool default crush replicated ruleset`` config variable. This + ruleset must exist. + For **erasure** pools it is ``erasure-code`` if the ``default`` + `erasure code profile`_ is used or ``{pool-name}`` otherwise. This + ruleset will be created implicitly if it doesn't exist already. ``[erasure-code-profile=profile]`` @@ -274,6 +278,30 @@ You may set values for the following keys: :Version: Version ``0.48`` Argonaut and above. +``nodelete`` + +:Description: Set/Unset NODELETE flag on a given pool. +:Type: Integer +:Valid Range: 1 sets flag, 0 unsets flag +:Version: Version ``FIXME`` + + +``nopgchange`` + +:Description: Set/Unset NOPGCHANGE flag on a given pool. +:Type: Integer +:Valid Range: 1 sets flag, 0 unsets flag +:Version: Version ``FIXME`` + + +``nosizechange`` + +:Description: Set/Unset NOSIZECHANGE flag on a given pool. +:Type: Integer +:Valid Range: 1 sets flag, 0 unsets flag +:Version: Version ``FIXME`` + + ``hit_set_type`` :Description: Enables hit set tracking for cache pools.
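+
+For example, to protect a hypothetical pool named ``data`` against accidental
+deletion and PG-count changes with the flags described above, and to lift the
+protection again later::
+
+    ceph osd pool set data nodelete 1
+    ceph osd pool set data nopgchange 1
+    ceph osd pool set data nodelete 0
+    ceph osd pool set data nopgchange 0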
@@ -322,6 +350,16 @@ You may set values for the following keys: +``cache_target_dirty_high_ratio`` + +:Description: The percentage of the cache pool containing modified (dirty) + objects before the cache tiering agent will flush them to the + backing storage pool at a higher speed. + +:Type: Double +:Default: ``.6`` + + ``cache_target_full_ratio`` :Description: The percentage of the cache pool containing unmodified (clean) @@ -460,6 +498,15 @@ You may get values for the following keys: :Type: Double +``cache_target_dirty_high_ratio`` + +:Description: The percentage of the cache pool containing modified (dirty) + objects before the cache tiering agent will flush them to the + backing storage pool at a higher speed. + +:Type: Double + + ``cache_target_full_ratio`` :Description: The percentage of the cache pool containing unmodified (clean) @@ -542,3 +589,4 @@ a size of 3). .. _Pool, PG and CRUSH Config Reference: ../../configuration/pool-pg-config-ref .. _Bloom Filter: http://en.wikipedia.org/wiki/Bloom_filter +.. _setting the number of placement groups: ../placement-groups#set-the-number-of-placement-groups diff --git a/doc/rados/operations/user-management.rst b/doc/rados/operations/user-management.rst index 9152deafd4528..4257dc5ed2956 100644 --- a/doc/rados/operations/user-management.rst +++ b/doc/rados/operations/user-management.rst @@ -189,7 +189,7 @@ The following entries describe each capability. bootstrapping an OSD. -``profile bootstrap-osd`` +``profile bootstrap-mds`` :Description: Gives a user permissions to bootstrap a metadata server. Conferred on deployment tools such as ``ceph-deploy``, etc. @@ -601,7 +601,7 @@ Ceph supports the following usage for user name and secret: preferred approach, because you can switch user names without switching the keyring path. For example:: - sudo rbd map foo --pool rbd myimage --id client.foo --keyring /path/to/keyring + sudo rbd map --id foo --keyring /path/to/keyring mypool/myimage .. _pools: ../pools @@ -641,7 +641,7 @@ authentication issues more fully. At the moment, none of the Ceph authentication protocols provide secrecy for messages in transit. Thus, an eavesdropper on the wire can hear and understand -all data sent between clients and servers in Ceph, even if he cannot create or +all data sent between clients and servers in Ceph, even if it cannot create or alter them. Further, Ceph does not include options to encrypt user data in the object store. Users can hand-encrypt and store their own data in the Ceph object store, of course, but Ceph provides no features to perform object @@ -650,4 +650,4 @@ encrypting their data before providing it to the Ceph system. .. _Architecture - High Availability Authentication: ../../../architecture#high-availability-authentication -.. _Cephx Config Reference: ../../configuration/auth-config-ref \ No newline at end of file +.. _Cephx Config Reference: ../../configuration/auth-config-ref diff --git a/doc/rados/troubleshooting/community.rst b/doc/rados/troubleshooting/community.rst index df51f0221ec6c..9faad131076e5 100644 --- a/doc/rados/troubleshooting/community.rst +++ b/doc/rados/troubleshooting/community.rst @@ -7,9 +7,6 @@ operational issues with Ceph releases we recommend you `subscribe to the ceph-users email list`_. When you no longer want to receive emails, you can `unsubscribe from the ceph-users email list`_. -If you have read through this guide and you have contacted ``ceph-users``, -but you haven't resolved your issue, you may contact `Inktank`_ for support.
- -You may also `subscribe to the ceph-devel email list`_. You should do so if your issue is: @@ -23,12 +20,10 @@ may `unsubscribe from the ceph-devel email list`_. .. tip:: The Ceph community is growing rapidly, and community members can help you if you provide them with detailed information about your problem. You - can attach your ceph configuration file, log files, CRUSH map, and other - details (e.g., ``ceph osd tree``) to help people understand your issues. + can attach the output of the ``ceph report`` command to help people understand your issues. .. _subscribe to the ceph-devel email list: mailto:majordomo@vger.kernel.org?body=subscribe+ceph-devel .. _unsubscribe from the ceph-devel email list: mailto:majordomo@vger.kernel.org?body=unsubscribe+ceph-devel .. _subscribe to the ceph-users email list: mailto:ceph-users-join@lists.ceph.com .. _unsubscribe from the ceph-users email list: mailto:ceph-users-leave@lists.ceph.com -.. _ceph-devel: ceph-devel@vger.kernel.org -.. _Inktank: http://inktank.com \ No newline at end of file +.. _ceph-devel: ceph-devel@vger.kernel.org \ No newline at end of file diff --git a/doc/rados/troubleshooting/log-and-debug.rst b/doc/rados/troubleshooting/log-and-debug.rst index 4a332a9001dd6..1b9317a3c7b77 100644 --- a/doc/rados/troubleshooting/log-and-debug.rst +++ b/doc/rados/troubleshooting/log-and-debug.rst @@ -44,7 +44,7 @@ To activate Ceph's debugging output (*i.e.*, ``dout()``) at runtime, use the Replace ``{daemon-type}`` with one of ``osd``, ``mon`` or ``mds``. You may apply the runtime setting to all daemons of a particular type with ``*``, or specify -a specific daemon's ID (i.e., its number or letter). For example, to increase +a specific daemon's ID. For example, to increase debug logging for a ``ceph-osd`` daemon named ``osd.0``, execute the following:: ceph tell osd.0 injectargs --debug-osd 0/5 @@ -147,7 +147,11 @@ Each subsystem has a logging level for its output logs, and for its logs in-memory. You may set different values for each of these subsystems by setting a log file level and a memory level for debug logging. Ceph's logging levels operate on a scale of ``1`` to ``20``, where ``1`` is terse and ``20`` is -verbose. +verbose. In general, the logs in-memory are not sent to the output log unless: + +- a fatal signal is raised or +- an ``assert`` in source code is triggered or +- upon request. Please consult `document on admin socket `_ for more details. A debug logging setting can take a single value for the log level and the memory level, which sets them both as the same value. For example, if you diff --git a/doc/rados/troubleshooting/troubleshooting-mon.rst b/doc/rados/troubleshooting/troubleshooting-mon.rst index c753de3ca2494..dff673b39bddd 100644 --- a/doc/rados/troubleshooting/troubleshooting-mon.rst +++ b/doc/rados/troubleshooting/troubleshooting-mon.rst @@ -31,7 +31,7 @@ Initial Troubleshooting the server and, if that succeeds, try connecting to the monitor's port using you tool of choice (telnet, nc,...). -**Does ``ceph -s`` run and obtain a reply from the cluster?** +**Does ceph -s run and obtain a reply from the cluster?** If the answer is yes then your cluster is up and running. One thing you can take for granted is that the monitors will only answer to a ``status`` @@ -43,7 +43,7 @@ enough to form a quorum (keep in mind that a quorum if formed by a majority of monitors).
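+
+You can also ask a specific monitor for its view of the quorum over its admin
+socket (a sketch; it assumes the default socket path and a monitor named
+``mon.a``)::
+
+    ceph --admin-daemon /var/run/ceph/ceph-mon.a.asok mon_status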
-**What if ``ceph -s`` doesn't finish?** +**What if ceph -s doesn't finish?** If you haven't gone through all the steps so far, please go back and do. @@ -378,9 +378,10 @@ like this appropriately:: You may also need to add rules to IP tables on your Ceph hosts to ensure that clients can access the ports associated with your Ceph monitors (i.e., port -6789 by default) and Ceph OSDs (i.e., 6800 et. seq. by default). For example:: +6789 by default) and Ceph OSDs (i.e., 6800 through 7300 by default). For +example:: - iptables -A INPUT -m multiport -p tcp -s {ip-address}/{netmask} --dports 6789,6800:6810 -j ACCEPT + iptables -A INPUT -m multiport -p tcp -s {ip-address}/{netmask} --dports 6789,6800:7300 -j ACCEPT Everything Failed! Now What? @@ -429,8 +430,8 @@ ask you to raise them or even define other debug subsystems to obtain infos from -- but at least we started off with some useful information, instead of a massively empty log without much to go on with. -Do I need to restart a monitor to adjust deebug levels? -------------------------------------------------------- +Do I need to restart a monitor to adjust debug levels? ------------------------------------------------------ No. You may do it in one of two ways: diff --git a/doc/rados/troubleshooting/troubleshooting-osd.rst b/doc/rados/troubleshooting/troubleshooting-osd.rst index 18460067f9af3..1cbdaeec8e29a 100644 --- a/doc/rados/troubleshooting/troubleshooting-osd.rst +++ b/doc/rados/troubleshooting/troubleshooting-osd.rst @@ -4,7 +4,7 @@ Before troubleshooting your OSDs, check your monitors and network first. If you execute ``ceph health`` or ``ceph -s`` on the command line and Ceph returns -a health status, the return of a status means that the monitors have a quorum. +a health status, it means that the monitors have a quorum. If you don't have a monitor quorum or if there are errors with the monitor status, `address the monitor issues first <../troubleshooting-mon>`_. Check your networks to ensure they diff --git a/doc/rados/troubleshooting/troubleshooting-pg.rst b/doc/rados/troubleshooting/troubleshooting-pg.rst index 3c4b4224061f4..0ec2be54a0814 100644 --- a/doc/rados/troubleshooting/troubleshooting-pg.rst +++ b/doc/rados/troubleshooting/troubleshooting-pg.rst @@ -254,12 +254,12 @@ data:: "status": "osd is down"}]}, In this case, for example, the cluster knows that ``osd.1`` might have -data, but it is ``down``. The full range of possible states include:: - * already probed - * querying - * OSD is down - * not queried (yet) +data, but it is ``down``. The full range of possible states includes: +* already probed +* querying +* OSD is down +* not queried (yet) Sometimes it simply takes some time for the cluster to query possible locations. @@ -359,6 +359,200 @@ monitor hosts to act as peers. See `The Network Time Protocol`_ and Ceph `Clock Settings`_ for additional details. +Erasure Coded PGs are not active+clean +====================================== + +When CRUSH fails to find enough OSDs to map to a PG, it will show as +``2147483647``, which is ITEM_NONE or ``no OSD found``. For instance:: + + [2,1,6,0,5,8,2147483647,7,4] + +Not enough OSDs +--------------- + +If the Ceph cluster only has 8 OSDs and the erasure coded pool needs +9, that is what it will show. You can either create another erasure +coded pool that requires fewer OSDs:: + + ceph osd erasure-code-profile set myprofile k=5 m=3 + ceph osd pool create erasurepool 16 16 erasure myprofile + +or add new OSDs and the PG will automatically use them.
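+
+To confirm that this is the problem, you can compare the number of OSDs the
+profile requires (k+m) with the number of OSDs the cluster actually has (a
+sketch; ``myprofile`` stands for the profile used by the pool)::
+
+    ceph osd erasure-code-profile get myprofile
+    ceph osd stat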
+ +CRUSH constraints cannot be satisfied +------------------------------------- + +If the cluster has enough OSDs, it is possible that the CRUSH ruleset +imposes constraints that cannot be satisfied. If there are 10 OSDs on +two hosts and the CRUSH rulesets require that no two OSDs from the +same host are used in the same PG, the mapping may fail because only +two OSDs will be found. You can check the constraint by displaying the +ruleset:: + + $ ceph osd crush rule ls + [ + "replicated_ruleset", + "erasurepool"] + $ ceph osd crush rule dump erasurepool + { "rule_id": 1, + "rule_name": "erasurepool", + "ruleset": 1, + "type": 3, + "min_size": 3, + "max_size": 20, + "steps": [ + { "op": "take", + "item": -1, + "item_name": "default"}, + { "op": "chooseleaf_indep", + "num": 0, + "type": "host"}, + { "op": "emit"}]} + + +You can resolve the problem by creating a new pool in which PGs are allowed +to have OSDs residing on the same host with:: + + ceph osd erasure-code-profile set myprofile ruleset-failure-domain=osd + ceph osd pool create erasurepool 16 16 erasure myprofile + +CRUSH gives up too soon +----------------------- + +If the Ceph cluster has just enough OSDs to map the PG (for instance a +cluster with a total of 9 OSDs and an erasure coded pool that requires +9 OSDs per PG), it is possible that CRUSH gives up before finding a +mapping. It can be resolved by: + +* lowering the erasure coded pool requirements to use fewer OSDs per PG + (that requires the creation of another pool as erasure code profiles + cannot be dynamically modified). + +* adding more OSDs to the cluster (that does not require the erasure + coded pool to be modified, it will become clean automatically) + +* using a hand-made CRUSH ruleset that tries more times to find a good + mapping. This can be done by setting ``set_choose_tries`` to a value + greater than the default. + +You should first verify the problem with ``crushtool`` after +extracting the crushmap from the cluster so your experiments do not +modify the Ceph cluster and only work on local files:: + + $ ceph osd crush rule dump erasurepool + { "rule_name": "erasurepool", + "ruleset": 1, + "type": 3, + "min_size": 3, + "max_size": 20, + "steps": [ + { "op": "take", + "item": -1, + "item_name": "default"}, + { "op": "chooseleaf_indep", + "num": 0, + "type": "host"}, + { "op": "emit"}]} + $ ceph osd getcrushmap > crush.map + got crush map from osdmap epoch 13 + $ crushtool -i crush.map --test --show-bad-mappings \ + --rule 1 \ + --num-rep 9 \ + --min-x 1 --max-x $((1024 * 1024)) + bad mapping rule 8 x 43 num_rep 9 result [3,2,7,1,2147483647,8,5,6,0] + bad mapping rule 8 x 79 num_rep 9 result [6,0,2,1,4,7,2147483647,5,8] + bad mapping rule 8 x 173 num_rep 9 result [0,4,6,8,2,1,3,7,2147483647] + +Where ``--num-rep`` is the number of OSDs the erasure code crush +ruleset needs, ``--rule`` is the value of the ``ruleset`` field +displayed by ``ceph osd crush rule dump``. The test will try mapping +one million values (i.e. the range defined by ``[--min-x,--max-x]``) +and must display at least one bad mapping. If it outputs nothing it +means all mappings are successful and you can stop right there: the +problem is elsewhere.
+ +The crush ruleset can be edited by decompiling the crush map:: + + $ crushtool --decompile crush.map > crush.txt + +and adding the following line to the ruleset:: + + step set_choose_tries 100 + +The relevant part of the ``crush.txt`` file should look something +like:: + + rule erasurepool { + ruleset 1 + type erasure + min_size 3 + max_size 20 + step set_chooseleaf_tries 5 + step set_choose_tries 100 + step take default + step chooseleaf indep 0 type host + step emit + } + +It can then be compiled and tested again:: + + $ crushtool --compile crush.txt -o better-crush.map + +When all mappings succeed, a histogram of the number of tries that +were necessary to find all of them can be displayed with the +``--show-choose-tries`` option of ``crushtool``:: + + $ crushtool -i better-crush.map --test --show-bad-mappings \ + --show-choose-tries \ + --rule 1 \ + --num-rep 9 \ + --min-x 1 --max-x $((1024 * 1024)) + ... + 11: 42 + 12: 44 + 13: 54 + 14: 45 + 15: 35 + 16: 34 + 17: 30 + 18: 25 + 19: 19 + 20: 22 + 21: 20 + 22: 17 + 23: 13 + 24: 16 + 25: 13 + 26: 11 + 27: 11 + 28: 13 + 29: 11 + 30: 10 + 31: 6 + 32: 5 + 33: 10 + 34: 3 + 35: 7 + 36: 5 + 37: 2 + 38: 5 + 39: 5 + 40: 2 + 41: 5 + 42: 4 + 43: 1 + 44: 2 + 45: 2 + 46: 3 + 47: 1 + 48: 0 + ... + 102: 0 + 103: 1 + 104: 0 + ... + +It took 11 tries to map 42 PGs, 12 tries to map 44 PGs etc. The highest number of tries is the minimum value of ``set_choose_tries`` that prevents bad mappings (i.e. 103 in the above output because it did not take more than 103 tries for any PG to be mapped). .. _check: ../../operations/placement-groups#get-the-number-of-placement-groups .. _here: ../../configuration/pool-pg-config-ref diff --git a/doc/radosgw/config-ref.rst b/doc/radosgw/config-ref.rst index c78919cfa8e71..7cac514562a7b 100644 --- a/doc/radosgw/config-ref.rst +++ b/doc/radosgw/config-ref.rst @@ -68,7 +68,7 @@ Ceph configuration file, the default value will be set automatically. ``rgw dns name`` -:Description: The DNS name of the served domain. +:Description: The DNS name of the served domain. See also the ``hostnames`` setting within regions. :Type: String :Default: None @@ -208,7 +208,7 @@ Ceph configuration file, the default value will be set automatically. :Default: ``false`` -``rgw object stripe size`` +``rgw obj stripe size`` :Description: The size of an object stripe for Ceph Object Gateway objects. See `Architecture`_ for details on striping. @@ -270,6 +270,17 @@ Ceph configuration file, the default value will be set automatically. :Default: ``1000`` +``rgw override bucket index max shards`` + +:Description: Represents the number of shards for the bucket index object; + a value of zero indicates there is no sharding. It is not + recommended to set a value too large (e.g. a thousand) as it + increases the cost for bucket listing. + +:Type: Integer +:Default: ``0`` + + ``rgw num zone opstate shards`` :Description: The maximum number of shards for keeping inter-region copy @@ -316,6 +327,12 @@ Ceph configuration file, the default value will be set automatically. :Default: ``admin`` +``rgw content length compat`` + +:Description: Enable compatibility handling of FCGI requests with both CONTENT_LENGTH and HTTP_CONTENT_LENGTH set. +:Type: Boolean +:Default: ``false`` + Regions ======= @@ -373,6 +390,7 @@ The ``default`` region looks like this: "api_name": "", "is_master": "true", "endpoints": [], + "hostnames": [], "master_zone": "", "zones": [ {"name": "default", @@ -403,6 +421,11 @@ required settings: escape the forward slashes (``\/``).
You may also specify a port (``fqdn:port``) for each endpoint. Optional. +#. ``hostnames``: A list of all the hostnames in the region. For example, + you may use multiple domain names to refer to the same region. Optional. + The ``rgw dns name`` setting will automatically be included in this list. + You should restart the ``radosgw`` daemon(s) after changing this setting. + #. ``master_zone``: The master zone for the region. Optional. Uses the default zone if not specified. **note:** You can only have one master zone per region. @@ -464,6 +487,7 @@ JSON object is an example of a default region map. "api_name": "", "is_master": "true", "endpoints": [], + "hostnames": [], "master_zone": "", "zones": [ { "name": "default", diff --git a/doc/radosgw/config.rst b/doc/radosgw/config.rst index 5f15c4c6d73d1..549a273885d00 100644 --- a/doc/radosgw/config.rst +++ b/doc/radosgw/config.rst @@ -38,9 +38,9 @@ the node containing the gateway instance. of placement groups first. See `Pool Configuration`_ for details. -See the `Cephx Guide`_ for additional details on Ceph authentication. +See `User Management`_ for additional details on Ceph authentication. -#. Create a keyring for the gateway. :: +#. Create a keyring for the gateway:: sudo ceph-authtool --create-keyring /etc/ceph/ceph.client.radosgw.keyring sudo chmod +r /etc/ceph/ceph.client.radosgw.keyring @@ -72,6 +72,7 @@ See the `Cephx Guide`_ for additional details on Ceph authentication. sudo mv ceph.client.radosgw.keyring /etc/ceph/ceph.client.radosgw.keyring + .. note:: The 5th step is optional if ``admin node`` is the ``gateway host``. Create Pools ============ @@ -82,17 +83,21 @@ will create the pools automatically. However, you should ensure that you have set an appropriate default number of placement groups per pool into your Ceph configuration file. +.. note:: Ceph Object Gateways have multiple pools, so don't make the number of + PGs too high considering all of the pools assigned to the same CRUSH + hierarchy, or performance may suffer. + When configuring a gateway with the default region and zone, the naming convention for pools typically omits region and zone naming, but you can use any naming convention you prefer. For example: -- ``.rgw`` - ``.rgw.root`` - ``.rgw.control`` - ``.rgw.gc`` - ``.rgw.buckets`` - ``.rgw.buckets.index`` +- ``.rgw.buckets.extra`` - ``.log`` - ``.intent-log`` - ``.usage`` @@ -103,14 +108,18 @@ naming convention you prefer. For example: See `Configuration Reference - Pools`_ for details on the default pools for -gateways. See `Pools`_ for details on creating pools. Execute the following -to create a pool:: +gateways. See `Pools`_ for details on creating pools. As already said, if +write permission is given, Ceph Object Gateway will create pools automatically. +To create a pool manually, execute the following:: - ceph osd pool create {poolname} {pg-num} {pgp-num} + ceph osd pool create {poolname} {pg-num} {pgp-num} {replicated | erasure} [{erasure-code-profile}] {ruleset-name} {ruleset-number} -.. tip:: When adding a large number of pools, it may take some time for your - cluster to return to a ``active + clean`` state. +.. tip:: Ceph supports multiple CRUSH hierarchies and CRUSH rulesets, enabling + great flexibility in the way you configure your gateway. Pools such as + ``rgw.buckets.index`` may benefit from a pool of SSDs for fast performance. + Backing storage may benefit from the increased economy of erasure-coded + storage, and/or the improved performance from cache tiering. 
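+
+   For example, assuming a CRUSH ruleset named ``ssd-ruleset`` already
+   exists (a hypothetical name; the pg-num values are also illustrative),
+   the index pool could be placed on it with::
+
+       ceph osd pool create .rgw.buckets.index 32 32 replicated ssd-ruleset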
When you have completed this step, execute the following to ensure that you have created all of the foregoing pools:: @@ -121,320 +130,482 @@ you have created all of the foregoing pools:: Add a Gateway Configuration to Ceph =================================== -Add the Ceph Object Gateway configuration to your Ceph Configuration file. The -Ceph Object Gateway configuration requires you to identify the Ceph Object -Gateway instance. Then, you must specify the host name where you installed the -Ceph Object Gateway daemon, a keyring (for use with cephx), the socket path for -FastCGI and a log file. For example:: +Add the Ceph Object Gateway configuration to your Ceph Configuration file in +``admin node``. The Ceph Object Gateway configuration requires you to +identify the Ceph Object Gateway instance. Then, you must specify the host name +where you installed the Ceph Object Gateway daemon, a keyring (for use with +cephx), the socket path for FastCGI and a log file. + +For distros with Apache 2.2 and early versions of Apache 2.4 (RHEL 6, Ubuntu +12.04, 14.04 etc), append the following configuration to ``/etc/ceph/ceph.conf`` +in your ``admin node``:: - [client.radosgw.{instance-name}] - host = {host-name} + [client.radosgw.gateway] + host = {hostname} keyring = /etc/ceph/ceph.client.radosgw.keyring - rgw socket path = /var/run/ceph/ceph.radosgw.{instance-name}.fastcgi.sock - log file = /var/log/radosgw/client.radosgw.{instance-name}.log + rgw socket path = "" + log file = /var/log/radosgw/client.radosgw.gateway.log + rgw frontends = fastcgi socket_port=9000 socket_host=0.0.0.0 + rgw print continue = false -The ``[client.radosgw.*]`` portion of the gateway instance identifies this -portion of the Ceph configuration file as configuring a Ceph Storage Cluster -client where the client type is a Ceph Object Gateway (i.e., ``radosgw``). The -instance name follows. For example:: + +.. note:: Apache 2.2 and early versions of Apache 2.4 do not use Unix Domain + Sockets but use localhost TCP. + +For distros with Apache 2.4.9 or later (RHEL 7, CentOS 7 etc), append the +following configuration to ``/etc/ceph/ceph.conf`` in your ``admin node``:: [client.radosgw.gateway] - host = ceph-gateway + host = {hostname} keyring = /etc/ceph/ceph.client.radosgw.keyring rgw socket path = /var/run/ceph/ceph.radosgw.gateway.fastcgi.sock - log file = /var/log/radosgw/client.radosgw.{instance-name}.log + log file = /var/log/radosgw/client.radosgw.gateway.log + rgw print continue = false -.. note:: The ``host`` must be your machine hostname, not the FQDN. Make sure - that the name you use for the FastCGI socket is not the same as the one - used for the object gateway, which is - ``ceph-client.radosgw.{instance-name}.asok`` by default. You must use the - same name in your S3 FastCGI file too. See `Add a Ceph Object Gateway - Script`_ for details. -Configuring Print Continue --------------------------- +.. note:: ``Apache 2.4.9`` supports Unix Domain Socket (UDS) but as + ``Ubuntu 14.04`` ships with ``Apache 2.4.7`` it doesn't have UDS support and + has to be configured for use with localhost TCP. A bug has been filed for + backporting UDS support in ``Apache 2.4.7`` for ``Ubuntu 14.04``. + See: `Backport support for UDS in Ubuntu Trusty`_ -On CentOS/RHEL distributions, turn off ``print continue``. If you have it set -to ``true``, you may encounter problems with ``PUT`` operations. 
:: +Here, ``{hostname}`` is the short hostname (output of command ``hostname -s``) +of the node that is going to provide the gateway service i.e, the +``gateway host``. - rgw print continue = false +The ``[client.radosgw.gateway]`` portion of the gateway instance identifies this +portion of the Ceph configuration file as configuring a Ceph Storage Cluster +client where the client type is a Ceph Object Gateway (i.e., ``radosgw``). -Configuring Operations Logging ------------------------------- -In early releases of Ceph (v0.66 and earlier), the Ceph Object Gateway will log -every successful operation in the Ceph Object Gateway backend by default. This -means that every request, whether it is a read request or a write request will -generate a gateway operation that writes data. This does not come without cost, -and may affect overall performance. Turning off logging completely can be done -by adding the following config option to the Ceph configuration file:: +.. note:: The last line in the configuration i.e, ``rgw print continue = false`` + is added to avoid issues with ``PUT`` operations. - rgw enable ops log = false +Once you finish the setup procedure, if you encounter issues with your +configuration, you can add debugging to the ``[global]`` section of your Ceph +configuration file and restart the gateway to help troubleshoot any +configuration issues. For example:: -Another way to reduce the logging load is to send operations logging data to a -UNIX domain socket, instead of writing it to the Ceph Object Gateway backend:: + [global] + #append the following in the global section. + debug ms = 1 + debug rgw = 20 - rgw ops log rados = false - rgw enable ops log = true - rgw ops log socket path = -When specifying a UNIX domain socket, it is also possible to specify the maximum -amount of memory that will be used to keep the data backlog:: +Distribute updated Ceph configuration file +========================================== - rgw ops log data backlog = +The updated Ceph configuration file needs to be distributed to all Ceph cluster +nodes from the ``admin node``. -Any backlogged data in excess to the specified size will be lost, so the socket -needs to be read constantly. +It involves the following steps: +#. Pull the updated ``ceph.conf`` from ``/etc/ceph/`` to the root directory of + the cluster in admin node (e.g. ``my-cluster`` directory). The contents of + ``ceph.conf`` in ``my-cluster`` will get overwritten. To do so, execute the + following:: -Enabling Subdomain S3 Calls ---------------------------- + ceph-deploy --overwrite-conf config pull {hostname} -To use a Ceph Object Gateway with subdomain S3 calls (e.g., -``http://bucketname.hostname``), you must add the Ceph Object Gateway DNS name -under the ``[client.radosgw.gateway]`` section of your Ceph configuration file:: + Here, ``{hostname}`` is the short hostname of the Ceph admin node. - [client.radosgw.gateway] - ... - rgw dns name = {hostname} +#. Push the updated ``ceph.conf`` file from the admin node to all other nodes in + the cluster including the ``gateway host``:: -You should also consider installing a DNS server such as `Dnsmasq`_ on your -client machine(s) when using ``http://{bucketname}.{hostname}`` syntax. The -``dnsmasq.conf`` file should include the following settings:: + ceph-deploy --overwrite-conf config push [HOST] [HOST...] - address=/{hostname}/{host-ip-address} - listen-address={client-loopback-ip} + Give the hostnames of the other Ceph nodes in place of ``[HOST] [HOST...]``. 
-Then, add the ``{client-loopback-ip}`` IP address as the first DNS nameserver -on client the machine(s). -See `Add Wildcard to DNS`_ for details. +Copy ceph.client.admin.keyring from admin node to gateway host +============================================================== +As the ``gateway host`` can be a different node that is not part of the cluster, +the ``ceph.client.admin.keyring`` needs to be copied from the ``admin node`` to +the ``gateway host``. To do so, execute the following on ``admin node``:: -Redeploy Ceph Configuration ---------------------------- + sudo scp /etc/ceph/ceph.client.admin.keyring ceph@{hostname}:/home/ceph + ssh {hostname} + sudo mv ceph.client.admin.keyring /etc/ceph/ceph.client.admin.keyring -To use ``ceph-deploy`` to push a new copy of the configuration file to the hosts -in your cluster, execute the following:: - ceph-deploy config push {host-name [host-name]...} +.. note:: The above step need not be executed if ``admin node`` is the + ``gateway host``. -Add a Ceph Object Gateway Script -================================ +Create Data Directory +===================== -Add a ``s3gw.fcgi`` file (use the same name referenced in the first line -of ``rgw.conf``). For Debian/Ubuntu distributions, save the file to the -``/var/www`` directory. For CentOS/RHEL distributions, save the file to the -``/var/www/html`` directory. Assuming a cluster named ``ceph`` (default), -and the user created in previous steps, the contents of the file should -include:: +Deployment scripts may not create the default Ceph Object Gateway data +directory. Create data directories for each instance of a ``radosgw`` +daemon (if you haven't done so already). The ``host`` variables in the +Ceph configuration file determine which host runs each instance of a +``radosgw`` daemon. The typical form specifies the ``radosgw`` daemon, +the cluster name and the daemon ID. - #!/bin/sh - exec /usr/bin/radosgw -c /etc/ceph/ceph.conf -n client.radosgw.gateway +To create the directory on the ``gateway host``, execute the following:: -Ensure that you apply execute permissions to ``s3gw.fcgi``. :: + sudo mkdir -p /var/lib/ceph/radosgw/ceph-radosgw.gateway - sudo chmod +x s3gw.fcgi -On some distributions, you must also change the ownership to ``apache``. :: +Adjust Socket Directory Permissions +=================================== - sudo chown apache:apache s3gw.fcgi +On some distros, the ``radosgw`` daemon runs as the unprivileged ``apache`` +UID, and this UID must have write access to the location where it will write +its socket file. +To grant permissions to the default socket location, execute the following on +the ``gateway host``:: + sudo chown apache:apache /var/run/ceph -Create Data Directory -===================== -Deployment scripts may not create the default Ceph Object Gateway data -directory. Create data directories for each instance of a ``radosgw`` daemon -(if you haven't done so already). The ``host`` variables in the Ceph -configuration file determine which host runs each instance of a ``radosgw`` -daemon. The typical form specifies the ``radosgw`` daemon, the cluster name and -the daemon ID. :: +Change Log File Owner +===================== - sudo mkdir -p /var/lib/ceph/radosgw/{$cluster}-{$id} +On some distros, the ``radosgw`` daemon runs as the unprivileged ``apache`` UID, +but the ``root`` user owns the log file by default. You must change it to the +``apache`` user so that Apache can populate the log file. 
To do so, execute +the following:: -Using the exemplary ``ceph.conf`` settings above, you would execute the following:: + sudo chown apache:apache /var/log/radosgw/client.radosgw.gateway.log - sudo mkdir -p /var/lib/ceph/radosgw/ceph-radosgw.gateway +Start radosgw service +===================== +The Ceph Object Gateway daemon needs to be started. To do so, execute the +following on the ``gateway host``: -Create a Gateway Configuration -============================== +On Debian-based distros:: -On the host where you installed the Ceph Object Gateway, create an ``rgw.conf`` -file. For Debian/Ubuntu systems, place the file in the -``/etc/apache2/sites-available`` directory. For CentOS/RHEL systems, place the -file in the ``/etc/httpd/conf.d`` directory. + sudo /etc/init.d/radosgw start -We recommend deploying FastCGI as an external server, because allowing Apache to -manage FastCGI sometimes introduces high latency. To manage FastCGI as an -external server, use the ``FastCgiExternalServer`` directive. See -`FastCgiExternalServer`_ for details on this directive. See `Module -mod_fastcgi`_ for general details. See `Apache Virtual Host documentation`_ for -details on ```` format and settings. See ` Directive`_ -for additional details. +On RPM-based distros:: -Ceph Object Gateway requires a rewrite rule for the Amazon S3-compatible -interface. It's required for passing in the ``HTTP_AUTHORIZATION env`` for S3, -which is filtered out by Apache. The rewrite rule is not necessary for the -OpenStack Swift-compatible interface. + sudo /etc/init.d/ceph-radosgw start -You should configure Apache to allow encoded slashes, provide paths for log -files and to turn off server signatures. See below for an exemplary embodiment -of a gateway configuration for Debian/Ubuntu and CentOS/RHEL. -.. rubric:: Debian/Ubuntu +Create a Gateway Configuration file +=================================== -.. literalinclude:: rgw-debian.conf - :language: ini +On the host where you installed the Ceph Object Gateway i.e, ``gateway host``, +create an ``rgw.conf`` file. Place the file in ``/etc/apache2/conf-available`` +directory for ``Debian-based`` distros and in ``/etc/httpd/conf.d`` directory +for ``RPM-based`` distros. It is an Apache configuration file, which is needed +for the ``radosgw`` service. This file must be readable by the web server. -.. rubric:: CentOS/RHEL +Execute the following steps: -.. literalinclude:: rgw-centos.conf - :language: ini +#. Create the file: + For Debian-based distros, execute:: -#. Replace the ``/{path}/{socket-name}`` entry with path to the socket and - the socket name. For example, - ``/var/run/ceph/ceph.radosgw.gateway.fastcgi.sock``. Ensure that you use the - same path and socket name in your ``ceph.conf`` entry. + sudo vi /etc/apache2/conf-available/rgw.conf -#. Replace the ``{fqdn}`` entry with the fully-qualified domain name of the - server. - + For RPM-based distros, execute:: -#. Replace the ``{email.address}`` entry with the email address for the - server administrator. - + sudo vi /etc/httpd/conf.d/rgw.conf -#. Add a ``ServerAlias`` if you wish to use S3-style subdomains - (of course you do). +#.
For distros with Apache 2.2 and early versions of Apache 2.4 that use + localhost TCP and do not support Unix Domain Socket, add the following + contents to the file:: - SSLEngine on - SSLCertificateFile /etc/apache2/ssl/apache.crt - SSLCertificateKeyFile /etc/apache2/ssl/apache.key - SetEnv SERVER_PORT_SECURE 443 + + ServerName localhost + DocumentRoot /var/www/html + ErrorLog /var/log/httpd/rgw_error.log + CustomLog /var/log/httpd/rgw_access.log combined -.. _Module mod_fastcgi: http://www.fastcgi.com/drupal/node/25 -.. _FastCgiExternalServer: http://www.fastcgi.com/drupal/node/25#FastCgiExternalServer -.. _Apache Virtual Host documentation: http://httpd.apache.org/docs/2.2/vhosts/ -.. _ Directive: http://httpd.apache.org/docs/2.2/mod/core.html#ifmodule - - -.. important:: If you are using CentOS, RHEL or a similar distribution, make - sure that ``FastCgiWrapper`` is turned ``off`` in - ``/etc/httpd/conf.d/fastcgi.conf``. It is usually ``on`` by default. + # LogLevel debug -For Debian/Ubuntu distributions, enable the site for ``rgw.conf``. :: + RewriteEngine On - sudo a2ensite rgw.conf + RewriteRule .* - [E=HTTP_AUTHORIZATION:%{HTTP:Authorization},L] -Then, disable the default site. :: + SetEnv proxy-nokeepalive 1 - sudo a2dissite default - + ProxyPass / fcgi://localhost:9000/ -Adjust Path Ownership/Permissions -================================= + -On some distributions, you must change ownership for ``/var/log/httpd`` or -``/var/log/apache2`` and ``/var/run/ceph`` to ensure that Apache has permissions -to create a socket or log file. :: + .. note:: For Debian-based distros replace ``/var/log/httpd/`` + with ``/var/log/apache2``. - sudo chown apache:apache /path/to/file +#. For distros with Apache 2.4.9 or later that support Unix Domain Socket, + add the following contents to the file:: -On some systems, you may need to set SELinux to ``Permissive``. If you are -unable to communicate with the gateway after attempting to start it, try -executing:: + + ServerName localhost + DocumentRoot /var/www/html - getenforce - -If the result is ``1`` or ``Enforcing``, execute:: + ErrorLog /var/log/httpd/rgw_error.log + CustomLog /var/log/httpd/rgw_access.log combined - sudo setenforce 0 + # LogLevel debug -Then, restart Apache and the gateway daemon to see if that resolves the issue. -If it does, you can configure your system to disable SELinux. + RewriteEngine On + RewriteRule .* - [E=HTTP_AUTHORIZATION:%{HTTP:Authorization},L] -Restart Services and Start the Gateway -====================================== + SetEnv proxy-nokeepalive 1 -To ensure that all components have reloaded their configurations, we recommend -restarting your ``ceph`` and ``apache`` services. Then, start up the -``radosgw`` service. + ProxyPass / unix:///var/run/ceph/ceph.radosgw.gateway.fastcgi.sock|fcgi://localhost:9000/ -For the Ceph Storage Cluster, see `Operating a Cluster`_ for details. Some -versions of Ceph use different methods for starting and stopping clusters. + Restart Apache --------------- +============== + +The Apache service needs to be restarted to accept the new configuration. -On Debian/Ubuntu systems, use ``apache2``. For example:: +For Debian-based distros, run:: sudo service apache2 restart - sudo /etc/init.d/apache2 restart -On CentOS/RHEL systems, use ``httpd``. 
For example:: + + sudo service httpd restart + +Or:: + + sudo systemctl restart httpd + + +Using The Gateway +================= + +To use the REST interfaces, first create an initial Ceph Object Gateway +user for the S3 interface. Then, create a subuser for the Swift interface. +See the `Admin Guide`_ for more details on user management. + +Create a radosgw user for S3 access +------------------------------------ + +A ``radosgw`` user needs to be created and granted access. The command +``man radosgw-admin`` will provide information on additional command options. + +To create the user, execute the following on the ``gateway host``:: + + sudo radosgw-admin user create --uid="testuser" --display-name="First User" + +The output of the command will be something like the following:: + + {"user_id": "testuser", + "display_name": "First User", + "email": "", + "suspended": 0, + "max_buckets": 1000, + "auid": 0, + "subusers": [], + "keys": [ + { "user": "testuser", + "access_key": "I0PJDPCIYZ665MW88W9R", + "secret_key": "dxaXZ8U90SXydYzyS5ivamEP20hkLSUViiaR+ZDA"}], + "swift_keys": [], + "caps": [], + "op_mask": "read, write, delete", + "default_placement": "", + "placement_tags": [], + "bucket_quota": { "enabled": false, + "max_size_kb": -1, + "max_objects": -1}, + "user_quota": { "enabled": false, + "max_size_kb": -1, + "max_objects": -1}, + "temp_url_keys": []} + + +.. note:: The values of ``keys->access_key`` and ``keys->secret_key`` are + needed for access validation. + +Create a Swift user +------------------- + +A Swift subuser needs to be created if this kind of access is needed. Creating +a Swift user is a two step process. The first step is to create the user. +The second is to create the secret key. + +Execute the following steps on the ``gateway host``: + +Create the Swift user:: + + sudo radosgw-admin subuser create --uid=testuser --subuser=testuser:swift --access=full + +The output will be something like the following:: + + { "user_id": "testuser", + "display_name": "First User", + "email": "", + "suspended": 0, + "max_buckets": 1000, + "auid": 0, + "subusers": [ + { "id": "testuser:swift", + "permissions": "full-control"}], + "keys": [ + { "user": "testuser:swift", + "access_key": "3Y1LNW4Q6X0Y53A52DET", + "secret_key": ""}, + { "user": "testuser", + "access_key": "I0PJDPCIYZ665MW88W9R", + "secret_key": "dxaXZ8U90SXydYzyS5ivamEP20hkLSUViiaR+ZDA"}], + "swift_keys": [], + "caps": [], + "op_mask": "read, write, delete", + "default_placement": "", + "placement_tags": [], + "bucket_quota": { "enabled": false, + "max_size_kb": -1, + "max_objects": -1}, + "user_quota": { "enabled": false, + "max_size_kb": -1, + "max_objects": -1}, + "temp_url_keys": []} + +Create the secret key:: + + sudo radosgw-admin key create --subuser=testuser:swift --key-type=swift --gen-secret + +The output will be something like the following:: + + { "user_id": "testuser", + "display_name": "First User", + "email": "", + "suspended": 0, + "max_buckets": 1000, + "auid": 0, + "subusers": [ + { "id": "testuser:swift", + "permissions": "full-control"}], + "keys": [ + { "user": "testuser:swift", + "access_key": "3Y1LNW4Q6X0Y53A52DET", + "secret_key": ""}, + { "user": "testuser", + "access_key": "I0PJDPCIYZ665MW88W9R", + "secret_key": "dxaXZ8U90SXydYzyS5ivamEP20hkLSUViiaR+ZDA"}], + "swift_keys": [ + { "user": "testuser:swift", + "secret_key": "244+fz2gSqoHwR3lYtSbIyomyPHf3i7rgSJrF\/IA"}], + "caps": [], + "op_mask": "read, write, delete", + "default_placement": "", + "placement_tags": [],
"bucket_quota": { "enabled": false, + "max_size_kb": -1, + "max_objects": -1}, + "user_quota": { "enabled": false, + "max_size_kb": -1, + "max_objects": -1}, + "temp_url_keys": []} + +Access Verification +=================== + +You then need to verify that the created users are able to access the gateway. + +Test S3 access +-------------- + +You need to write and run a Python test script for verifying S3 access. The S3 +access test script will connect to the ``radosgw``, create a new bucket and list +all buckets. The values for ``aws_access_key_id`` and ``aws_secret_access_key`` +are taken from the values of ``access_key`` and ``secret_key`` returned by the +``radosgw-admin`` command. + +Execute the following steps: + +#. You will need to install the ``python-boto`` package. + + For Debian-based distros, run:: - sudo /etc/init.d/httpd restart + sudo apt-get install python-boto + For RPM-based distros, run:: -Start the Gateway + sudo yum install python-boto + +#. Create the Python script:: + + vi s3test.py + +#. Add the following contents to the file:: + + import boto + import boto.s3.connection + access_key = 'I0PJDPCIYZ665MW88W9R' + secret_key = 'dxaXZ8U90SXydYzyS5ivamEP20hkLSUViiaR+ZDA' + conn = boto.connect_s3( + aws_access_key_id = access_key, + aws_secret_access_key = secret_key, + host = '{hostname}', + is_secure=False, + calling_format = boto.s3.connection.OrdinaryCallingFormat(), + ) + bucket = conn.create_bucket('my-new-bucket') + for bucket in conn.get_all_buckets(): + print "{name}\t{created}".format( + name = bucket.name, + created = bucket.creation_date, + ) + + Replace ``{hostname}`` with the hostname of the host where you have + configured the gateway service i.e, the ``gateway host``. + +#. Run the script:: + + python s3test.py + + The output will be something like the following:: + + my-new-bucket 2015-02-16T17:09:10.000Z + +Test swift access ----------------- -On Debian/Ubuntu systems, use ``radosgw``. For example:: +Swift access can be verified via the ``swift`` command line client. The command +``man swift`` will provide more information on available command line options. - sudo /etc/init.d/radosgw start - -On CentOS/RHEL systems, use ``ceph-radosgw``. For example:: +To install ``swift`` client, execute the following: - sudo /etc/init.d/ceph-radosgw start + For Debian-based distros:: + sudo apt-get install python-setuptools + sudo easy_install pip + sudo pip install --upgrade setuptools + sudo pip install --upgrade python-swiftclient -Verify the Runtime ------------------- + For RPM-based distros:: -Once the service is up and running, you can make an anonymous GET request to see -if the gateway returns a response. A simple HTTP request to the domain name -should return the following: + sudo yum install python-setuptools + sudo easy_install pip + sudo pip install --upgrade setuptools + sudo pip install --upgrade python-swiftclient -.. code-block:: xml +To test swift access, execute the following:: - - - anonymous - - - - + swift -A http://{IP ADDRESS}/auth/1.0 -U testuser:swift -K '{swift_secret_key}' list +Replace ``{IP ADDRESS}`` with the public IP address of the gateway server and +``{swift_secret_key}`` with its value from the output of +``radosgw-admin key create`` command executed for the ``swift`` user.
+ +For example:: - -Using The Gateway -================= - -To use the REST interfaces, first create an initial Ceph Object Gateway user for -the S3 interface. Then, create a subuser for the swift interface. See the `Admin -Guide`_ for details. + swift -A http://10.19.143.116/auth/1.0 -U testuser:swift -K '244+fz2gSqoHwR3lYtSbIyomyPHf3i7rgSJrF/IA' list +The output should be:: + my-new-bucket -.. _Dnsmasq: https://help.ubuntu.com/community/Dnsmasq .. _Configuration Reference - Pools: ../config-ref#pools .. _Pool Configuration: ../../rados/configuration/pool-pg-config-ref/ .. _Pools: ../../rados/operations/pools -.. _Cephx Guide: ../../rados/operations/authentication/#cephx-guide -.. _Operating a Cluster: ../../rados/rados/operations/operating +.. _User Management: ../../rados/operations/user-management +.. _Backport support for UDS in Ubuntu Trusty: https://bugs.launchpad.net/ubuntu/+source/apache2/+bug/1411030 .. _Admin Guide: ../admin -.. _Add Wildcard to DNS: ../../install/install-ceph-gateway#add-wildcard-to-dns diff --git a/doc/radosgw/federated-config.rst b/doc/radosgw/federated-config.rst index 863d1f7339bb4..9add92e19e9c8 100644 --- a/doc/radosgw/federated-config.rst +++ b/doc/radosgw/federated-config.rst @@ -103,14 +103,12 @@ format prepended to the pool name, but you can use any naming convention you prefer. For example: -- ``.us.rgw.root`` - -- ``.us-east.domain.rgw`` - ``.us-east.rgw.root`` - ``.us-east.rgw.control`` - ``.us-east.rgw.gc`` -- ``.us-east.rgw.buckets.index`` - ``.us-east.rgw.buckets`` +- ``.us-east.rgw.buckets.index`` +- ``.us-east.rgw.buckets.extra`` - ``.us-east.log`` - ``.us-east.intent-log`` - ``.us-east.usage`` @@ -119,12 +117,14 @@ prefer. For example: - ``.us-east.users.swift`` - ``.us-east.users.uid`` -- ``.us-west.domain.rgw`` +| + - ``.us-west.rgw.root`` - ``.us-west.rgw.control`` - ``.us-west.rgw.gc`` -- ``.us-west.rgw.buckets.index`` - ``.us-west.rgw.buckets`` +- ``.us-west.rgw.buckets.index`` +- ``.us-west.rgw.buckets.extra`` - ``.us-west.log`` - ``.us-west.intent-log`` - ``.us-west.usage`` @@ -137,7 +137,7 @@ See `Configuration Reference - Pools`_ for details on the default pools for gateways. See `Pools`_ for details on creating pools. Execute the following to create a pool:: - ceph osd pool create {poolname} {pg-num} {pgp-num} + ceph osd pool create {poolname} {pg-num} {pgp-num} {replicated | erasure} [{erasure-code-profile}] {ruleset-name} {ruleset-number} .. tip:: When adding a large number of pools, it may take some time for your @@ -148,6 +148,12 @@ to create a pool:: When deploying a Ceph Storage Cluster for the entire region, consider using a CRUSH rule for the zone such that you do NOT have overlapping failure domains. See `CRUSH Map`_ for details. + + Ceph supports multiple CRUSH hierarchies and CRUSH rulesets, enabling + great flexibility in the way you configure your gateway. Pools such + as ``rgw.buckets.index`` may benefit from a modestly sized pool of SSDs + for fast performance. Backing storage may benefit from the increased economy + of erasure-coded storage, and/or the improved performance from cache tiering.
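+
+   For example, the ``us-east`` bucket data pool could be placed on
+   erasure-coded storage (``myprofile`` being a previously created erasure
+   code profile; the pg-num values are illustrative)::
+
+       ceph osd pool create .us-east.rgw.buckets 128 128 erasure myprofile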
When you have completed this step, execute the following to ensure that you have created all of the foregoing pools:: diff --git a/doc/radosgw/index.rst b/doc/radosgw/index.rst index 564427a515b66..e608c69fc8889 100644 --- a/doc/radosgw/index.rst +++ b/doc/radosgw/index.rst @@ -3,7 +3,7 @@ ===================== :term:`Ceph Object Gateway` is an object storage interface built on top of -``librgw`` to provide applications with a RESTful gateway to +``librados`` to provide applications with a RESTful gateway to Ceph Storage Clusters. :term:`Ceph Object Storage` supports two interfaces: #. **S3-compatible:** Provides object storage functionality with an interface diff --git a/doc/radosgw/keystone.rst b/doc/radosgw/keystone.rst index 7fae21293f472..16ca2a663c2b1 100644 --- a/doc/radosgw/keystone.rst +++ b/doc/radosgw/keystone.rst @@ -25,12 +25,69 @@ the Ceph Object Gateway gets the ticket, it looks at the tenant, and the user roles that are assigned to that ticket, and accepts/rejects the request according to the ``rgw keystone accepted roles`` configurable. + +Prior to Kilo +------------- + +Keystone itself needs to be configured to point to the Ceph Object Gateway as an +object-storage endpoint:: + + keystone service-create --name swift --type object-store + keystone endpoint-create --service-id --publicurl http://radosgw.example.com/swift/v1 \ + --internalurl http://radosgw.example.com/swift/v1 --adminurl http://radosgw.example.com/swift/v1 + + +As of Kilo +---------- + Keystone itself needs to be configured to point to the Ceph Object Gateway as an object-storage endpoint:: - keystone service-create --name swift --type object-store - keystone endpoint-create --service-id --publicurl http://radosgw.example.com/swift/v1 \ - --internalurl http://radosgw.example.com/swift/v1 --adminurl http://radosgw.example.com/swift/v1 + openstack service create --name=swift \ + --description="Swift Service" \ + object-store + +-------------+----------------------------------+ + | Field | Value | + +-------------+----------------------------------+ + | description | Swift Service | + | enabled | True | + | id | 37c4c0e79571404cb4644201a4a6e5ee | + | name | swift | + | type | object-store | + +-------------+----------------------------------+ + + openstack endpoint create --region RegionOne \ + --publicurl "http://radosgw.example.com:8080/swift/v1" \ + --adminurl "http://radosgw.example.com:8080/swift/v1" \ + --internalurl "http://radosgw.example.com:8080/swift/v1" \ + swift + +--------------+------------------------------------------+ + | Field | Value | + +--------------+------------------------------------------+ + | adminurl | http://radosgw.example.com:8080/swift/v1 | + | id | e4249d2b60e44743a67b5e5b38c18dd3 | + | internalurl | http://radosgw.example.com:8080/swift/v1 | + | publicurl | http://radosgw.example.com:8080/swift/v1 | + | region | RegionOne | + | service_id | 37c4c0e79571404cb4644201a4a6e5ee | + | service_name | swift | + | service_type | object-store | + +--------------+------------------------------------------+ + + $ openstack endpoint show object-store + +--------------+------------------------------------------+ + | Field | Value | + +--------------+------------------------------------------+ + | adminurl | http://radosgw.example.com:8080/swift/v1 | + | enabled | True | + | id | e4249d2b60e44743a67b5e5b38c18dd3 | + | internalurl | http://radosgw.example.com:8080/swift/v1 | + | publicurl | http://radosgw.example.com:8080/swift/v1 | + | region | RegionOne | + | service_id | 
37c4c0e79571404cb4644201a4a6e5ee | + | service_name | swift | + | service_type | object-store | + +--------------+------------------------------------------+ The keystone URL is the Keystone admin RESTful API URL. The admin token is the diff --git a/doc/radosgw/layout.rst b/doc/radosgw/layout.rst new file mode 100644 index 0000000000000..0ebb540179023 --- /dev/null +++ b/doc/radosgw/layout.rst @@ -0,0 +1,178 @@ +=========================== + Rados Gateway Data Layout +=========================== + +Although the source code is the ultimate guide, this document helps +new developers to get up to speed with the implementation details. + +Introduction +------------ + +Swift offers something called a container, which we use interchangeably with +the term bucket. One may say that RGW's buckets implement Swift containers. + +This document does not consider how RGW operates on these structures, +e.g. the use of encode() and decode() methods for serialization and so on. + +Conceptual View +--------------- + +Although RADOS only knows about pools and objects with their xattrs and +omap[1], conceptually RGW organizes its data into three different kinds: +metadata, bucket index, and data. + +* Metadata + +We have 3 'sections' of metadata: 'user', 'bucket', and 'bucket.instance'. +You can use the following commands to introspect metadata entries: + +$ radosgw-admin metadata list + +$ radosgw-admin metadata list bucket +$ radosgw-admin metadata list bucket.instance +$ radosgw-admin metadata list user + +$ radosgw-admin metadata get bucket: +$ radosgw-admin metadata get bucket.instance:: +$ radosgw-admin metadata get user: # or set + +user: Holds user information +bucket: Holds a mapping between bucket name and bucket instance id +bucket.instance: Holds bucket instance information[2] + +Every metadata entry is kept on a single rados object. +See below for implementation details. + +Note that the metadata is not indexed. When listing a metadata section we do a +rados pgls operation on the containing pool. + +* Bucket Index + +It's a different kind of metadata, and kept separately. The bucket index holds +a key-value map in rados objects. By default it is a single rados object per +bucket, but it is possible since Hammer to shard that map over multiple rados +objects. The map itself is kept in omap, associated with each rados object. +The key of each omap entry is the name of the object, and the value holds some basic +metadata of that object -- metadata that shows up when listing the bucket. +Also, each omap holds a header, and we keep some bucket accounting metadata +in that header (number of objects, total size, etc.). + +Note that we also hold other information in the bucket index, and it's kept in +other key namespaces. We can hold the bucket index log there, and for versioned +objects there is more information that we keep on other keys. + +* Data + +Object data is kept in one or more rados objects for each rgw object. + +Object Lookup Path +------------------ + +When accessing objects, ReST APIs come to RGW with three parameters: +account information (access key in S3 or account name in Swift), +bucket or container name, and object name (or key). At present, RGW only +uses account information to find out the user ID and for access control. +Only the bucket name and object key are used to address the object in a pool. + +The user ID in RGW is a string, typically the actual user name from the user +credentials and not a hashed or mapped identifier.
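Because the bucket index described above lives in omap, it can be inspected directly with the ``rados`` tool. A minimal sketch, assuming an index object for a hypothetical bucket marker ``default.7593.4`` (the pool and object naming conventions are discussed below)::

    rados -p .rgw.buckets.index listomapkeys .dir.default.7593.4
    rados -p .rgw.buckets.index listomapvals .dir.default.7593.4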
+ +When accessing a user's data, the user record is loaded from an object +"" in pool ".users.uid". + +Bucket names are represented directly in the pool ".rgw". The bucket record is +loaded in order to obtain the so-called marker, which serves as a bucket ID. + +The object is located in pool ".rgw.buckets". Object name is "_", +for example "default.7593.4_image.png", where the marker is "default.7593.4" +and the key is "image.png". Since these concatenated names are not parsed, +only passed down to RADOS, the choice of the separator is not important and +causes no ambiguity. For the same reason, slashes are permitted in object +names (keys). + +It is also possible to create multiple data pools and make it so that +different users' buckets will be created in different rados pools by default, +thus providing the necessary scaling. The layout and naming of these pools +is controlled by a 'policy' setting.[3] + +An RGW object may consist of several RADOS objects, the first of which +is the head that contains the metadata, such as manifest, ACLs, content type, +ETag, and user-defined metadata. The metadata is stored in xattrs. +The head may also contain up to 512 kilobytes of object data, for efficiency +and atomicity. The manifest describes how each object is laid out in RADOS +objects. + +Bucket and Object Listing +------------------------- + +Buckets that belong to a given user are listed in an omap of an object named +".buckets" (for example, "foo.buckets") in pool ".users.uid". +These objects are accessed when listing buckets, when updating bucket +contents, and updating and retrieving bucket statistics (e.g. for quota). + +See the user-visible, encoded class 'cls_user_bucket_entry' and its +nested class 'cls_user_bucket' for the values of these omap entries. + +These listings are kept consistent with buckets in pool ".rgw". + +Objects that belong to a given bucket are listed in a bucket index, +as discussed in sub-section 'Bucket Index' above. The default naming +for index objects is ".dir." in pool ".rgw.buckets.index". + +Footnotes +--------- + +[1] Omap is a key-value store, associated with an object, in a way similar +to how Extended Attributes associate with a POSIX file. An object's omap +is not physically located in the object's storage, but its precise +implementation is invisible and immaterial to RADOS Gateway. +In Hammer, one LevelDB is used to store omap in each OSD. + +[2] Before the Dumpling release, the 'bucket.instance' metadata did not +exist and the 'bucket' metadata contained its information. It is possible +to encounter such buckets in old installations. + +[3] In Infernalis, a pending commit exists that removes the need of prefixing +all the rgw system pools with a period, and also renames all of these pools. +See Github pull request #4944 "rgw noperiod". + +Appendix: Compendium +-------------------- + +Known pools: + +.rgw.root + Unspecified region, zone, and global information records, one per object. + +.rgw.control + notify. + +.rgw + + .bucket.meta.: + +.rgw.gc + gc. + +.users.uid + Contains _both_ per-user information (RGWUserInfo) in "" objects + and per-user bucket lists in omaps of ".buckets" objects. + +.users.email + Unimportant + +.users + 47UA98JSTJZ9YAN3OS3O + It's unclear why user ID is not used to name objects in this pool. + +.rgw.buckets.index + Objects are named ".dir.", each contains a bucket index.
+ +.rgw.buckets + default.7593.4__shadow_.488urDFerTYXavx4yAd-Op8mxehnvTI_1 + _ + +An example of a marker would be "default.16004.1" or "default.7593.4". diff --git a/doc/radosgw/s3.rst b/doc/radosgw/s3.rst index fe975e6b9667e..8b21e42ef4fd1 100644 --- a/doc/radosgw/s3.rst +++ b/doc/radosgw/s3.rst @@ -2,7 +2,7 @@ Ceph Object Gateway S3 API ============================ -Ceph supports a RESTful API that is compatible with the basic data access model of the Amazon S3 API. +Ceph supports a RESTful API that is compatible with the basic data access model of the `Amazon S3 API`_. API --- @@ -50,7 +50,7 @@ The following table describes the support status for current Amazon S3 functiona +---------------------------------+-----------------+----------------------------------------+ | **Bucket Notification** | Not Supported | | +---------------------------------+-----------------+----------------------------------------+ -| **Bucket Object Versions** | Not Supported | | +| **Bucket Object Versions** | Supported | | +---------------------------------+-----------------+----------------------------------------+ | **Get Bucket Info (HEAD)** | Supported | | +---------------------------------+-----------------+----------------------------------------+ @@ -93,3 +93,5 @@ The following common request header fields are not supported: +----------------------------+------------+ | **x-amz-version-id** | Response | +----------------------------+------------+ + +.. _Amazon S3 API: http://docs.aws.amazon.com/AmazonS3/latest/API/APIRest.html diff --git a/doc/radosgw/s3/bucketops.rst b/doc/radosgw/s3/bucketops.rst index f066005ab24be..95457b7565f31 100644 --- a/doc/radosgw/s3/bucketops.rst +++ b/doc/radosgw/s3/bucketops.rst @@ -345,3 +345,33 @@ Response Entities +-----------------------------------------+-------------+----------------------------------------------------------------------------------------------------------+ | ``CommonPrefixes.Prefix`` | String | The substring of the key after the prefix as defined by the ``prefix`` request parameter. | +-----------------------------------------+-------------+----------------------------------------------------------------------------------------------------------+ + +ENABLE/SUSPEND BUCKET VERSIONING +-------------------------------- + +``PUT /?versioning`` This subresource sets the versioning state of an existing bucket. To set the versioning state, you must be the bucket owner. + +You can set the versioning state with one of the following values: + +- Enabled : Enables versioning for the objects in the bucket. All objects added to the bucket receive a unique version ID. +- Suspended : Disables versioning for the objects in the bucket. All objects added to the bucket receive the version ID null. + +If the versioning state has never been set on a bucket, it has no versioning state; a GET versioning request does not return a versioning state value. + +Syntax +~~~~~~ + +:: + + PUT /{bucket}?versioning HTTP/1.1 + +REQUEST ENTITIES +~~~~~~~~~~~~~~~~ + ++-----------------------------+-----------+---------------------------------------------------------------------------+ +| Name | Type | Description | ++=============================+===========+===========================================================================+ +| ``VersioningConfiguration`` | Container | A container for the request.
| ++-----------------------------+-----------+---------------------------------------------------------------------------+ +| ``Status`` | String | Sets the versioning state of the bucket. Valid Values: Suspended/Enabled | ++-----------------------------+-----------+---------------------------------------------------------------------------+ \ No newline at end of file diff --git a/doc/radosgw/swift.rst b/doc/radosgw/swift.rst index ccc27a1fcaf61..f0a8cbf9aa0e6 100644 --- a/doc/radosgw/swift.rst +++ b/doc/radosgw/swift.rst @@ -2,7 +2,7 @@ Ceph Object Gateway Swift API =============================== -Ceph supports a RESTful API that is compatible with the basic data access model of the Swift API. +Ceph supports a RESTful API that is compatible with the basic data access model of the `Swift API`_. API --- @@ -14,6 +14,7 @@ API Service Ops Container Ops Object Ops + Temp URL Ops Tutorial Java Python @@ -71,3 +72,4 @@ The following table describes the support status for current Swift functional fe | **CORS** | Not Supported | | +---------------------------------+-----------------+----------------------------------------+ +.. _Swift API: http://developer.openstack.org/api-ref-objectstorage-v1.html diff --git a/doc/radosgw/swift/auth.rst b/doc/radosgw/swift/auth.rst index e90162a50d8a2..bfc97733cec17 100644 --- a/doc/radosgw/swift/auth.rst +++ b/doc/radosgw/swift/auth.rst @@ -7,7 +7,7 @@ Swift API requests that require authentication must contain an The token may be retrieved from RADOS Gateway, or from another authenticator. To obtain a token from RADOS Gateway, you must create a user. For example:: - sudo radosgw-admin user create --uid="{username}" --displayname="{Display Name}" + sudo radosgw-admin user create --uid="{username}" --display-name="{Display Name}" For details on RADOS Gateway administration, see `radosgw-admin`_. diff --git a/doc/radosgw/swift/objectops.rst b/doc/radosgw/swift/objectops.rst index d7692dccedc29..edc3e35ac79fe 100644 --- a/doc/radosgw/swift/objectops.rst +++ b/doc/radosgw/swift/objectops.rst @@ -263,7 +263,7 @@ Syntax Request Headers ~~~~~~~~~~~~~~~ -``X-Container-Meta-{key}`` +``X-Object-Meta-{key}`` :Description: A user-defined meta data key that takes an arbitrary string value. :Type: String diff --git a/doc/radosgw/swift/python.rst b/doc/radosgw/swift/python.rst index 6d3929b7167f6..c12f89fea77f1 100644 --- a/doc/radosgw/swift/python.rst +++ b/doc/radosgw/swift/python.rst @@ -11,13 +11,13 @@ This creates a connection so that you can interact with the server: .. code-block:: python - import cloudfiles - username = 'account_name:username' - api_key = 'your_api_key' + import swiftclient + user = 'account_name:username' + key = 'your_api_key' - conn = cloudfiles.get_connection( - username=username, - api_key=api_key, + conn = swiftclient.Connection( + user=user, + key=key, authurl='https://objects.dreamhost.com/auth', ) @@ -29,8 +29,9 @@ This creates a new container called ``my-new-container``: .. code-block:: python - container = conn.create_container('my-new-container') - + container_name = 'my-new-container' + conn.put_container(container_name) + Create an Object ================ @@ -39,10 +40,11 @@ This creates a file ``hello.txt`` from the file named ``my_hello.txt``: .. 
code-block:: python - obj = container.create_object('hello.txt') - obj.content_type = 'text/plain' - obj.load_from_filename('./my_hello.txt') - + with open('hello.txt', 'r') as hello_file: + conn.put_object(container_name, 'hello.txt', + contents=hello_file.read(), + content_type='text/plain') + List Owned Containers ===================== @@ -51,8 +53,8 @@ This gets a list of containers that you own, and prints out the container name: .. code-block:: python - for container in conn.get_all_containers(): - print container.name + for container in conn.get_account()[1]: + print container['name'] The output will look something like this:: @@ -63,13 +65,13 @@ The output will look something like this:: List a Container's Content ========================== -This gets a list of objects in the container, and prints out each +This gets a list of objects in the container, and prints out each object's name, the file size, and last modified date: .. code-block:: python - for obj in container.get_objects(): - print "{0}\t{1}\t{2}".format(obj.name, obj.size, obj.last_modified) + for data in conn.get_container(container_name)[1]: + print '{0}\t{1}\t{2}'.format(data['name'], data['bytes'], data['last_modified']) The output will look something like this:: @@ -85,8 +87,9 @@ This downloads the object ``hello.txt`` and saves it in .. code-block:: python - obj = container.get_object('hello.txt') - obj.save_to_filename('./my_hello.txt') + obj_tuple = conn.get_object(container_name, 'hello.txt') + with open('my_hello.txt', 'w') as my_hello: + my_hello.write(obj_tuple[1]) Delete an Object @@ -96,8 +99,8 @@ This deletes the object ``goodbye.txt``: .. code-block:: python - container.delete_object('goodbye.txt') - + conn.delete_object(container_name, 'goodbye.txt') + Delete a Container ================== @@ -107,5 +110,5 @@ Delete a Container .. code-block:: python - conn.delete_container(container.name) + conn.delete_container(container_name) diff --git a/doc/radosgw/swift/tempurl.rst b/doc/radosgw/swift/tempurl.rst new file mode 100644 index 0000000000000..e1a517998c67e --- /dev/null +++ b/doc/radosgw/swift/tempurl.rst @@ -0,0 +1,85 @@ +==================== + Temp URL Operations +==================== + +To allow temporary access (e.g. for `GET` requests) to objects +without the need to share credentials, temp url functionality is +supported by the swift endpoint of radosgw. For this functionality, +initially the value of `X-Account-Meta-Temp-URL-Key` and optionally +`X-Account-Meta-Temp-URL-Key-2` should be set. The Temp URL +functionality relies on an HMAC-SHA1 signature against these secret +keys. + +POST Temp-URL Keys +================== + +A ``POST`` request to the swift account with the required key will set +the secret temp url key for the account, against which temporary url +access can be provided. Up to two keys are supported, and +signatures are checked against both the keys, if present, so that keys +can be rotated without invalidating the temporary urls. + +Syntax +~~~~~~ + +:: + + POST /{api version}/{account} HTTP/1.1 + Host: {fqdn} + X-Auth-Token: {auth-token} + +Request Headers +~~~~~~~~~~~~~~~ + +``X-Account-Meta-Temp-URL-Key`` + +:Description: A user-defined key that takes an arbitrary string value. +:Type: String +:Required: Yes + +``X-Account-Meta-Temp-URL-Key-2`` + +:Description: A user-defined key that takes an arbitrary string value.
+:Type: String +:Required: No + + +GET Temp-URL Objects +==================== + +A temporary URL uses a cryptographic HMAC-SHA1 signature, which includes +the following elements: + +#. The value of the Request method, "GET" for instance +#. The expiry time, in the format of seconds since the epoch, i.e. Unix time +#. The request path starting from "v1" onwards + +The above items are normalized with newlines appended between them, +and an HMAC is generated using the SHA-1 hashing algorithm against one +of the Temp URL Keys posted earlier. + +A sample python script to demonstrate the above is given below: + + +.. code-block:: python + + import hmac + from hashlib import sha1 + from time import time + + method = 'GET' + host = 'https://objectstore.example.com' + duration_in_seconds = 300 # Duration for which the url is valid + expires = int(time() + duration_in_seconds) + path = '/v1/your-bucket/your-object' + key = 'secret' + hmac_body = '%s\n%s\n%s' % (method, expires, path) + sig = hmac.new(key, hmac_body, sha1).hexdigest() + rest_uri = "{host}{path}?temp_url_sig={sig}&temp_url_expires={expires}".format( + host=host, path=path, sig=sig, expires=expires) + print rest_uri + + # Example Output + # https://objectstore.example.com/v1/your-bucket/your-object?temp_url_sig=ff4657876227fc6025f04fcf1e82818266d022c6&temp_url_expires=1423200992 + diff --git a/doc/rbd/qemu-rbd.rst b/doc/rbd/qemu-rbd.rst index 8ce980ffc2efd..cec04ac2fc47e 100644 --- a/doc/rbd/qemu-rbd.rst +++ b/doc/rbd/qemu-rbd.rst @@ -45,7 +45,7 @@ commands as the default ``client.admin`` user unless you expressly specify another Ceph configuration file path or another user. When specifying a user, QEMU uses the ``ID`` rather than the full ``TYPE:ID``. See `User Management - User`_ for details. Do not prepend the client type (i.e., ``client.``) to the -beginning of the user ID, or you will receive an authentication error. You +beginning of the user ``ID``, or you will receive an authentication error. You should have the key for the ``admin`` user or the key of another user you specify with the ``:id={user}`` option in a keyring file stored in default path (i.e., ``/etc/ceph`` or the local directory with appropriate file ownership and diff --git a/doc/rbd/rados-rbd-cmds.rst b/doc/rbd/rados-rbd-cmds.rst index 8212df0c24164..37a70a3d4ad6b 100644 --- a/doc/rbd/rados-rbd-cmds.rst +++ b/doc/rbd/rados-rbd-cmds.rst @@ -21,13 +21,18 @@ Before you can add a block device to a node, you must create an image for it in the :term:`Ceph Storage Cluster` first. To create a block device image, execute the following:: - rbd create {image-name} --size {megabytes} --pool {pool-name} + rbd create --size {megabytes} {pool-name}/{image-name} -For example, to create a 1GB image named ``foo`` that stores information in a +For example, to create a 1GB image named ``bar`` that stores information in a pool named ``swimmingpool``, execute the following:: - rbd create foo --size 1024 - rbd create bar --size 1024 --pool swimmingpool + rbd create --size 1024 swimmingpool/bar + +If you don't specify a pool when creating an image, it will be stored in the +default pool ``rbd``. For example, to create a 1GB image named ``foo`` stored in +the default pool ``rbd``, execute the following:: + + rbd create --size 1024 foo .. note:: You must create a pool first before you can specify it as a source. See `Storage Pools`_ for details.
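The same image can also be created programmatically with the ``rbd`` Python bindings. A minimal sketch, assuming a reachable cluster configured in ``/etc/ceph/ceph.conf`` and an existing pool named ``swimmingpool``:

.. code-block:: python

    import rados
    import rbd

    # Connect to the cluster and open an I/O context on the target pool.
    cluster = rados.Rados(conffile='/etc/ceph/ceph.conf')
    cluster.connect()
    ioctx = cluster.open_ioctx('swimmingpool')
    try:
        # Create a 1GB image named 'bar' (the size is given in bytes).
        rbd.RBD().create(ioctx, 'bar', 1024 * 1024 * 1024)
    finally:
        ioctx.close()
        cluster.shutdown()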
@@ -55,21 +60,21 @@ Retrieving Image Information To retrieve information from a particular image, execute the following, but replace ``{image-name}`` with the name for the image:: - rbd --image {image-name} info + rbd info {image-name} For example:: - rbd --image foo info + rbd info foo To retrieve information from an image within a pool, execute the following, but replace ``{image-name}`` with the name of the image and replace ``{pool-name}`` with the name of the pool:: - rbd --image {image-name} -p {pool-name} info + rbd info {pool-name}/{image-name} For example:: - rbd --image bar -p swimmingpool info + rbd info swimmingpool/bar Resizing a Block Device Image ============================= @@ -80,7 +85,7 @@ a maximum capacity that you set with the ``--size`` option. If you want to increase (or decrease) the maximum size of a Ceph Block Device image, execute the following:: - rbd resize --image foo --size 2048 + rbd resize --size 2048 foo Removing a Block Device Image @@ -99,11 +104,11 @@ To remove a block device from a pool, execute the following, but replace ``{image-name}`` with the name of the image to remove and replace ``{pool-name}`` with the name of the pool:: - rbd rm {image-name} -p {pool-name} + rbd rm {pool-name}/{image-name} For example:: - rbd rm bar -p swimmingpool + rbd rm swimmingpool/bar diff --git a/doc/rbd/rbd-ko.rst b/doc/rbd/rbd-ko.rst index 3b5a236b94e76..5514a963624b5 100644 --- a/doc/rbd/rbd-ko.rst +++ b/doc/rbd/rbd-ko.rst @@ -20,17 +20,17 @@ Use ``rbd`` to map an image name to a kernel module. You must specify the image name, the pool name, and the user name. ``rbd`` will load RBD kernel module on your behalf if it's not already loaded. :: - sudo rbd map {image-name} --pool {pool-name} --id {user-name} + sudo rbd map {pool-name}/{image-name} --id {user-name} For example:: - sudo rbd map --pool rbd myimage --id admin + sudo rbd map rbd/myimage --id admin If you use `cephx`_ authentication, you must also specify a secret. It may come from a keyring or a file containing the secret. :: - sudo rbd map --pool rbd myimage --id admin --keyring /path/to/keyring - sudo rbd map --pool rbd myimage --id admin --keyfile /path/to/file + sudo rbd map rbd/myimage --id admin --keyring /path/to/keyring + sudo rbd map rbd/myimage --id admin --keyfile /path/to/file Show Mapped Block Devices diff --git a/doc/rbd/rbd-openstack.rst b/doc/rbd/rbd-openstack.rst index 43eafa399863b..09f1ec368f223 100644 --- a/doc/rbd/rbd-openstack.rst +++ b/doc/rbd/rbd-openstack.rst @@ -108,8 +108,8 @@ Install Ceph client packages On the ``glance-api`` node, you'll need the Python bindings for ``librbd``:: - sudo apt-get install python-ceph - sudo yum install python-ceph + sudo apt-get install python-rbd + sudo yum install python-rbd On the ``nova-compute``, ``cinder-backup`` and on the ``cinder-volume`` node, use both the Python bindings and the client command line tools:: @@ -139,7 +139,11 @@ Add the keyrings for ``client.cinder``, ``client.glance``, and ssh {your-cinder-backup-server} sudo chown cinder:cinder /etc/ceph/ceph.client.cinder-backup.keyring Nodes running ``nova-compute`` need the keyring file for the ``nova-compute`` -process. They also need to store the secret key of the ``client.cinder`` user in +process:: + + ceph auth get-or-create client.cinder | ssh {your-nova-compute-server} sudo tee /etc/ceph/ceph.client.cinder.keyring + +They also need to store the secret key of the ``client.cinder`` user in ``libvirt``. 
The libvirt process needs it to access the cluster while attaching a block device from Cinder. @@ -200,6 +204,10 @@ Juno Edit ``/etc/glance/glance-api.conf`` and add under the ``[glance_store]`` section:: + [DEFAULT] + ... + default_store = rbd + ... [glance_store] stores = rbd rbd_store_pool = images @@ -210,6 +218,8 @@ Edit ``/etc/glance/glance-api.conf`` and add under the ``[glance_store]`` sectio For more information about the configuration options available in Glance please see: http://docs.openstack.org/trunk/config-reference/content/section_glance-api.conf.html. +.. important:: Glance has not completely moved to 'store' yet. + So we still need to configure the store in the DEFAULT section. Any OpenStack version ~~~~~~~~~~~~~~~~~~~~~ @@ -227,6 +237,16 @@ assuming your configuration file has ``flavor = keystone+cachemanagement``:: [paste_deploy] flavor = keystone +Image properties +~~~~~~~~~~~~~~~~ + +We recommend using the following properties for your images: + +- ``hw_scsi_model=virtio-scsi``: add the virtio-scsi controller and get better performance and support for the discard operation +- ``hw_disk_bus=scsi``: connect every cinder block device to that controller +- ``hw_qemu_guest_agent=yes``: enable the QEMU guest agent +- ``os_require_quiesce=yes``: send fs-freeze/thaw calls through the QEMU guest agent + Configuring Cinder ------------------ @@ -292,7 +312,7 @@ configure the ephemeral backend for Nova. It is recommended to enable the RBD cache in your Ceph configuration file (enabled by default since Giant). Moreover, enabling the admin socket -brings a lot of benefits while troubleshoothing. Having one socket +brings a lot of benefits while troubleshooting. Having one socket per virtual machine using a Ceph block device will help in investigating performance and/or wrong behaviors. This socket can be accessed like this:: @@ -304,7 +324,17 @@ Now on every compute node edit your Ceph configuration file:: [client] rbd cache = true rbd cache writethrough until flush = true - admin socket = /var/run/ceph/$cluster-$type.$id.$pid.$cctid.asok + admin socket = /var/run/ceph/guests/$cluster-$type.$id.$pid.$cctid.asok + log file = /var/log/qemu/qemu-guest-$pid.log + rbd concurrent management ops = 20 + +Configure the permissions of these paths:: + + mkdir -p /var/run/ceph/guests/ /var/log/qemu/ + chown qemu:libvirtd /var/run/ceph/guests /var/log/qemu/ + +Note that user ``qemu`` and group ``libvirtd`` can vary depending on your system. +The provided example works for RedHat-based systems. ..
tip:: If your virtual machine is already running you can simply restart it to get the socket @@ -323,6 +353,7 @@ On every Compute node, edit ``/etc/nova/nova.conf`` and add:: libvirt_images_type = rbd libvirt_images_rbd_pool = vms libvirt_images_rbd_ceph_conf = /etc/ceph/ceph.conf + libvirt_disk_cachemodes="network=writeback" rbd_user = cinder rbd_secret_uuid = 457eb676-33da-42ec-9a8c-9293d545c337 @@ -340,8 +371,7 @@ On every Compute node, edit ``/etc/nova/nova.conf`` and add:: To ensure a proper live-migration, use the following flags:: - libvirt_live_migration_flag="VIR_MIGRATE_UNDEFINE_SOURCE,VIR_MIGRATE_PEER2PEER,VIR_MIGRATE_LIVE,VIR_MIGRATE_PERSIST_DEST" - + libvirt_live_migration_flag="VIR_MIGRATE_UNDEFINE_SOURCE,VIR_MIGRATE_PEER2PEER,VIR_MIGRATE_LIVE,VIR_MIGRATE_PERSIST_DEST,VIR_MIGRATE_TUNNELLED" Juno ~~~~ @@ -356,6 +386,7 @@ section and add:: images_rbd_ceph_conf = /etc/ceph/ceph.conf rbd_user = cinder rbd_secret_uuid = 457eb676-33da-42ec-9a8c-9293d545c337 + disk_cachemodes="network=writeback" It is also a good practice to disable file injection. While booting an @@ -371,9 +402,19 @@ under the ``[libvirt]`` section:: inject_key = false inject_partition = -2 -To ensure a proper live-migration, use the following flags:: +To ensure a proper live-migration, use the following flags (under the ``[libvirt]`` section):: + + live_migration_flag="VIR_MIGRATE_UNDEFINE_SOURCE,VIR_MIGRATE_PEER2PEER,VIR_MIGRATE_LIVE,VIR_MIGRATE_PERSIST_DEST,VIR_MIGRATE_TUNNELLED" - live_migration_flag="VIR_MIGRATE_UNDEFINE_SOURCE,VIR_MIGRATE_PEER2PEER,VIR_MIGRATE_LIVE,VIR_MIGRATE_PERSIST_DEST" +Kilo +~~~~ + +Enable discard support for virtual machine ephemeral root disk:: + + [libvirt] + ... + ... + hw_disk_discard = unmap # enable discard support (be careful of performance) Restart OpenStack @@ -418,7 +459,7 @@ dashboard, you can boot from that volume by performing the following steps: #. Launch a new instance. #. Choose the image associated to the copy-on-write clone. -#. Select 'boot from volume' +#. Select 'boot from volume'. #. Select the volume you created. .. _qemu-img: ../qemu-rbd/#running-qemu-with-rbd diff --git a/doc/rbd/rbd-snapshot.rst b/doc/rbd/rbd-snapshot.rst index 55517921009c7..96169efa51969 100644 --- a/doc/rbd/rbd-snapshot.rst +++ b/doc/rbd/rbd-snapshot.rst @@ -58,12 +58,10 @@ Create Snapshot To create a snapshot with ``rbd``, specify the ``snap create`` option, the pool name and the image name. :: - rbd --pool {pool-name} snap create --snap {snap-name} {image-name} rbd snap create {pool-name}/{image-name}@{snap-name} For example:: - rbd --pool rbd snap create --snap snapname foo rbd snap create rbd/foo@snapname @@ -72,12 +70,10 @@ List Snapshots To list snapshots of an image, specify the pool name and the image name. :: - rbd --pool {pool-name} snap ls {image-name} rbd snap ls {pool-name}/{image-name} For example:: - rbd --pool rbd snap ls foo rbd snap ls rbd/foo @@ -87,12 +83,10 @@ Rollback Snapshot To rollback to a snapshot with ``rbd``, specify the ``snap rollback`` option, the pool name, the image name and the snap name. :: - rbd --pool {pool-name} snap rollback --snap {snap-name} {image-name} rbd snap rollback {pool-name}/{image-name}@{snap-name} For example:: - rbd --pool rbd snap rollback --snap snapname foo rbd snap rollback rbd/foo@snapname @@ -108,14 +102,12 @@ Delete a Snapshot ----------------- To delete a snapshot with ``rbd``, specify the ``snap rm`` option, the pool -name, the image name and the username. :: +name, the image name and the snap name. 
:: - rbd --pool {pool-name} snap rm --snap {snap-name} {image-name} rbd snap rm {pool-name}/{image-name}@{snap-name} For example:: - rbd --pool rbd snap rm --snap snapname foo rbd snap rm rbd/foo@snapname @@ -128,12 +120,10 @@ Purge Snapshots To delete all snapshots for an image with ``rbd``, specify the ``snap purge`` option and the image name. :: - rbd --pool {pool-name} snap purge {image-name} rbd snap purge {pool-name}/{image-name} For example:: - rbd --pool rbd snap purge foo rbd snap purge rbd/foo @@ -174,10 +164,9 @@ no special restrictions with cloned images. However, the copy-on-write clone of a snapshot refers to the snapshot, so you **MUST** protect the snapshot before you clone it. The following diagram depicts the process. -.. note:: Ceph only supports cloning for ``format 2`` images (i.e., created with - ``rbd create --image-format 2``), and is not yet supported by the kernel ``rbd`` module. - So you MUST use QEMU/KVM or ``librbd`` directly to access clones in the current - release. +.. note:: Ceph only supports cloning for format 2 images (i.e., created with + ``rbd create --image-format 2``). The kernel client supports cloned images + since kernel 3.10. Getting Started with Layering ----------------------------- @@ -236,12 +225,10 @@ Clones access the parent snapshots. All clones would break if a user inadvertently deleted the parent snapshot. To prevent data loss, you **MUST** protect the snapshot before you can clone it. :: - rbd --pool {pool-name} snap protect --image {image-name} --snap {snapshot-name} rbd snap protect {pool-name}/{image-name}@{snapshot-name} For example:: - rbd --pool rbd snap protect --image my-image --snap my-snapshot rbd snap protect rbd/my-image@my-snapshot .. note:: You cannot delete a protected snapshot. @@ -253,7 +240,6 @@ To clone a snapshot, you need to specify the parent pool, image and snapshot, and the child pool and image name. You must protect the snapshot before you can clone it. :: - rbd --pool {pool-name} --image {parent-image} --snap {snap-name} --dest-pool {pool-name} --dest {child-image} rbd clone {pool-name}/{parent-image}@{snap-name} {pool-name}/{child-image-name} For example:: @@ -271,12 +257,10 @@ Before you can delete a snapshot, you must unprotect it first. Additionally, you may *NOT* delete snapshots that have references from clones. You must flatten each clone of a snapshot, before you can delete the snapshot. :: - rbd --pool {pool-name} snap unprotect --image {image-name} --snap {snapshot-name} rbd snap unprotect {pool-name}/{image-name}@{snapshot-name} For example:: - rbd --pool rbd snap unprotect --image my-image --snap my-snapshot rbd snap unprotect rbd/my-image@my-snapshot @@ -285,12 +269,10 @@ Listing Children of a Snapshot To list the children of a snapshot, execute the following:: - rbd --pool {pool-name} children --image {image-name} --snap {snap-name} rbd children {pool-name}/{image-name}@{snapshot-name} For example:: - rbd --pool rbd children --image my-image --snap my-snapshot rbd children rbd/my-image@my-snapshot @@ -303,12 +285,10 @@ the image by copying the information from the snapshot to the clone. The time it takes to flatten a clone increases with the size of the snapshot. To delete a snapshot, you must flatten the child images first. :: - rbd --pool {pool-name} flatten --image {image-name} rbd flatten {pool-name}/{image-name} For example:: - rbd --pool rbd flatten --image my-image rbd flatten rbd/my-image ..
note:: Since a flattened image contains all the information from the snapshot, diff --git a/doc/release-notes.rst b/doc/release-notes.rst index 24f73e88bbc33..b994a2b1e1e3d 100644 --- a/doc/release-notes.rst +++ b/doc/release-notes.rst @@ -2,6 +2,2009 @@ Release Notes =============== +v9.0.2 +====== + +This development release features more of the OSD work queue +unification, randomized osd scrub times, a huge pile of librbd fixes, +more MDS repair and snapshot fixes, and a significant amount of work +on the tests and build infrastructure. + +Notable Changes +--------------- + +* buffer: some cleanup (Michal Jarzabek) +* build: cmake: fix nss linking (Danny Al-Gaaf) +* build: cmake: misc fixes (Orit Wasserman, Casey Bodley) +* build: install-deps: misc fixes (Loic Dachary) +* build: make_dist_tarball.sh (Sage Weil) +* ceph-detect-init: added Linux Mint (Michal Jarzabek) +* ceph-detect-init: robust init system detection (Owen Synge) +* ceph-disk: ensure 'zap' only operates on a full disk (#11272 Loic Dachary) +* ceph-disk: misc fixes to respect init system (Loic Dachary, Owen Synge) +* ceph-disk: support NVMe device partitions (#11612 Ilja Slepnev) +* ceph: fix 'df' units (Zhe Zhang) +* ceph: fix parsing in interactive cli mode (#11279 Kefu Chai) +* ceph-objectstore-tool: many many changes (David Zafman) +* ceph-post-file: misc fixes (Joey McDonald, Sage Weil) +* client: avoid sending unnecessary FLUSHSNAP messages (Yan, Zheng) +* client: exclude setfilelock when calculating oldest tid (Yan, Zheng) +* client: fix error handling in check_pool_perm (John Spray) +* client: fsync waits only for inode's caps to flush (Yan, Zheng) +* client: invalidate kernel dcache when cache size exceeds limits (Yan, Zheng) +* client: make fsync wait for unsafe dir operations (Yan, Zheng) +* client: pin lookup dentry to avoid inode being freed (Yan, Zheng) +* common: detect overflow of int config values (#11484 Kefu Chai) +* common: fix json parsing of utf8 (#7387 Tim Serong) +* common: fix leak of pthread_mutexattr (#11762 Ketor Meng) +* crush: respect default replicated ruleset config on map creation (Ilya Dryomov) +* deb, rpm: move ceph-objectstore-tool to ceph (Ken Dreyer) +* doc: man page updates (Kefu Chai) +* doc: misc updates (#11396 Nilamdyuti, Francois Lafont, Ken Dreyer, Kefu Chai) +* init-radosgw: merge with sysv version; fix enumeration (Sage Weil) +* librados: add config observer (Alistair Strachan) +* librbd: add const for single-client-only features (Josh Durgin) +* librbd: add deep-flatten operation (Jason Dillaman) +* librbd: avoid blocking aio API methods (#11056 Jason Dillaman) +* librbd: fix fast diff bugs (#11553 Jason Dillaman) +* librbd: fix image format detection (Zhiqiang Wang) +* librbd: fix lock ordering issue (#11577 Jason Dillaman) +* librbd: flatten/copyup fixes (Jason Dillaman) +* librbd: lockdep, helgrind validation (Jason Dillaman, Josh Durgin) +* librbd: only update image flags while holding exclusive lock (#11791 Jason Dillaman) +* librbd: return result code from close (#12069 Jason Dillaman) +* librbd: tolerate old osds when getting image metadata (#11549 Jason Dillaman) +* mds: do not add snapped items to bloom filter (Yan, Zheng) +* mds: fix handling for missing mydir dirfrag (#11641 John Spray) +* mds: fix rejoin (Yan, Zheng) +* mds: fix stray reintegration (Yan, Zheng) +* mds: fix suicide beacon (John Spray) +* mds: misc repair improvements (John Spray) +* mds: misc snapshot fixes (Yan, Zheng) +* mds: respawn instead of suicide on blacklist (John Spray) +* misc coverity fixes
(Danny Al-Gaaf) +* mon: add 'mon_metadata ' command (Kefu Chai) +* mon: add 'node ls ...' command (Kefu Chai) +* mon: disallow ec pools as tiers (#11650 Samuel Just) +* mon: fix mds beacon replies (#11590 Kefu Chai) +* mon: fix 'pg ls' sort order, state names (#11569 Kefu Chai) +* mon: normalize erasure-code profile for storage and comparison (Loic Dachary) +* mon: optionally specify osd id on 'osd create' (Mykola Golub) +* mon: 'osd tree' fixes (Kefu Chai) +* mon: prevent pool with snapshot state from being used as a tier (#11493 Sage Weil) +* mon: refine check_remove_tier checks (#11504 John Spray) +* mon: remove spurious who arg from 'mds rm ...' (John Spray) +* msgr: async: misc fixes (Haomai Wang) +* msgr: xio: fix ip and nonce (Raju Kurunkad) +* msgr: xio: improve lane assignment (Vu Pham) +* msgr: xio: misc fixes (Vu Pham, Casey Bodley) +* osd: avoid transaction append in some cases (Sage Weil) +* osdc/Objecter: allow per-pool calls to op_cancel_writes (John Spray) +* osd: eliminate txn append, ECSubWrite copy (Samuel Just) +* osd: filejournal: cleanup (David Zafman) +* osd: fix check_for_full (Henry Chang) +* osd: fix dirty accounting in make_writeable (Zhiqiang Wang) +* osd: fix osdmap dump of blacklist items (John Spray) +* osd: fix snap flushing from cache tier (again) (#11787 Samuel Just) +* osd: fix snap handling on promotion (#11296 Sam Just) +* osd: handle log split with overlapping entries (#11358 Samuel Just) +* osd: keyvaluestore: misc fixes (Varada Kari) +* osd: make suicide timeouts individually configurable (Samuel Just) +* osd: move scrub in OpWQ (Samuel Just) +* osd: pool size change triggers new interval (#11771 Samuel Just) +* osd: randomize scrub times (#10973 Kefu Chai) +* osd: refactor scrub and digest recording (Sage Weil) +* osd: refuse first write to EC object at non-zero offset (Jianpeng Ma) +* osd: stripe over small xattrs to fit in XFS's 255 byte inline limit (Sage Weil, Ning Yao) +* osd: sync object_map on syncfs (Samuel Just) +* osd: take excl lock if op is rw (Samuel Just) +* osd: WBThrottle cleanups (Jianpeng Ma) +* pycephfs: many fixes for bindings (Haomai Wang) +* rados: bench: add --no-verify option to improve performance (Piotr Dalek) +* rados: misc bench fixes (Dmitry Yatsushkevich) +* rbd: add disk usage tool (#7746 Jason Dillaman) +* rgw: always check if token is expired (#11367 Anton Aksola, Riku Lehto) +* rgw: conversion tool to repair broken multipart objects (#12079 Yehuda Sadeh) +* rgw: do not enclose bucket header in quotes (#11860 Wido den Hollander) +* rgw: error out if frontend did not send all data (#11851 Yehuda Sadeh) +* rgw: fix assignment of copy obj attributes (#11563 Yehuda Sadeh) +* rgw: fix reset_loc (#11974 Yehuda Sadeh) +* rgw: improve content-length env var handling (#11419 Robin H. Johnson) +* rgw: only scan for objects not in a namespace (#11984 Yehuda Sadeh) +* rgw: remove trailing :port from HTTP_HOST header (Sage Weil) +* rgw: shard work over multiple librados instances (Pavan Rallabhandi) +* rgw: swift: enforce Content-Type in response (#12157 Radoslaw Zarzynski) +* rgw: use attrs from source bucket on copy (#11639 Javier M.
Mellid) +* rocksdb: pass options as single string (Xiaoxi Chen) +* rpm: many spec file fixes (Owen Synge, Ken Dreyer) +* tests: fixes for rbd xstests (Douglas Fuller) +* tests: fix tiering health checks (Loic Dachary) +* tests for low-level performance (Haomai Wang) +* tests: many ec non-regression improvements (Loic Dachary) +* tests: many many ec test improvements (Loic Dachary) +* upstart: throttle restarts (#11798 Sage Weil, Greg Farnum) + + +v9.0.1 +====== + +This development release is delayed a bit due to tooling changes in the build +environment. As a result the next one (v9.0.2) will have a bit more work than +is usual. + +Highlights here include lots of RGW Swift fixes, RBD feature work +surrounding the new object map feature, more CephFS snapshot fixes, +and a few important CRUSH fixes. + +Notable Changes +--------------- + +* auth: cache/reuse crypto lib key objects, optimize msg signature check (Sage Weil) +* build: allow tcmalloc-minimal (Thorsten Behrens) +* build: do not build ceph-dencoder with tcmalloc (#10691 Boris Ranto) +* build: fix pg ref disabling (William A. Kennington III) +* build: install-deps.sh improvements (Loic Dachary) +* build: misc fixes (Boris Ranto, Ken Dreyer, Owen Synge) +* ceph-authtool: fix return code on error (Gerhard Muntingh) +* ceph-disk: fix zap sgdisk invocation (Owen Synge, Thorsten Behrens) +* ceph-disk: pass --cluster arg on prepare subcommand (Kefu Chai) +* ceph-fuse, libcephfs: drop inode when rmdir finishes (#11339 Yan, Zheng) +* ceph-fuse,libcephfs: fix uninline (#11356 Yan, Zheng) +* ceph-monstore-tool: fix store-copy (Huangjun) +* common: add perf counter descriptions (Alyona Kiseleva) +* common: fix throttle max change (Henry Chang) +* crush: fix crash from invalid 'take' argument (#11602 Shiva Rkreddy, Sage Weil) +* crush: fix divide-by-2 in straw2 (#11357 Yann Dupont, Sage Weil) +* deb: fix rest-bench-dbg and ceph-test-dbg dependencies (Ken Dreyer) +* doc: document region hostnames (Robin H.
Johnson) +* doc: update release schedule docs (Loic Dachary) +* init-radosgw: run radosgw as root (#11453 Ken Dreyer) +* librados: fadvise flags per op (Jianpeng Ma) +* librbd: allow additional metadata to be stored with the image (Haomai Wang) +* librbd: better handling for dup flatten requests (#11370 Jason Dillaman) +* librbd: cancel in-flight ops on watch error (#11363 Jason Dillaman) +* librbd: default new images to format 2 (#11348 Jason Dillaman) +* librbd: fast diff implementation that leverages object map (Jason Dillaman) +* librbd: fix snapshot creation when other snap is active (#11475 Jason Dillaman) +* librbd: new diff_iterate2 API (Jason Dillaman) +* librbd: object map rebuild support (Jason Dillaman) +* logrotate.d: prefer service over invoke-rc.d (#11330 Win Hierman, Sage Weil) +* mds: avoid getting stuck in XLOCKDONE (#11254 Yan, Zheng) +* mds: fix integer truncation on large client ids (Henry Chang) +* mds: many snapshot and stray fixes (Yan, Zheng) +* mds: persist completed_requests reliably (#11048 John Spray) +* mds: separate safe_pos in Journaler (#10368 John Spray) +* mds: snapshot rename support (#3645 Yan, Zheng) +* mds: warn when clients fail to advance oldest_client_tid (#10657 Yan, Zheng) +* misc cleanups and fixes (Danny Al-Gaaf) +* mon: fix average utilization calc for 'osd df' (Mykola Golub) +* mon: fix variance calc in 'osd df' (Sage Weil) +* mon: improve callout to crushtool (Mykola Golub) +* mon: prevent bucket deletion when referenced by a crush rule (#11602 Sage Weil) +* mon: prime pg_temp when CRUSH map changes (Sage Weil) +* monclient: flush_log (John Spray) +* msgr: async: many many fixes (Haomai Wang) +* msgr: simple: fix clear_pipe (#11381 Haomai Wang) +* osd: add latency perf counters for tier operations (Xinze Chi) +* osd: avoid multiple hit set insertions (Zhiqiang Wang) +* osd: break PG removal into multiple iterations (#10198 Guang Yang) +* osd: check scrub state when handling map (Jianpeng Ma) +* osd: fix endless repair when object is unrecoverable (Jianpeng Ma, Kefu Chai) +* osd: fix pg resurrection (#11429 Samuel Just) +* osd: ignore non-existent osds in unfound calc (#10976 Mykola Golub) +* osd: increase default max open files (Owen Synge) +* osd: prepopulate needs_recovery_map when only one peer has missing (#9558 Guang Yang) +* osd: relax reply order on proxy read (#11211 Zhiqiang Wang) +* osd: skip promotion for flush/evict op (Zhiqiang Wang) +* osd: write journal header on clean shutdown (Xinze Chi) +* qa: run-make-check.sh script (Loic Dachary) +* rados bench: misc fixes (Dmitry Yatsushkevich) +* rados: fix error message on failed pool removal (Wido den Hollander) +* radosgw-admin: add 'bucket check' function to repair bucket index (Yehuda Sadeh) +* rbd: allow unmapping by spec (Ilya Dryomov) +* rbd: deprecate --new-format option (Jason Dillaman) +* rgw: do not set content-type if length is 0 (#11091 Orit Wasserman) +* rgw: don't use end_marker for namespaced object listing (#11437 Yehuda Sadeh) +* rgw: fail if parts not specified on multipart upload (#11435 Yehuda Sadeh) +* rgw: fix GET on swift account when limit == 0 (#10683 Radoslaw Zarzynski) +* rgw: fix broken stats in container listing (#11285 Radoslaw Zarzynski) +* rgw: fix bug in domain/subdomain splitting (Robin H.
Johnson) +* rgw: fix civetweb max threads (#10243 Yehuda Sadeh) +* rgw: fix copy metadata, support X-Copied-From for swift (#10663 Radoslaw Zarzynski) +* rgw: fix locator for objects starting with _ (#11442 Yehuda Sadeh) +* rgw: fix multipart upload in retry path (#11604 Yehuda Sadeh) +* rgw: fix quota enforcement on POST (#11323 Sergey Arkhipov) +* rgw: fix return code on missing upload (#11436 Yehuda Sadeh) +* rgw: force content type header on responses with no body (#11438 Orit Wasserman) +* rgw: generate new object tag when setting attrs (#11256 Yehuda Sadeh) +* rgw: issue aio for first chunk before flush cached data (#11322 Guang Yang) +* rgw: make read user buckets backward compat (#10683 Radoslaw Zarzynski) +* rgw: merge manifests properly with prefix override (#11622 Yehuda Sadeh) +* rgw: return 412 on bad limit when listing buckets (#11613 Yehuda Sadeh) +* rgw: send ETag, Last-Modified for swift (#11087 Radoslaw Zarzynski) +* rgw: set content length on container GET, PUT, DELETE, HEAD (#10971, #11036 Radoslaw Zarzynski) +* rgw: support end marker on swift container GET (#10682 Radoslaw Zarzynski) +* rgw: swift: fix account listing (#11501 Radoslaw Zarzynski) +* rgw: swift: set content-length on keystone tokens (#11473 Hervé Rousseau) +* rgw: use correct oid for gc chains (#11447 Yehuda Sadeh) +* rgw: use unique request id for civetweb (#10295 Orit Wasserman) +* rocksdb, leveldb: fix compact_on_mount (Xiaoxi Chen) +* rocksdb: add perf counters for get/put latency (Xinxin Shu) +* rpm: add suse firewall files (Tim Serong) +* rpm: misc systemd and suse fixes (Owen Synge, Nathan Cutler) + + + +v9.0.0 +====== + +This is the first development release for the Infernalis cycle, and +the first Ceph release to sport a version number from the new +numbering scheme. The "9" indicates this is the 9th release cycle--I +(for Infernalis) is the 9th letter. The first "0" indicates this is a +development release ("1" will mean release candidate and "2" will mean +stable release), and the final "0" indicates this is the first such +development release.
+ +A few highlights include: + +* a new 'ceph daemonperf' command to watch perfcounter stats in realtime +* reduced MDS memory usage +* many MDS snapshot fixes +* librbd can now store options in the image itself +* many fixes for RGW Swift API support +* OSD performance improvements +* many doc updates and misc bug fixes + +Notable Changes +--------------- + +* aarch64: add optimized version of crc32c (Yazen Ghannam, Steve Capper) +* auth: reinit NSS after fork() (#11128 Yan, Zheng) +* build: disable LTTNG by default (#11333 Josh Durgin) +* build: fix ppc build (James Page) +* build: install-deps: support OpenSUSE (Loic Dachary) +* build: misc cmake fixes (Matt Benjamin) +* ceph-disk: follow ceph-osd hints when creating journal (#9580 Sage Weil) +* ceph-disk: handle re-using existing partition (#10987 Loic Dachary) +* ceph-disk: improve parted output parsing (#10983 Loic Dachary) +* ceph-disk: make suppression work for activate-all and activate-journal (Dan van der Ster) +* ceph-disk: misc fixes (Alfredo Deza) +* ceph-fuse, libcephfs: don't clear COMPLETE when trimming null (Yan, Zheng) +* ceph-fuse, libcephfs: hold exclusive caps on dirs we "own" (#11226 Greg Farnum) +* ceph-fuse: do not require successful remount when unmounting (#10982 Greg Farnum) +* ceph: new 'ceph daemonperf' command (John Spray, Mykola Golub) +* common: PriorityQueue tests (Kefu Chai) +* common: add descriptions to perfcounters (Kiseleva Alyona) +* common: fix LTTNG vs fork issue (Josh Durgin) +* crush: fix has_v4_buckets (#11364 Sage Weil) +* crushtool: fix order of operations, usage (Sage Weil) +* debian: minor package reorg (Ken Dreyer) +* doc: document object corpus generation (#11099 Alexis Normand) +* doc: fix gender neutrality (Alexandre Maragone) +* doc: fix install doc (#10957 Kefu Chai) +* doc: fix sphinx issues (Kefu Chai) +* doc: mds data structure docs (Yan, Zheng) +* doc: misc updates (Nilamdyuti Goswami, Vartika Rai, Florian Haas, Loic Dachary, Simon Guinot, Andy Allan, Alistair Israel, Ken Dreyer, Robin Rehu, Lee Revell, Florian Marsylle, Thomas Johnson, Bosse Klykken, Travis Rhoden, Ian Kelling) +* doc: swift tempurls (#10184 Abhishek Lekshmanan) +* doc: switch doxygen integration back to breathe (#6115 Kefu Chai) +* erasure-code: update ISA-L to 2.13 (Yuan Zhou) +* gmock: switch to submodule (Danny Al-Gaaf, Loic Dachary) +* hadoop: add terasort test (Noah Watkins) +* java: fix libcephfs bindings (Noah Watkins) +* libcephfs,ceph-fuse: fix request resend on cap reconnect (#10912 Yan, Zheng) +* librados: define C++ flags from C constants (Josh Durgin) +* librados: fix last_force_resent handling (#11026 Jianpeng Ma) +* librados: fix memory leak from C_TwoContexts (Xiong Yiliang) +* librados: fix striper when stripe_count = 1 and stripe_unit != object_size (#11120 Yan, Zheng) +* librados: op perf counters (John Spray) +* librados: pybind: fix write() method return code (Javier Guerra) +* libradosstriper: fix leak (Danny Al-Gaaf) +* librbd: add purge_on_error cache behavior (Jianpeng Ma) +* librbd: misc aio fixes (#5488 Jason Dillaman) +* librbd: misc rbd fixes (#11478 #11113 #11342 #11380 Jason Dillaman, Zhiqiang Wang) +* librbd: readahead fixes (Zhiqiang Wang) +* librbd: store metadata, including config options, in image (Haomai Wang) +* mds: add 'damaged' state to MDSMap (John Spray) +* mds: add nicknames for perfcounters (John Spray) +* mds: disable problematic rstat propagation into snap parents (Yan, Zheng) +* mds: fix mydir replica issue with shutdown (#10743 John Spray) +* mds: fix out-of-order
messages (#11258 Yan, Zheng) +* mds: fix shutdown with strays (#10744 John Spray) +* mds: misc snapshot fixes (Yan, Zheng) +* mds: fix stray handling (John Spray) +* mds: flush immediately in do_open_truncate (#11011 John Spray) +* mds: improve dump methods (John Spray) +* mds: misc journal cleanups and fixes (#10368 John Spray) +* mds: new SessionMap storage using omap (#10649 John Spray) +* mds: reduce memory consumption (Yan, Zheng) +* mds: throttle purge stray operations (#10390 John Spray) +* mds: tolerate clock jumping backwards (#11053 Yan, Zheng) +* misc coverity fixes (Danny Al-Gaaf) +* mon: do not deactivate last mds (#10862 John Spray) +* mon: make osd get pool 'all' only return applicable fields (#10891 Michal Jarzabek) +* mon: warn on bogus cache tier config (Jianpeng Ma) +* msg/async: misc bug fixes and updates (Haomai Wang) +* msg/simple: fix connect_seq assert (Haomai Wang) +* msg/xio: misc fixes (#10735 Matt Benjamin, Kefu Chai, Danny Al-Gaaf, Raju Kurunkad, Vu Pham) +* msg: unit tests (Haomai Wang) +* objectcacher: misc bug fixes (Jianpeng Ma) +* os/filestore: enlarge getxattr buffer size (Jianpeng Ma) +* osd: EIO injection (David Zhang) +* osd: add misc perfcounters (Xinze Chi) +* osd: add simple sleep injection in recovery (Sage Weil) +* osd: allow SEEK_HOLE/SEEK_DATA for sparse read (Zhiqiang Wang) +* osd: avoid dup omap sets for in pg metadata (Sage Weil) +* osd: clean up some constness, privateness (Kefu Chai) +* osd: erasure-code: drop entries according to LRU (Andreas-Joachim Peters) +* osd: fix negative degraded stats during backfill (Guang Yang) +* osd: misc fixes (Ning Yao, Kefu Chai, Xinze Chi, Zhiqiang Wang, Jianpeng Ma) +* pybind: pep8 cleanups (Danny Al-Gaaf) +* qa: fix filelock_interrupt.py test (Yan, Zheng) +* qa: improve ceph-disk tests (Loic Dachary) +* qa: improve docker build layers (Loic Dachary) +* rados: translate errno to string in CLI (#10877 Kefu Chai) +* rbd: accept map options config option (Ilya Dryomov) +* rbd: cli: fix arg parsing with --io-pattern (Dmitry Yatsushkevich) +* rbd: fix error messages (#2862 Rajesh Nambiar) +* rbd: update rbd man page (Ilya Dryomov) +* rbd: update xfstests tests (Douglas Fuller) +* rgw: add X-Timestamp for Swift containers (#10938 Radoslaw Zarzynski) +* rgw: add missing headers to Swift container details (#10666 Ahmad Faheem, Dmytro Iurchenko) +* rgw: add stats to headers for account GET (#10684 Yuan Zhou) +* rgw: do not prefetch data for HEAD requests (Guang Yang) +* rgw: don't clobber bucket/object owner when setting ACLs (#10978 Yehuda Sadeh) +* rgw: don't use rgw_socket_path if frontend is configured (#11160 Yehuda Sadeh) +* rgw: enforce Content-Length for POST on Swift cont/obj (#10661 Radoslaw Zarzynski) +* rgw: fix handling empty metadata items on Swift container (#11088 Radoslaw Zarzynski) +* rgw: fix log rotation (Wuxingyi) +* rgw: generate Date header for civetweb (#10873 Radoslaw Zarzynski) +* rgw: make init script wait for radosgw to stop (#11140 Dmitry Yatsushkevich) +* rgw: make quota/gc threads configurable (#11047 Guang Yang) +* rgw: pass in civetweb configurables (#10907 Yehuda Sadeh) +* rgw: rectify 202 Accepted in PUT response (#11148 Radoslaw Zarzynski) +* rgw: remove meta file after deleting bucket (#11149 Orit Wasserman) +* rgw: swift: allow setting attributes with COPY (#10662 Ahmad Faheem, Dmytro Iurchenko) +* rgw: swift: fix metadata handling on copy (#10645 Radoslaw Zarzynski) +* rgw: swift: send Last-Modified header (#10650 Radoslaw Zarzynski) +* rgw: update keystone cache with token info
(#11125 Yehuda Sadeh) +* rgw: update to latest civetweb, enable config for IPv6 (#10965 Yehuda Sadeh) +* rocksdb: update to latest (Xiaoxi Chen) +* rpm: loosen ceph-test dependencies (Ken Dreyer) + +v0.94.3 hammer (draft) +====================== + +* The commands of "pg ls-by-{pool,primary,osd}" and "pg ls" now take "recovering" +instead of "recovery", to include the recovering pgs in the listed pgs. + +Notable Changes +--------------- +* librbd: aio calls may block (`issue#11770 `_, `pr#4875 `_, Jason Dillaman) +* osd: make the all osd/filestore thread pool suicide timeouts separately configurable (`issue#11701 `_, `pr#5159 `_, Samuel Just) +* mon: ceph fails to compile with boost 1.58 (`issue#11982 `_, `pr#5122 `_, Kefu Chai) +* tests: TEST_crush_reject_empty must not run a mon (`issue#12285,11975 `_, `pr#5208 `_, Kefu Chai) +* osd: FAILED assert(!old_value.deleted()) in upgrade:giant-x-hammer-distro-basic-multi run (`issue#11983 `_, `pr#5121 `_, Samuel Just) +* build/ops: linking ceph to tcmalloc causes segfault on SUSE SLE11-SP3 (`issue#12368 `_, `pr#5265 `_, Thorsten Behrens) +* common: utf8 and old gcc breakage on RHEL6.5 (`issue#7387 `_, `pr#4687 `_, Kefu Chai) +* crush: take crashes due to invalid arg (`issue#11740 `_, `pr#4891 `_, Sage Weil) +* rgw: need conversion tool to handle fixes following #11974 (`issue#12502 `_, `pr#5384 `_, Yehuda Sadeh) +* rgw: Swift API: support for 202 Accepted response code on container creation (`issue#12299 `_, `pr#5214 `_, Radoslaw Zarzynski) +* common: Log::reopen_log_file: take m_flush_mutex (`issue#12520 `_, `pr#5405 `_, Samuel Just) +* rgw: Properly respond to the Connection header with Civetweb (`issue#12398 `_, `pr#5284 `_, Wido den Hollander) +* rgw: multipart list part response returns incorrect field (`issue#12399 `_, `pr#5285 `_, Henry Chang) +* build/ops: ceph.spec.in: 95-ceph-osd.rules, mount.ceph, and mount.fuse.ceph not installed properly on SUSE (`issue#12397 `_, `pr#5283 `_, Nathan Cutler) +* rgw: radosgw-admin dumps user info twice (`issue#12400 `_, `pr#5286 `_, guce) +* doc: fix doc build (`issue#12180 `_, `pr#5095 `_, Kefu Chai) +* tests: backport 11493 fixes, and test, preventing ec cache pools (`issue#12314 `_, `pr#4961 `_, Samuel Just) +* rgw: does not send Date HTTP header when civetweb frontend is used (`issue#11872 `_, `pr#5228 `_, Radoslaw Zarzynski) +* mon: pg ls is broken (`issue#11910 `_, `pr#5160 `_, Kefu Chai) +* librbd: A client opening an image mid-resize can result in the object map being invalidated (`issue#12237 `_, `pr#5279 `_, Jason Dillaman) +* doc: missing man pages for ceph-create-keys, ceph-disk-* (`issue#11862 `_, `pr#4846 `_, Nathan Cutler) +* tools: ceph-post-file fails on rhel7 (`issue#11876 `_, `pr#5038 `_, Sage Weil) +* build/ops: rcceph script is buggy (`issue#12090 `_, `pr#5028 `_, Owen Synge) +* rgw: Bucket header is enclosed by quotes (`issue#11874 `_, `pr#4862 `_, Wido den Hollander) +* build/ops: packaging: add SuSEfirewall2 service files (`issue#12092 `_, `pr#5030 `_, Tim Serong) +* rgw: Keystone PKI token expiration is not enforced (`issue#11722 `_, `pr#4884 `_, Anton Aksola) +* build/ops: debian/control: ceph-common (>> 0.94.2) must be >= 0.94.2-2 (`issue#12529,11998 `_, `pr#5417 `_, Loic Dachary) +* mon: Clock skew causes missing summary and confuses Calamari (`issue#11879 `_, `pr#4868 `_, Thorsten Behrens) +* rgw: rados objects wronly deleted (`issue#12099 `_, `pr#5117 `_, wuxingyi) +* tests: kernel_untar_build fails on EL7 (`issue#12098 `_, `pr#5119 `_, Greg Farnum) +* fs: Fh ref count 
+
+Notable Changes
+---------------
+* librbd: aio calls may block (`issue#11770 `_, `pr#4875 `_, Jason Dillaman)
+* osd: make all the osd/filestore thread pool suicide timeouts separately configurable (`issue#11701 `_, `pr#5159 `_, Samuel Just)
+* mon: ceph fails to compile with boost 1.58 (`issue#11982 `_, `pr#5122 `_, Kefu Chai)
+* tests: TEST_crush_reject_empty must not run a mon (`issue#12285,11975 `_, `pr#5208 `_, Kefu Chai)
+* osd: FAILED assert(!old_value.deleted()) in upgrade:giant-x-hammer-distro-basic-multi run (`issue#11983 `_, `pr#5121 `_, Samuel Just)
+* build/ops: linking ceph to tcmalloc causes segfault on SUSE SLE11-SP3 (`issue#12368 `_, `pr#5265 `_, Thorsten Behrens)
+* common: utf8 and old gcc breakage on RHEL6.5 (`issue#7387 `_, `pr#4687 `_, Kefu Chai)
+* crush: 'take' crashes due to invalid arg (`issue#11740 `_, `pr#4891 `_, Sage Weil)
+* rgw: need conversion tool to handle fixes following #11974 (`issue#12502 `_, `pr#5384 `_, Yehuda Sadeh)
+* rgw: Swift API: support for 202 Accepted response code on container creation (`issue#12299 `_, `pr#5214 `_, Radoslaw Zarzynski)
+* common: Log::reopen_log_file: take m_flush_mutex (`issue#12520 `_, `pr#5405 `_, Samuel Just)
+* rgw: Properly respond to the Connection header with Civetweb (`issue#12398 `_, `pr#5284 `_, Wido den Hollander)
+* rgw: multipart list part response returns incorrect field (`issue#12399 `_, `pr#5285 `_, Henry Chang)
+* build/ops: ceph.spec.in: 95-ceph-osd.rules, mount.ceph, and mount.fuse.ceph not installed properly on SUSE (`issue#12397 `_, `pr#5283 `_, Nathan Cutler)
+* rgw: radosgw-admin dumps user info twice (`issue#12400 `_, `pr#5286 `_, guce)
+* doc: fix doc build (`issue#12180 `_, `pr#5095 `_, Kefu Chai)
+* tests: backport 11493 fixes, and test, preventing ec cache pools (`issue#12314 `_, `pr#4961 `_, Samuel Just)
+* rgw: does not send Date HTTP header when civetweb frontend is used (`issue#11872 `_, `pr#5228 `_, Radoslaw Zarzynski)
+* mon: pg ls is broken (`issue#11910 `_, `pr#5160 `_, Kefu Chai)
+* librbd: A client opening an image mid-resize can result in the object map being invalidated (`issue#12237 `_, `pr#5279 `_, Jason Dillaman)
+* doc: missing man pages for ceph-create-keys, ceph-disk-* (`issue#11862 `_, `pr#4846 `_, Nathan Cutler)
+* tools: ceph-post-file fails on rhel7 (`issue#11876 `_, `pr#5038 `_, Sage Weil)
+* build/ops: rcceph script is buggy (`issue#12090 `_, `pr#5028 `_, Owen Synge)
+* rgw: Bucket header is enclosed by quotes (`issue#11874 `_, `pr#4862 `_, Wido den Hollander)
+* build/ops: packaging: add SuSEfirewall2 service files (`issue#12092 `_, `pr#5030 `_, Tim Serong)
+* rgw: Keystone PKI token expiration is not enforced (`issue#11722 `_, `pr#4884 `_, Anton Aksola)
+* build/ops: debian/control: ceph-common (>> 0.94.2) must be >= 0.94.2-2 (`issue#12529,11998 `_, `pr#5417 `_, Loic Dachary)
+* mon: Clock skew causes missing summary and confuses Calamari (`issue#11879 `_, `pr#4868 `_, Thorsten Behrens)
+* rgw: rados objects wrongly deleted (`issue#12099 `_, `pr#5117 `_, wuxingyi)
+* tests: kernel_untar_build fails on EL7 (`issue#12098 `_, `pr#5119 `_, Greg Farnum)
+* fs: Fh ref count will leak if readahead does not need to do read from osd (`issue#12319 `_, `pr#5427 `_, Zhi Zhang)
+* mon: OSDMonitor: allow addition of cache pool with non-empty snaps with co… (`issue#12595 `_, `pr#5252 `_, Samuel Just)
+* mon: MDSMonitor: handle MDSBeacon messages properly (`issue#11979 `_, `pr#5123 `_, Kefu Chai)
+* tools: ceph-disk: get_partition_type fails on /dev/cciss... (`issue#11760 `_, `pr#4892 `_, islepnev)
+* build/ops: max files open limit for OSD daemon is too low (`issue#12087 `_, `pr#5026 `_, Owen Synge)
+* mon: add an "osd crush tree" command (`issue#11833 `_, `pr#5248 `_, Kefu Chai)
+* mon: mon crashes when "ceph osd tree 85 --format json" (`issue#11975 `_, `pr#4936 `_, Kefu Chai)
+* build/ops: ceph / ceph-dbg steal ceph-objectstore-tool from ceph-test / ceph-test-dbg (`issue#11806 `_, `pr#5069 `_, Loic Dachary)
+* rgw: DragonDisk fails to create directories via S3: MissingContentLength (`issue#12042 `_, `pr#5118 `_, Yehuda Sadeh)
+* build/ops: /usr/bin/ceph from ceph-common is broken without installing ceph (`issue#11998 `_, `pr#5206 `_, Ken Dreyer)
+* build/ops: systemd: Increase max files open limit for OSD daemon (`issue#11964 `_, `pr#5040 `_, Owen Synge)
+* build/ops: rgw/logrotate.conf calls service with wrong init script name (`issue#12044 `_, `pr#5055 `_, wuxingyi)
+* common: OPT_INT option interprets 3221225472 as -1073741824, and crashes in Throttle::Throttle() (`issue#11738 `_, `pr#4889 `_, Kefu Chai)
+* doc: doc/release-notes: v0.94.2 (`issue#11492 `_, `pr#4934 `_, Sage Weil)
+* common: admin_socket: close socket descriptor in destructor (`issue#11706 `_, `pr#4657 `_, Jon Bernard)
+* rgw: Object copy bug (`issue#11755 `_, `pr#4885 `_, Javier M. Mellid)
+* rgw: empty json response when getting user quota (`issue#12245 `_, `pr#5237 `_, wuxingyi)
+* fs: cephfs Dumper tries to load whole journal into memory at once (`issue#11999 `_, `pr#5120 `_, John Spray)
+* rgw: Fix tool for #11442 does not correctly fix objects created via multipart uploads (`issue#12242 `_, `pr#5229 `_, Yehuda Sadeh)
+* rgw: Civetweb RGW appears to report full size of object as downloaded when only partially downloaded (`issue#12243 `_, `pr#5231 `_, Yehuda Sadeh)
+* osd: stuck incomplete (`issue#12362 `_, `pr#5269 `_, Samuel Just)
+* osd: start_flush: filter out removed snaps before determining snapc's (`issue#11911 `_, `pr#4899 `_, Samuel Just)
+* librbd: internal.cc: 1967: FAILED assert(watchers.size() == 1) (`issue#12239 `_, `pr#5243 `_, Jason Dillaman)
+* librbd: new QA client upgrade tests (`issue#12109 `_, `pr#5046 `_, Jason Dillaman)
+* librbd: [ FAILED ] TestLibRBD.ExclusiveLockTransition (`issue#12238 `_, `pr#5241 `_, Jason Dillaman)
+* rgw: Swift API: XML document generated in response for GET on account does not contain account name (`issue#12323 `_, `pr#5227 `_, Radoslaw Zarzynski)
+* rgw: keystone does not support chunked input (`issue#12322 `_, `pr#5226 `_, Hervé Rousseau)
+* mds: MDS is crashed (mds/CDir.cc: 1391: FAILED assert(!is_complete())) (`issue#11737 `_, `pr#4886 `_, Yan, Zheng)
+* cli: ceph: cli interactive mode does not understand quotes (`issue#11736 `_, `pr#4776 `_, Kefu Chai)
+* librbd: add valgrind memory checks for unit tests (`issue#12384 `_, `pr#5280 `_, Zhiqiang Wang)
+* build/ops: admin/build-doc: script fails silently under certain circumstances (`issue#11902 `_, `pr#4877 `_, John Spray)
+* osd: Fixes for rados ops with snaps (`issue#11908 `_, `pr#4902 `_, Samuel Just)
+* build/ops: ceph.spec.in: ceph-common subpackage def needs tweaking for SUSE/openSUSE (`issue#12308 `_, `pr#4883 `_, Nathan Cutler)
+* fs: client: reference counting 'struct Fh' (`issue#12088 `_, `pr#5222 `_, Yan, Zheng)
+* build/ops: ceph.spec: update OpenSUSE BuildRequires (`issue#11611 `_, `pr#4667 `_, Loic Dachary)
+
+v0.94.2 Hammer
+==============
+
+This Hammer point release fixes a few critical bugs in RGW that can
+prevent objects starting with underscore from behaving properly and
+that prevent garbage collection of deleted objects when using the
+Civetweb standalone mode.
+
+All v0.94.x Hammer users are strongly encouraged to upgrade, and to
+make note of the repair procedure below if RGW is in use.
+
+Upgrading from previous Hammer release
+--------------------------------------
+
+Bug #11442 introduced a change that made rgw objects that start with underscore
+incompatible with previous versions. The fix to that bug reverts to the
+previous behavior. In order to be able to access objects that start with an
+underscore and were created in prior Hammer releases, run the following after
+the upgrade (for each affected bucket)::
+
+  $ radosgw-admin bucket check --check-head-obj-locator \
+                               --bucket=<bucket> [--fix]
+
+Notable changes
+---------------
+
+* build: compilation error: No high-precision counter available (armhf, powerpc..) (#11432, James Page)
+* ceph-dencoder links to libtcmalloc, and shouldn't (#10691, Boris Ranto)
+* ceph-disk: disk zap sgdisk invocation (#11143, Owen Synge)
+* ceph-disk: use a new disk as journal disk, ceph-disk prepare fail (#10983, Loic Dachary)
+* ceph-objectstore-tool should be in the ceph server package (#11376, Ken Dreyer)
+* librados: can get stuck in redirect loop if osdmap epoch == last_force_op_resend (#11026, Jianpeng Ma)
+* librbd: A retransmit of proxied flatten request can result in -EINVAL (Jason Dillaman)
+* librbd: ImageWatcher should cancel in-flight ops on watch error (#11363, Jason Dillaman)
+* librbd: Objectcacher setting max object counts too low (#7385, Jason Dillaman)
+* librbd: Periodic failure of TestLibRBD.DiffIterateStress (#11369, Jason Dillaman)
+* librbd: Queued AIO reference counters not properly updated (#11478, Jason Dillaman)
+* librbd: deadlock in image refresh (#5488, Jason Dillaman)
+* librbd: notification race condition on snap_create (#11342, Jason Dillaman)
+* mds: Hammer uclient checking (#11510, John Spray)
+* mds: remove caps from revoking list when caps are voluntarily released (#11482, Yan, Zheng)
+* messenger: double clear of pipe in reaper (#11381, Haomai Wang)
+* mon: Total size of OSDs is a magnitude less than it is supposed to be. (#11534, Zhe Zhang)
+* osd: don't check order in finish_proxy_read (#11211, Zhiqiang Wang)
+* osd: handle old semi-deleted pgs after upgrade (#11429, Samuel Just)
+* osd: object creation by write cannot use an offset on an erasure coded pool (#11507, Jianpeng Ma)
+* rgw: Improve rgw HEAD request by avoiding reading the body of the first chunk (#11001, Guang Yang)
+* rgw: civetweb is hitting a limit (number of threads 1024) (#10243, Yehuda Sadeh)
+* rgw: civetweb should use unique request id (#10295, Orit Wasserman)
+* rgw: critical fixes for hammer (#11447, #11442, Yehuda Sadeh)
+* rgw: fix swift COPY headers (#10662, #10663, #11087, #10645, Radoslaw Zarzynski)
+* rgw: improve performance for large object (multiple chunks) GET (#11322, Guang Yang)
+* rgw: init-radosgw: run RGW as root (#11453, Ken Dreyer)
+* rgw: keystone token cache does not work correctly (#11125, Yehuda Sadeh)
+* rgw: make quota/gc thread configurable for starting (#11047, Guang Yang)
+* rgw: make swift responses of RGW return last-modified, content-length, x-trans-id headers (#10650, Radoslaw Zarzynski)
+* rgw: merge manifests correctly when there's prefix override (#11622, Yehuda Sadeh)
+* rgw: quota not respected in POST object (#11323, Sergey Arkhipov)
+* rgw: restore buffer of multipart upload after EEXIST (#11604, Yehuda Sadeh)
+* rgw: shouldn't need to disable rgw_socket_path if frontend is configured (#11160, Yehuda Sadeh)
+* rgw: swift: Response header of GET request for container does not contain X-Container-Object-Count, X-Container-Bytes-Used and x-trans-id headers (#10666, Dmytro Iurchenko)
+* rgw: swift: Response header of POST request for object does not contain content-length and x-trans-id headers (#10661, Radoslaw Zarzynski)
+* rgw: swift: response for GET/HEAD on container does not contain the X-Timestamp header (#10938, Radoslaw Zarzynski)
+* rgw: swift: response for PUT on /container does not contain the mandatory Content-Length header when FCGI is used (#11036, #10971, Radoslaw Zarzynski)
+* rgw: swift: wrong handling of empty metadata on Swift container (#11088, Radoslaw Zarzynski)
+* tests: TestFlatIndex.cc races with TestLFNIndex.cc (#11217, Xinze Chi)
+* tests: ceph-helpers kill_daemons fails when kill fails (#11398, Loic Dachary)
+
+For more detailed information, see :download:`the complete changelog `.
+
+
+v0.94.1 Hammer
+==============
+
+This bug fix release fixes a few critical issues with CRUSH. The most
+important fix addresses a bug in feature bit enforcement that may prevent
+pre-hammer clients from communicating with the cluster during an
+upgrade. This only manifests in some cases (for example, when the
+'rack' type is in use in the CRUSH map, and possibly other cases), but for
+safety we strongly recommend that all users use 0.94.1 instead of 0.94 when
+upgrading.
+
+There is also a fix in the new straw2 buckets when OSD weights are 0.
+
+We recommend that all v0.94 users upgrade.
+
+Notable changes
+---------------
+
+* crush: fix divide-by-0 in straw2 (#11357 Sage Weil)
+* crush: fix has_v4_buckets (#11364 Sage Weil)
+* osd: fix negative degraded objects during backfilling (#7737 Guang Yang)
+
+For more detailed information, see :download:`the complete changelog `.
+
+
+v0.94 Hammer
+============
+
+This major release is expected to form the basis of the next long-term
+stable series. It is intended to supersede v0.80.x Firefly.
+
+Highlights since Giant include:
+
+* *RADOS Performance*: a range of improvements have been made in the
+  OSD and client-side librados code that improve the throughput on
+  flash backends and improve parallelism and scaling on fast machines.
+* *Simplified RGW deployment*: the ceph-deploy tool now has a new
+  'ceph-deploy rgw create HOST' command that quickly deploys an
+  instance of the S3/Swift gateway using the embedded Civetweb server.
+  This is vastly simpler than the previous Apache-based deployment.
+  There are a few rough edges (e.g., around SSL support) but we
+  encourage users to try `the new method`_.
+* *RGW object versioning*: RGW now supports the S3 object versioning
+  API, which preserves old versions of objects instead of overwriting
+  them.
+* *RGW bucket sharding*: RGW can now shard the bucket index for large
+  buckets across multiple objects, improving performance for very
+  large buckets.
+* *RBD object maps*: RBD now has an object map function that tracks
+  which parts of the image are allocated, improving performance for
+  clones and for commands like export and delete.
+* *RBD mandatory locking*: RBD has a new mandatory locking framework
+  (still disabled by default) that adds additional safeguards to
+  prevent multiple clients from using the same image at the same time.
+* *RBD copy-on-read*: RBD now supports copy-on-read for image clones,
+  improving performance for some workloads.
+* *CephFS snapshot improvements*: Many many bugs have been fixed with
+  CephFS snapshots. Although they are still disabled by default,
+  stability has improved significantly.
+* *CephFS Recovery tools*: We have built some journal recovery and
+  diagnostic tools. Stability and performance of single-MDS systems are
+  vastly improved in Giant, and more improvements have been made now
+  in Hammer. Although we still recommend caution when storing
+  important data in CephFS, we do encourage testing for non-critical
+  workloads so that we can better gauge the feature, usability,
+  performance, and stability gaps.
+* *CRUSH improvements*: We have added a new straw2 bucket algorithm
+  that reduces the amount of data migration required when changes are
+  made to the cluster.
+* *Shingled erasure codes (SHEC)*: The OSDs now have experimental
+  support for shingled erasure codes, which allow a small amount of
+  additional storage to be traded for improved recovery performance.
+* *RADOS cache tiering*: A series of changes have been made in the
+  cache tiering code that improve performance and reduce latency.
+* *RDMA support*: There is now experimental support for RDMA via the
+  Accelio (libxio) library.
+* *New administrator commands*: The 'ceph osd df' command shows
+  pertinent details on OSD disk utilization. The 'ceph pg ls ...'
+  command makes it much simpler to query PG states while diagnosing
+  cluster issues (see the example below).
+
+.. _the new method: ../start/quick-ceph-deploy/#add-an-rgw-instance
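+
+For instance (a minimal sketch; ``osd.0`` is only an example id)::
+
+    $ ceph osd df tree        # per-OSD utilization laid out along the CRUSH tree
+    $ ceph pg ls-by-osd osd.0 # list the PGs mapped to one OSD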
+
+Other highlights since Firefly include:
+
+* *CephFS*: we have fixed a raft of bugs in CephFS and built some
+  basic journal recovery and diagnostic tools. Stability and
+  performance of single-MDS systems are vastly improved in Giant.
+  Although we do not yet recommend CephFS for production deployments,
+  we do encourage testing for non-critical workloads so that we can
+  better gauge the feature, usability, performance, and stability
+  gaps.
+* *Local Recovery Codes*: the OSDs now support an erasure-coding scheme
+  that stores some additional data blocks to reduce the IO required to
+  recover from single OSD failures.
+* *Degraded vs misplaced*: the Ceph health reports from 'ceph -s' and
+  related commands now make a distinction between data that is
+  degraded (there are fewer than the desired number of copies) and
+  data that is misplaced (stored in the wrong location in the
+  cluster). The distinction is important because the latter does not
+  compromise data safety.
+* *Tiering improvements*: we have made several improvements to the
+  cache tiering implementation that improve performance. Most
+  notably, objects are not promoted into the cache tier by a single
+  read; they must be found to be sufficiently hot before that happens.
+* *Monitor performance*: the monitors now perform writes to the local
+  data store asynchronously, improving overall responsiveness.
+* *Recovery tools*: the ceph-objectstore-tool is greatly expanded to
+  allow manipulation of an individual OSD's data store for debugging
+  and repair purposes. This is most heavily used by our QA
+  infrastructure to exercise recovery code.
+
+I would like to take this opportunity to call out the amazing growth
+in contributors to Ceph beyond the core development team from Inktank.
+Hammer includes major new features and improvements from Intel, Fujitsu,
+UnitedStack, Yahoo, UbuntuKylin, CohortFS, Mellanox, CERN, Deutsche
+Telekom, Mirantis, and SanDisk.
+
+Dedication
+----------
+
+This release is dedicated in memoriam to Sandon Van Ness, aka
+Houkouonchi, who unexpectedly passed away a few weeks ago. Sandon was
+responsible for maintaining the large and complex Sepia lab that
+houses the Ceph project's build and test infrastructure. His efforts
+have made an important impact on our ability to reliably test Ceph
+with a relatively small group of people. He was a valued member of
+the team and we will miss him. H is also for Houkouonchi.
+
+Upgrading
+---------
+
+* If your existing cluster is running a version older than v0.80.x
+  Firefly, please first upgrade to the latest Firefly release before
+  moving on to Hammer. We have not tested upgrades directly from
+  Emperor, Dumpling, or older releases.
+
+  We *have* tested:
+
+  * Firefly to Hammer
+  * Giant to Hammer
+  * Dumpling to Firefly to Hammer
+
+* Please upgrade daemons in the following order:
+
+  #. Monitors
+  #. OSDs
+  #. MDSs and/or radosgw
+
+  Note that the relative ordering of OSDs and monitors should not matter, but
+  we primarily tested upgrading monitors first.
+
+* The ceph-osd daemons will perform a disk-format upgrade to improve the
+  PG metadata layout and to repair a minor bug in the on-disk format.
+  It may take a minute or two for this to complete, depending on how
+  many objects are stored on the node; do not be alarmed if they are
+  not marked "up" by the cluster immediately after starting.
+
+* If upgrading from v0.93, set::
+
+    osd enable degraded writes = false
+
+  on all osds prior to upgrading. The degraded writes feature has
+  been reverted due to #11155.
+
+* The LTTNG tracing in librbd and librados is disabled in the release packages
+  until we find a way to avoid violating distro security policies when linking
+  libust.
+
+Upgrading from v0.87.x Giant
+----------------------------
+
+* librbd and librados include lttng tracepoints on distros with
+  liblttng 2.4 or later (only Ubuntu Trusty for the ceph.com
+  packages). When running a daemon that uses these libraries, i.e. an
+  application that calls fork(2) or clone(2) without exec(3), you must
+  set LD_PRELOAD=liblttng-ust-fork.so.0 to prevent a crash in the
+  lttng atexit handler when the process exits. The only ceph tool that
+  requires this is rbd-fuse (see the example after this list).
+
+* If rgw_socket_path is defined and rgw_frontends defines a
+  socket_port and socket_host, we now allow the rgw_frontends settings
+  to take precedence. This change should only affect users who have
+  made non-standard changes to their radosgw configuration.
+
+* If you are upgrading specifically from v0.92, you must stop all OSD
+  daemons and flush their journals (``ceph-osd -i NNN
+  --flush-journal``) before upgrading. There was a transaction
+  encoding bug in v0.92 that broke compatibility. Upgrading from v0.93,
+  v0.91, or anything earlier is safe.
+
+* The experimental 'keyvaluestore-dev' OSD backend has been renamed
+  'keyvaluestore' (for simplicity) and marked as experimental. To
+  enable this untested feature and acknowledge that you understand
+  that it is untested and may destroy data, you need to add the
+  following to your ceph.conf::
+
+    enable experimental unrecoverable data corrupting features = keyvaluestore
+
+* The following librados C API function calls take a 'flags' argument whose value
+  is now correctly interpreted:
+
+     rados_write_op_operate()
+     rados_aio_write_op_operate()
+     rados_read_op_operate()
+     rados_aio_read_op_operate()
+
+  The flags were not correctly being translated from the librados constants to the
+  internal values. Now they are. Any code that is passing flags to these methods
+  should be audited to ensure that they are using the correct LIBRADOS_OP_FLAG_*
+  constants.
+
+* The 'rados' CLI 'copy' and 'cppool' commands now use the copy-from operation,
+  which means the latest CLI cannot run these commands against pre-firefly OSDs.
+
+* The librados watch/notify API now includes a watch_flush() operation to flush
+  the async queue of notify operations. This should be called by any watch/notify
+  user prior to rados_shutdown().
+
+* The 'category' field for objects has been removed. This was originally added
+  to track PG stat summations over different categories of objects for use by
+  radosgw. It no longer has any known users and is prone to abuse because it
+  can lead to a pg_stat_t structure that is unbounded. The librados API calls
+  that accept this field now ignore it, and the OSD no longer tracks the
+  per-category summations.
+
+* The output for 'rados df' has changed. The 'category' level has been
+  eliminated, so there is now a single stat object per pool. The structure of
+  the JSON output is different, and the plaintext output has one less column.
+
+* The 'rados create <obj-name> [category]' optional category argument is no
+  longer supported or recognized.
+
+* rados.py's Rados class no longer has a __del__ method; it was causing
+  problems on interpreter shutdown and with the use of threads. If your code
+  has Rados objects with limited lifetimes and you're concerned about locked
+  resources, call Rados.shutdown() explicitly.
+
+* There is a new version of the librados watch/notify API with vastly
+  improved semantics. Any applications using this interface are
+  encouraged to migrate to the new API. The old API calls are marked
+  as deprecated and will eventually be removed.
+
+* The librados rados_unwatch() call used to be safe to call on an
+  invalid handle. The new version has undefined behavior when passed
+  a bogus value (for example, when rados_watch() returns an error and
+  handle is not defined).
+
+* The structure of the formatted 'pg stat' command is changed for the
+  portion that counts states by name to avoid using the '+' character
+  (which appears in state names) as part of the XML token (it is not
+  legal).
+
+* Previously, the formatted output of 'ceph pg stat -f ...' was a full
+  pg dump that included all metadata about all PGs in the system. It
+  is now a concise summary of high-level PG stats, just like the
+  unformatted 'ceph pg stat' command.
+
+* All JSON dumps of floating point values were incorrectly surrounding the
+  value with quotes. These quotes have been removed. Any consumer of structured
+  JSON output that was consuming the floating point values was previously having
+  to interpret the quoted string and will most likely need to be fixed to take
+  the unquoted number.
+
+* The new ability to list all objects from all namespaces can fail or
+  return incomplete results when not all OSDs have been upgraded.
+  The commands rados --all ls, rados cppool, rados export, rados
+  cache-flush-evict-all and rados cache-try-flush-evict-all rely on
+  this feature and can likewise fail or return incomplete results.
+
+* Due to a change in the Linux kernel version 3.18 and the limits of the FUSE
+  interface, ceph-fuse needs to be mounted as root on at least some systems. See
+  issues #9997, #10277, and #10542 for details.
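+
+As an example of the LD_PRELOAD workaround described in the first item of this
+list (the mountpoint ``/mnt/rbd`` is only an illustration)::
+
+    $ LD_PRELOAD=liblttng-ust-fork.so.0 rbd-fuse /mnt/rbd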
+
+Upgrading from v0.80.x Firefly (additional notes)
+-------------------------------------------------
+
+* The client-side caching for librbd is now enabled by default (rbd
+  cache = true). A safety option (rbd cache writethrough until flush
+  = true) is also enabled so that writeback caching is not used until
+  the library observes a 'flush' command, indicating that the librbd
+  user is passing that operation through from the guest VM. This
+  avoids potential data loss when used with older versions of qemu
+  that do not support flush.
+
+* The 'rados getxattr ...' command used to add a gratuitous newline to the attr
+  value; it now does not.
+
+* The ``*_kb perf`` counters on the monitor have been removed. These are
+  replaced with a new set of ``*_bytes`` counters (e.g., ``cluster_osd_kb`` is
+  replaced by ``cluster_osd_bytes``).
+
+* The ``rd_kb`` and ``wr_kb`` fields in the JSON dumps for pool stats (accessed
+  via the ``ceph df detail -f json-pretty`` and related commands) have been
+  replaced with corresponding ``*_bytes`` fields. Similarly, the
+  ``total_space``, ``total_used``, and ``total_avail`` fields are replaced with
+  ``total_bytes``, ``total_used_bytes``, and ``total_avail_bytes`` fields.
+
+* The ``rados df --format=json`` output ``read_bytes`` and ``write_bytes``
+  fields were incorrectly reporting ops; this is now fixed.
+
+* The ``rados df --format=json`` output previously included ``read_kb`` and
+  ``write_kb`` fields; these have been removed. Please use ``read_bytes`` and
+  ``write_bytes`` instead (and divide by 1024 if appropriate).
+
+* The experimental keyvaluestore-dev OSD backend had an on-disk format
+  change that prevents existing OSD data from being upgraded. This
+  affects developers and testers only.
+
+* mon-specific and osd-specific leveldb options have been removed.
+  From this point onward users should use the `leveldb_*` generic
+  options and add the options in the appropriate sections of their
+  configuration files. Monitors will still maintain the following
+  monitor-specific defaults:
+
+    leveldb_write_buffer_size = 32*1024*1024 = 33554432 // 32MB
+    leveldb_cache_size = 512*1024*1024 = 536870912 // 512MB
+    leveldb_block_size = 64*1024 = 65536 // 64KB
+    leveldb_compression = false
+    leveldb_log = ""
+
+  OSDs will still maintain the following osd-specific defaults:
+
+    leveldb_log = ""
+
+* CephFS support for the legacy anchor table has finally been removed.
+  Users with file systems created before firefly should ensure that inodes
+  with multiple hard links are modified *prior* to the upgrade so that
+  their backtraces are written properly. For example::
+
+    sudo find /mnt/cephfs -type f -links +1 -exec touch \{\} \;
+
+* We disallow nonsensical 'tier cache-mode' transitions. From this point
+  onward, 'writeback' can only transition to 'forward', and 'forward'
+  can transition to 1) 'writeback' if there are dirty objects, or 2) any
+  mode if there are no dirty objects.
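+
+For example, the allowed sequence for a hypothetical cache pool named ``hot``
+looks like this::
+
+    $ ceph osd tier cache-mode hot forward      # writeback -> forward: ok
+    $ rados -p hot cache-flush-evict-all        # drain the dirty objects
+    $ ceph osd tier cache-mode hot writeback    # forward -> writeback: ok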
+
+
+Notable changes since v0.93
+---------------------------
+
+* build: a few cmake fixes (Matt Benjamin)
+* build: fix build on RHEL/CentOS 5.9 (Rohan Mars)
+* build: reorganize Makefile to allow modular builds (Boris Ranto)
+* ceph-fuse: be more forgiving on remount (#10982 Greg Farnum)
+* ceph: improve CLI parsing (#11093 David Zafman)
+* common: fix cluster logging to default channel (#11177 Sage Weil)
+* crush: fix parsing of straw2 buckets (#11015 Sage Weil)
+* doc: update man pages (David Zafman)
+* librados: fix leak in C_TwoContexts (Xiong Yiliang)
+* librados: fix leak in watch/notify path (Sage Weil)
+* librbd: fix and improve AIO cache invalidation (#10958 Jason Dillaman)
+* librbd: fix memory leak (Jason Dillaman)
+* librbd: fix ordering/queueing of resize operations (Jason Dillaman)
+* librbd: validate image is r/w on resize/flatten (Jason Dillaman)
+* librbd: various internal locking fixes (Jason Dillaman)
+* lttng: tracing is disabled until we streamline dependencies (Josh Durgin)
+* mon: add bootstrap-rgw profile (Sage Weil)
+* mon: do not pollute mon dir with CSV files from CRUSH check (Loic Dachary)
+* mon: fix clock drift time check interval (#10546 Joao Eduardo Luis)
+* mon: fix units in store stats (Joao Eduardo Luis)
+* mon: improve error handling on erasure code profile set (#10488, #11144 Loic Dachary)
+* mon: set {read,write}_tier on 'osd tier add-cache ...' (Jianpeng Ma)
+* ms: xio: fix misc bugs (Matt Benjamin, Vu Pham)
+* osd: DBObjectMap: fix locking to prevent rare crash (#9891 Samuel Just)
+* osd: fix and document last_epoch_started semantics (Samuel Just)
+* osd: fix divergent entry handling on PG split (Samuel Just)
+* osd: fix leak on shutdown (Kefu Chai)
+* osd: fix recording of digest on scrub (Samuel Just)
+* osd: fix whiteout handling (Sage Weil)
+* rbd: allow v2 striping parameters for clones and imports (Jason Dillaman)
+* rbd: fix formatted output of image features (Jason Dillaman)
+* rbd: update man page (Ilya Dryomov)
+* rgw: don't overwrite bucket/object owner when setting ACLs (#10978 Yehuda Sadeh)
+* rgw: enable IPv6 for civetweb (#10965 Yehuda Sadeh)
+* rgw: fix sysvinit script when rgw_socket_path is not defined (#11159 Yehuda Sadeh, Dan Mick)
+* rgw: pass civetweb configurables through (#10907 Yehuda Sadeh)
+* rgw: use new watch/notify API (Yehuda Sadeh, Sage Weil)
+* osd: reverted degraded writes feature due to #11155
+
+Notable changes since v0.87.x Giant
+-----------------------------------
+
+* add experimental features option (Sage Weil)
+* arch: fix NEON feature detection (#10185 Loic Dachary)
+* asyncmsgr: misc fixes (Haomai Wang)
+* buffer: add 'shareable' construct (Matt Benjamin)
+* buffer: add list::get_contiguous (Sage Weil)
+* buffer: avoid rebuild if buffer already contiguous (Jianpeng Ma)
+* build: CMake support (Ali Maredia, Casey Bodley, Adam Emerson, Marcus Watts, Matt Benjamin)
+* build: a few cmake fixes (Matt Benjamin)
+* build: aarch64 build fixes (Noah Watkins, Haomai Wang)
+* build: adjust build deps for yasm, virtualenv (Jianpeng Ma)
+* build: fix 'make check' races (#10384 Loic Dachary)
+* build: fix pkg names when libkeyutils is missing (Pankag Garg, Ken Dreyer)
+* build: improve build dependency tooling (Loic Dachary)
+* build: reorganize Makefile to allow modular builds (Boris Ranto)
+* build: support for jemalloc (Shishir Gowda)
+* ceph-disk: Scientific Linux support (Dan van der Ster)
+* ceph-disk: allow journal partition re-use (#10146 Loic Dachary, Dan van der Ster)
+* ceph-disk: call partx/partprobe consistently (#9721 Loic Dachary)
+* ceph-disk: do not re-use partition if encryption is required (Loic Dachary)
+* ceph-disk: fix dmcrypt key permissions (Loic Dachary)
+* ceph-disk: fix umount race condition (#10096 Blaine Gardner)
+* ceph-disk: improved systemd support (Owen Synge)
+* ceph-disk: init=none option (Loic Dachary)
+* ceph-disk: misc fixes (Christos Stavrakakis)
+* ceph-disk: respect --statedir for keyring (Loic Dachary)
+* ceph-disk: set guid if reusing journal partition (Dan van der Ster)
+* ceph-disk: support LUKS for encrypted partitions (Andrew Bartlett, Loic Dachary)
+* ceph-fuse, libcephfs: POSIX file lock support (Yan, Zheng)
+* ceph-fuse, libcephfs: allow xattr caps in inject_release_failure (#9800 John Spray)
+* ceph-fuse, libcephfs: fix I_COMPLETE_ORDERED checks (#9894 Yan, Zheng)
+* ceph-fuse, libcephfs: fix cap flush overflow (Greg Farnum, Yan, Zheng)
+* ceph-fuse, libcephfs: fix root inode xattrs (Yan, Zheng)
+* ceph-fuse, libcephfs: preserve dir ordering (#9178 Yan, Zheng)
+* ceph-fuse, libcephfs: trim inodes before reconnecting to MDS (Yan, Zheng)
+* ceph-fuse,libcephfs: add support for O_NOFOLLOW and O_PATH (Greg Farnum)
+* ceph-fuse,libcephfs: resend requests before completing cap reconnect (#10912 Yan, Zheng)
+* ceph-fuse: be more forgiving on remount (#10982 Greg Farnum)
+* ceph-fuse: fix dentry invalidation on 3.18+ kernels (#9997 Yan, Zheng)
+* ceph-fuse: fix kernel cache trimming (#10277 Yan, Zheng)
+* ceph-fuse: select kernel cache invalidation mechanism based on kernel version (Greg Farnum)
+* ceph-monstore-tool: fix shutdown (#10093 Loic Dachary)
+* ceph-monstore-tool: fix/improve CLI (Joao Eduardo Luis)
+* ceph-objectstore-tool: fix import (#10090 David Zafman)
+* ceph-objectstore-tool: improved import (David Zafman)
+* ceph-objectstore-tool: many improvements and tests (David Zafman)
+* ceph-objectstore-tool: many many improvements (David Zafman)
+* ceph-objectstore-tool: misc improvements, fixes (#9870 #9871 David Zafman)
+* ceph.spec: package rbd-replay-prep (Ken Dreyer)
+* ceph: add 'ceph osd df [tree]' command (#10452 Mykola Golub)
+* ceph: do not parse injectargs twice (Loic Dachary)
+* ceph: fix 'ceph tell ...' command validation (#10439 Joao Eduardo Luis)
+* ceph: improve 'ceph osd tree' output (Mykola Golub)
+* ceph: improve CLI parsing (#11093 David Zafman)
+* ceph: make 'ceph -s' output more readable (Sage Weil)
+* ceph: make 'ceph -s' show PG state counts in sorted order (Sage Weil)
+* ceph: make 'ceph tell mon.* version' work (Mykola Golub)
+* ceph: new 'ceph tell mds.$name_or_rank_or_gid' (John Spray)
+* ceph: show primary-affinity in 'ceph osd tree' (Mykola Golub)
+* ceph: test robustness (Joao Eduardo Luis)
+* ceph_objectstore_tool: behave with sharded flag (#9661 David Zafman)
+* cephfs-journal-tool: add recover_dentries function (#9883 John Spray)
+* cephfs-journal-tool: fix journal import (#10025 John Spray)
+* cephfs-journal-tool: skip up to expire_pos (#9977 John Spray)
+* cleanup rados.h definitions with macros (Ilya Dryomov)
+* common: add 'perf reset ...' admin command (Jianpeng Ma)
+* common: add TableFormatter (Andreas Peters)
+* common: add newline to flushed json output (Sage Weil)
+* common: check syncfs() return code (Jianpeng Ma)
+* common: do not unlock rwlock on destruction (Federico Simoncelli)
+* common: filtering for 'perf dump' (John Spray)
+* common: fix Formatter factory breakage (#10547 Loic Dachary)
+* common: fix block device discard check (#10296 Sage Weil)
+* common: make json-pretty output prettier (Sage Weil)
+* common: remove broken CEPH_LOCKDEP option (Kefu Chai)
+* common: shared_cache unit tests (Cheng Cheng)
+* common: support new gperftools header locations (Ken Dreyer)
+* config: add $cctid meta variable (Adam Crume)
+* crush: fix buffer overrun for poorly formed rules (#9492 Johnu George)
+* crush: fix detach_bucket (#10095 Sage Weil)
+* crush: fix parsing of straw2 buckets (#11015 Sage Weil)
+* crush: fix several bugs in adjust_item_weight (Rongze Zhu)
+* crush: fix tree bucket behavior (Rongze Zhu)
+* crush: improve constness (Loic Dachary)
+* crush: new and improved straw2 bucket type (Sage Weil, Christina Anderson, Xiaoxi Chen)
+* crush: straw bucket weight calculation fixes (#9998 Sage Weil)
+* crush: update tries stats for indep rules (#10349 Loic Dachary)
+* crush: use larger choose_tries value for erasure code rulesets (#10353 Loic Dachary)
+* crushtool: add --location command (Sage Weil, Loic Dachary)
+* debian,rpm: move RBD udev rules to ceph-common (#10864 Ken Dreyer)
+* debian: split python-ceph into python-{rbd,rados,cephfs} (Boris Ranto)
+* default to libnss instead of crypto++ (Federico Gimenez)
+* doc: CephFS disaster recovery guidance (John Spray)
+* doc: CephFS for early adopters (John Spray)
+* doc: add build-doc guidelines for Fedora and CentOS/RHEL (Nilamdyuti Goswami)
+* doc: add dumpling to firefly upgrade section (#7679 John Wilkins)
+* doc: ceph osd reweight vs crush weight (Laurent Guerby)
+* doc: do not suggest dangerous XFS nobarrier option (Dan van der Ster)
+* doc: document erasure coded pool operations (#9970 Loic Dachary)
+* doc: document the LRC per-layer plugin configuration (Yuan Zhou)
+* doc: enable rbd cache on openstack deployments (Sebastien Han)
+* doc: erasure code doc updates (Loic Dachary)
+* doc: file system osd config settings (Kevin Dalley)
+* doc: fix OpenStack Glance docs (#10478 Sebastien Han)
+* doc: improved installation notes on CentOS/RHEL installs (John Wilkins)
+* doc: key/value store config reference (John Wilkins)
+* doc: misc cleanups (Adam Spiers, Sebastien Han, Nilamdyuti Goswami, Ken Dreyer, John Wilkins)
+* doc: misc improvements (Nilamdyuti Goswami, John Wilkins, Chris Holcombe)
+* doc: misc updates (#9793 #9922 #10204 #10203 Travis Rhoden, Hazem, Ayari, Florian Coste, Andy Allan, Frank Yu, Baptiste Veuillez-Mainard, Yuan Zhou, Armando Segnini, Robert Jansen, Tyler Brekke, Viktor Suprun)
+* doc: misc updates (Alfredo Deza, VRan Liu)
+* doc: misc updates (Nilamdyuti Goswami, John Wilkins)
+* doc: new man pages (Nilamdyuti Goswami)
+* doc: preflight doc fixes (John Wilkins)
+* doc: replace cloudfiles with swiftclient Python Swift example (Tim Freund)
+* doc: update PG count guide (Gerben Meijer, Laurent Guerby, Loic Dachary)
+* doc: update man pages (David Zafman)
+* doc: update openstack docs for Juno (Sebastien Han)
+* doc: update release descriptions (Ken Dreyer)
+* doc: update sepia hardware inventory (Sandon Van Ness)
+* erasure-code: add mSHEC erasure code support (Takeshi Miyamae)
+* erasure-code: improved docs (#10340 Loic Dachary)
+* erasure-code: set max_size to 20 (#10363 Loic Dachary)
+* fix cluster logging from non-mon daemons (Sage Weil)
+* init-ceph: check for systemd-run before using it (Boris Ranto)
+* install-deps.sh: do not require sudo when root (Loic Dachary)
+* keyvaluestore: misc fixes (Haomai Wang)
+* keyvaluestore: performance improvements (Haomai Wang)
+* libcephfs,ceph-fuse: add 'status' asok (John Spray)
+* libcephfs,ceph-fuse: fix getting zero-length xattr (#10552 Yan, Zheng)
+* libcephfs: fix dirfrag trimming (#10387 Yan, Zheng)
+* libcephfs: fix mount timeout (#10041 Yan, Zheng)
+* libcephfs: fix test (#10415 Yan, Zheng)
+* libcephfs: fix use-after-free on umount (#10412 Yan, Zheng)
+* libcephfs: include ceph and git version in client metadata (Sage Weil)
+* librados, osd: new watch/notify implementation (Sage Weil)
+* librados: add blacklist_add convenience method (Jason Dillaman)
+* librados: add rados_pool_get_base_tier() call (Adam Crume)
+* librados: add watch_flush() operation (Sage Weil, Haomai Wang)
+* librados: avoid memcpy on getxattr, read (Jianpeng Ma)
+* librados: cap buffer length (Loic Dachary)
+* librados: create ioctx by pool id (Jason Dillaman)
+* librados: do notify completion in fast-dispatch (Sage Weil)
+* librados: drop 'category' feature (Sage Weil)
+* librados: expose rados_{read|write}_op_assert_version in C API (Kim Vandry)
+* librados: fix infinite loop with skipped map epochs (#9986 Ding Dinghua)
+* librados: fix iterator operator= bugs (#10082 David Zafman, Yehuda Sadeh)
+* librados: fix leak in C_TwoContexts (Xiong Yiliang)
+* librados: fix leak in watch/notify path (Sage Weil)
+* librados: fix null deref when pool DNE (#9944 Sage Weil)
+* librados: fix objecter races (#9617 Josh Durgin)
+* librados: fix pool deletion handling (#10372 Sage Weil)
+* librados: fix pool name caching (#10458 Radoslaw Zarzynski)
+* librados: fix resource leak, misc bugs (#10425 Radoslaw Zarzynski)
+* librados: fix some watch/notify locking (Jason Dillaman, Josh Durgin)
+* librados: fix timer race from recent refactor (Sage Weil)
+* librados: new fadvise API (Ma Jianpeng)
+* librados: only export public API symbols (Jason Dillaman)
+* librados: remove shadowed variable (Kefu Chai)
+* librados: translate op flags from C APIs (Matthew Richards)
+* libradosstriper: fix remove() (Dongmao Zhang)
+* libradosstriper: fix shutdown hang (Dongmao Zhang)
+* libradosstriper: fix stat strtoll (Dongmao Zhang)
+* libradosstriper: fix trunc method (#10129 Sebastien Ponce)
+* libradosstriper: fix write_full when ENOENT (#10758 Sebastien Ponce)
+* libradosstriper: misc fixes (Sebastien Ponce)
+* librbd: CRC protection for RBD image map (Jason Dillaman)
+* librbd: add missing python docstrings (Jason Dillaman)
+* librbd: add per-image object map for improved performance (Jason Dillaman)
+* librbd: add readahead (Adam Crume)
+* librbd: add support for an "object map" indicating which objects exist (Jason Dillaman)
+* librbd: adjust internal locking (Josh Durgin, Jason Dillaman)
+* librbd: better handling of watch errors (Jason Dillaman)
+* librbd: complete pending ops before closing image (#10299 Josh Durgin)
+* librbd: coordinate maint operations through lock owner (Jason Dillaman)
+* librbd: copy-on-read (Min Chen, Li Wang, Yunchuan Wen, Cheng Cheng, Jason Dillaman)
+* librbd: differentiate between R/O vs R/W features (Jason Dillaman)
+* librbd: don't close a closed parent in failure path (#10030 Jason Dillaman)
+* librbd: enforce write ordering with a snapshot (Jason Dillaman)
+* librbd: exclusive image locking (Jason Dillaman)
+* librbd: fadvise API (Ma Jianpeng)
+* librbd: fadvise-style hints; add misc hints for certain operations (Jianpeng Ma)
+* librbd: fix and improve AIO cache invalidation (#10958 Jason Dillaman)
+* librbd: fix cache tiers in list_children and snap_unprotect (Adam Crume)
+* librbd: fix coverity false-positives (Jason Dillaman)
+* librbd: fix diff test (#10002 Josh Durgin)
+* librbd: fix list_children from invalid pool ioctxs (#10123 Jason Dillaman)
+* librbd: fix locking for readahead (#10045 Jason Dillaman)
+* librbd: fix memory leak (Jason Dillaman)
+* librbd: fix ordering/queueing of resize operations (Jason Dillaman)
+* librbd: fix performance regression in ObjectCacher (#9513 Adam Crume)
+* librbd: fix snap create races (Jason Dillaman)
+* librbd: fix write vs import race (#10590 Jason Dillaman)
+* librbd: flush AIO operations asynchronously (#10714 Jason Dillaman)
+* librbd: gracefully handle deleted/renamed pools (#10270 Jason Dillaman)
+* librbd: lttng tracepoints (Adam Crume)
+* librbd: make async versions of long-running maint operations (Jason Dillaman)
+* librbd: misc fixes (Xinxin Shu, Jason Dillaman)
+* librbd: mock tests (Jason Dillaman)
+* librbd: only export public API symbols (Jason Dillaman)
+* librbd: optionally blacklist clients before breaking locks (#10761 Jason Dillaman)
+* librbd: prevent copyup during shrink (Jason Dillaman)
+* librbd: refactor unit tests to use fixtures (Jason Dillaman)
+* librbd: validate image is r/w on resize/flatten (Jason Dillaman)
+* librbd: various internal locking fixes (Jason Dillaman)
+* many coverity fixes (Danny Al-Gaaf)
+* many many coverity cleanups (Danny Al-Gaaf)
+* mds: 'flush journal' admin command (John Spray)
+* mds: ENOSPC and OSDMap epoch barriers (#7317 John Spray)
+* mds: a whole bunch of initial scrub infrastructure (Greg Farnum)
+* mds: add cephfs-table-tool (John Spray)
+* mds: asok command for fetching subtree map (John Spray)
+* mds: avoid sending traceless replies in most cases (Yan, Zheng)
+* mds: constify MDSCacheObjects (John Spray)
+* mds: dirfrag buf fix (Yan, Zheng)
+* mds: disallow most commands on inactive MDS's (Greg Farnum)
+* mds: drop dentries, leases on deleted directories (#10164 Yan, Zheng)
+* mds: export dir asok command (John Spray)
+* mds: fix MDLog IO callback deadlock (John Spray)
+* mds: fix compat_version for MClientSession (#9945 John Spray)
+* mds: fix deadlock during journal probe vs purge (#10229 Yan, Zheng)
+* mds: fix race trimming log segments (Yan, Zheng)
+* mds: fix reply snapbl (Yan, Zheng)
+* mds: fix sessionmap lifecycle bugs (Yan, Zheng)
+* mds: fix stray/purge perfcounters (#10388 John Spray)
+* mds: handle heartbeat_reset during shutdown (#10382 John Spray)
+* mds: handle zero-size xattr (#10335 Yan, Zheng)
+* mds: initialize root inode xattr version (Yan, Zheng)
+* mds: introduce auth caps (John Spray)
+* mds: many many snapshot-related fixes (Yan, Zheng)
+* mds: misc bugs (Greg Farnum, John Spray, Yan, Zheng, Henry Chang)
+* mds: refactor, improve Session storage (John Spray)
+* mds: store backtrace for stray dir (Yan, Zheng)
+* mds: subtree quota support (Yunchuan Wen)
+* mds: verify backtrace when fetching dirfrag (#9557 Yan, Zheng)
+* memstore: free space tracking (John Spray)
+* misc cleanup (Danny Al-Gaaf, David Anderson)
+* misc coverity fixes (Danny Al-Gaaf)
+* misc: various valgrind fixes and cleanups (Danny Al-Gaaf)
+* mon: 'osd crush reweight-all' command (Sage Weil)
+* mon: add 'ceph osd rename-bucket ...' command (Loic Dachary)
+* mon: add bootstrap-rgw profile (Sage Weil)
+* mon: add max pgs per osd warning (Sage Weil)
+* mon: add noforward flag for some mon commands (Mykola Golub)
+* mon: allow adding tiers to fs pools (#10135 John Spray)
+* mon: allow full flag to be manually cleared (#9323 Sage Weil)
+* mon: clean up auth list output (Loic Dachary)
+* mon: delay failure injection (Joao Eduardo Luis)
+* mon: disallow empty pool names (#10555 Wido den Hollander)
+* mon: do not deactivate last mds (#10862 John Spray)
+* mon: do not pollute mon dir with CSV files from CRUSH check (Loic Dachary)
+* mon: drop old ceph_mon_store_converter (Sage Weil)
+* mon: fix 'ceph pg dump_stuck degraded' (Xinxin Shu)
+* mon: fix 'mds fail' for standby MDSs (John Spray)
+* mon: fix 'osd crush link' id resolution (John Spray)
+* mon: fix 'profile osd' use of config-key function on mon (#10844 Joao Eduardo Luis)
+* mon: fix *_ratio* units and types (Sage Weil)
+* mon: fix JSON dumps to dump floats as floats and not strings (Sage Weil)
+* mon: fix MDS health status from peons (#10151 John Spray)
+* mon: fix caching for min_last_epoch_clean (#9987 Sage Weil)
+* mon: fix clock drift time check interval (#10546 Joao Eduardo Luis)
+* mon: fix compatset initialization during mkfs (Joao Eduardo Luis)
+* mon: fix error output for add_data_pool (#9852 Joao Eduardo Luis)
+* mon: fix feature tracking during elections (Joao Eduardo Luis)
+* mon: fix formatter 'pg stat' command output (Sage Weil)
+* mon: fix mds gid/rank/state parsing (John Spray)
+* mon: fix misc error paths (Joao Eduardo Luis)
+* mon: fix paxos off-by-one corner case (#9301 Sage Weil)
+* mon: fix paxos timeouts (#10220 Joao Eduardo Luis)
+* mon: fix stashed monmap encoding (#5203 Xie Rui)
+* mon: fix units in store stats (Joao Eduardo Luis)
+* mon: get canonical OSDMap from leader (#10422 Sage Weil)
+* mon: ignore failure reports from before up_from (#10762 Dan van der Ster, Sage Weil)
+* mon: implement 'fs reset' command (John Spray)
+* mon: improve error handling on erasure code profile set (#10488, #11144 Loic Dachary)
+* mon: improved corrupt CRUSH map detection (Joao Eduardo Luis)
+* mon: include entity name in audit log for forwarded requests (#9913 Joao Eduardo Luis)
+* mon: include pg_temp count in osdmap summary (Sage Weil)
+* mon: log health summary to cluster log (#9440 Joao Eduardo Luis)
+* mon: make 'mds fail' idempotent (John Spray)
+* mon: make pg dump {sum,pgs,pgs_brief} work for format=plain (#5963 #6759 Mykola Golub)
+* mon: new 'ceph pool ls [detail]' command (Sage Weil)
+* mon: new pool safety flags nodelete, nopgchange, nosizechange (#9792 Mykola Golub)
+* mon: new, friendly 'ceph pg ls ...' command (Xinxin Shu)
+* mon: paxos: allow reads while proposing (#9321 #9322 Joao Eduardo Luis)
+* mon: prevent MDS transition from STOPPING (#10791 Greg Farnum)
+* mon: propose all pending work in one transaction (Sage Weil)
+* mon: remove pg_temps for nonexistent pools (Joao Eduardo Luis)
+* mon: require mon_allow_pool_delete option to remove pools (Sage Weil)
+* mon: respect down flag when promoting standbys (John Spray)
+* mon: set globalid prealloc to larger value (Sage Weil)
+* mon: set {read,write}_tier on 'osd tier add-cache ...' (Jianpeng Ma)
+* mon: skip zeroed osd stats in get_rule_avail (#10257 Joao Eduardo Luis)
+* mon: validate min_size range (Jianpeng Ma)
+* mon: wait for writeable before cross-proposing (#9794 Joao Eduardo Luis)
+* mount.ceph: fix spurious error message (#10351 Yan, Zheng)
+* ms: xio: fix misc bugs (Matt Benjamin, Vu Pham)
+* msgr: async: bind threads to CPU cores, improved poll (Haomai Wang)
+* msgr: async: many fixes, unit tests (Haomai Wang)
+* msgr: async: several fixes (Haomai Wang)
+* msgr: asyncmessenger: add kqueue support (#9926 Haomai Wang)
+* msgr: avoid useless new/delete (Haomai Wang)
+* msgr: fix RESETSESSION bug (#10080 Greg Farnum)
+* msgr: fix crc configuration (Mykola Golub)
+* msgr: fix delay injection bug (#9910 Sage Weil, Greg Farnum)
+* msgr: misc unit tests (Haomai Wang)
+* msgr: new AsyncMessenger alternative implementation (Haomai Wang)
+* msgr: prefetch data when doing recv (Yehuda Sadeh)
+* msgr: simple: fix rare deadlock (Greg Farnum)
+* msgr: simple: retry binding to port on failure (#10029 Wido den Hollander)
+* msgr: xio: XioMessenger RDMA support (Casey Bodley, Vu Pham, Matt Benjamin)
+* objectstore: deprecate collection attrs (Sage Weil)
+* osd, librados: fadvise-style librados hints (Jianpeng Ma)
+* osd, librados: fix xattr_cmp_u64 (Dongmao Zhang)
+* osd, librados: revamp PG listing API to handle namespaces (#9031 #9262 #9438 David Zafman)
+* osd, mds: 'ops' as shorthand for 'dump_ops_in_flight' on asok (Sage Weil)
+* osd, mon: add checksums to all OSDMaps (Sage Weil)
+* osd, mon: send initial pg create time from mon to osd (#9887 David Zafman)
+* osd,mon: add 'norebalance' flag (Kefu Chai)
+* osd,mon: specify OSD features explicitly in MOSDBoot (#10911 Sage Weil)
+* osd: DBObjectMap: fix locking to prevent rare crash (#9891 Samuel Just)
+* osd: EIO on whole-object reads when checksum is wrong (Sage Weil)
+* osd: add erasure code corpus (Loic Dachary)
+* osd: add fadvise flags to ObjectStore API (Jianpeng Ma)
+* osd: add get_latest_osdmap asok command (#9483 #9484 Mykola Golub)
+* osd: add misc tests (Loic Dachary, Danny Al-Gaaf)
+* osd: add option to prioritize heartbeat network traffic (Jian Wen)
+* osd: add support for the SHEC erasure-code algorithm (Takeshi Miyamae, Loic Dachary)
+* osd: allow deletion of objects with watcher (#2339 Sage Weil)
+* osd: allow recovery while below min_size (Samuel Just)
+* osd: allow recovery with fewer than min_size OSDs (Samuel Just)
+* osd: allow sparse read for Push/Pull (Haomai Wang)
+* osd: allow whiteout deletion in cache pool (Sage Weil)
+* osd: allow writes to degraded objects (Samuel Just)
+* osd: avoid publishing unchanged PG stats (Sage Weil)
+* osd: batch pg log trim (Xinze Chi)
+* osd: cache pool: ignore min flush age when cache is full (Xinze Chi)
+* osd: cache recent ObjectContexts (Dong Yuan)
+* osd: cache reverse_nibbles hash value (Dong Yuan)
+* osd: clean up internal ObjectStore interface (Sage Weil)
+* osd: cleanup boost optionals (William Kennington)
+* osd: clear cache on interval change (Samuel Just)
+* osd: do no proxy reads unless target OSDs are new (#10788 Sage Weil)
+* osd: do not abort deep scrub on missing hinfo (#10018 Loic Dachary)
+* osd: do not update digest on inconsistent object (#10524 Samuel Just)
+* osd: don't record digests for snapdirs (#10536 Samuel Just)
+* osd: drop upgrade support for pre-dumpling (Sage Weil)
+* osd: enable and use posix_fadvise (Sage Weil)
+* osd: erasure coding: allow bench.sh to test ISA backend (Yuan Zhou)
+* osd: erasure-code: encoding regression tests, corpus (#9420 Loic Dachary)
+* osd: erasure-code: enforce chunk size alignment (#10211 Loic Dachary)
+* osd: erasure-code: jerasure support for NEON (Loic Dachary)
+* osd: erasure-code: relax cauchy w restrictions (#10325 David Zhang, Loic Dachary)
+* osd: erasure-code: update gf-complete to latest upstream (Loic Dachary)
+* osd: expose non-journal backends via ceph-osd CLI (Haomai Wang)
+* osd: filejournal: don't cache journal when not using direct IO (Jianpeng Ma)
+* osd: fix JSON output for stray OSDs (Loic Dachary)
+* osd: fix OSDCap parser on old (el6) boost::spirit (#10757 Kefu Chai)
+* osd: fix OSDCap parsing on el6 (#10757 Kefu Chai)
+* osd: fix ObjectStore::Transaction encoding version (#10734 Samuel Just)
+* osd: fix WBTHrottle perf counters (Haomai Wang)
+* osd: fix and document last_epoch_started semantics (Samuel Just)
+* osd: fix auth object selection during repair (#10524 Samuel Just)
+* osd: fix backfill bug (#10150 Samuel Just)
+* osd: fix bug in pending digest updates (#10840 Samuel Just)
+* osd: fix cancel_proxy_read_ops (Sage Weil)
+* osd: fix cleanup of interrupted pg deletion (#10617 Sage Weil)
+* osd: fix divergent entry handling on PG split (Samuel Just)
+* osd: fix ghobject_t formatted output to include shard (#10063 Loic Dachary)
+* osd: fix ioprio option (Mykola Golub)
+* osd: fix ioprio options (Loic Dachary)
+* osd: fix journal shutdown race (Sage Weil)
+* osd: fix journal wrapping bug (#10883 David Zafman)
+* osd: fix leak in SnapTrimWQ (#10421 Kefu Chai)
+* osd: fix leak on shutdown (Kefu Chai)
+* osd: fix memstore free space calculation (Xiaoxi Chen)
+* osd: fix mixed-version peering issues (Samuel Just)
+* osd: fix object age eviction (Zhiqiang Wang)
+* osd: fix object atime calculation (Xinze Chi)
+* osd: fix object digest update bug (#10840 Samuel Just)
+* osd: fix occasional peering stalls (#10431 Sage Weil)
+* osd: fix ordering issue with new transaction encoding (#10534 Dong Yuan)
+* osd: fix osd peer check on scrub messages (#9555 Sage Weil)
+* osd: fix past_interval display bug (#9752 Loic Dachary)
+* osd: fix past_interval generation (#10427 #10430 David Zafman)
+* osd: fix pgls filter ops (#9439 David Zafman)
+* osd: fix recording of digest on scrub (Samuel Just)
+* osd: fix scrub delay bug (#10693 Samuel Just)
+* osd: fix scrub vs try-flush bug (#8011 Samuel Just)
+* osd: fix short read handling on push (#8121 David Zafman)
+* osd: fix stderr with -f or -d (Dan Mick)
+* osd: fix transaction accounting (Jianpeng Ma)
+* osd: fix watch reconnect race (#10441 Sage Weil)
+* osd: fix watch timeout cache state update (#10784 David Zafman)
+* osd: fix whiteout handling (Sage Weil)
+* osd: flush snapshots from cache tier immediately (Sage Weil)
+* osd: force promotion of watch/notify ops (Zhiqiang Wang)
+* osd: handle no-op write with snapshot (#10262 Sage Weil)
+* osd: improve idempotency detection across cache promotion/demotion (#8935 Sage Weil, Samuel Just)
+* osd: include activating peers in blocked_by (#10477 Sage Weil)
+* osd: jerasure and gf-complete updates from upstream (#10216 Loic Dachary)
+* osd: journal: check fsync/fdatasync result (Jianpeng Ma)
+* osd: journal: fix alignment checks, avoid useless memmove (Jianpeng Ma)
+* osd: journal: fix hang on shutdown (#10474 David Zafman)
+* osd: journal: fix header.committed_up_to (Xinze Chi)
+* osd: journal: fix journal zeroing when direct IO is enabled (Xie Rui)
+* osd: journal: initialize throttle (Ning Yao)
+* osd: journal: misc bug fixes (#6003 David Zafman, Samuel Just)
+* osd: journal: update committed_thru after replay (#6756 Samuel Just)
+* osd: keyvaluestore: cleanup dead code (Ning Yao)
+* osd: keyvaluestore: fix getattr semantics (Haomai Wang)
+* osd: keyvaluestore: fix key ordering (#10119 Haomai Wang)
+* osd: keyvaluestore_dev: optimization (Chendi Xue)
+* osd: limit in-flight read requests (Jason Dillaman)
+* osd: log when scrub or repair starts (Loic Dachary)
+* osd: make misdirected op checks robust for EC pools (#9835 Sage Weil)
+* osd: memstore: fix size limit (Xiaoxi Chen)
+* osd: misc FIEMAP fixes (Ma Jianpeng)
+* osd: misc cleanup (Xinze Chi, Yongyue Sun)
+* osd: misc optimizations (Xinxin Shu, Zhiqiang Wang, Xinze Chi)
+* osd: misc scrub fixes (#10017 Loic Dachary)
+* osd: new 'activating' state between peering and active (Sage Weil)
+* osd: new optimized encoding for ObjectStore::Transaction (Dong Yuan)
+* osd: optimize Finisher (Xinze Chi)
+* osd: optimize WBThrottle map with unordered_map (Ning Yao)
+* osd: optimize filter_snapc (Ning Yao)
+* osd: preserve reqids for idempotency checks for promote/demote (Sage Weil, Zhiqiang Wang, Samuel Just)
+* osd: proxy read support (Zhiqiang Wang)
+* osd: proxy reads during cache promote (Zhiqiang Wang)
+* osd: remove dead locking code (Xinxin Shu)
+* osd: remove legacy classic scrub code (Sage Weil)
+* osd: remove unused fields in MOSDSubOp (Xiaoxi Chen)
+* osd: removed some dead code (Xinze Chi)
+* osd: replace MOSDSubOp messages with simpler, optimized MOSDRepOp (Xiaoxi Chen)
+* osd: restrict scrub to certain times of day (Xinze Chi)
+* osd: rocksdb: fix shutdown (Haomai Wang)
+* osd: store PG metadata in per-collection objects for better concurrency (Sage Weil)
+* osd: store whole-object checksums on scrub, write_full (Sage Weil)
+* osd: support for discard for journal trim (Jianpeng Ma)
+* osd: use FIEMAP_FLAGS_SYNC instead of fsync (Jianpeng Ma)
+* osd: verify kernel is new enough before using XFS extsize ioctl, enable by default (#9956 Sage Weil)
+* pybind: fix memory leak in librados bindings (Billy Olsen)
+* pyrados: add object lock support (#6114 Mehdi Abaakouk)
+* pyrados: fix misnamed wait_* routines (#10104 Dan Mick)
+* pyrados: misc cleanups (Kefu Chai)
+* qa: add large auth ticket tests (Ilya Dryomov)
+* qa: fix mds tests (#10539 John Spray)
+* qa: fix osd create dup tests (#10083 Loic Dachary)
+* qa: ignore duplicates in rados ls (Josh Durgin)
+* qa: improve hadoop tests (Noah Watkins)
+* qa: many 'make check' improvements (Loic Dachary)
+* qa: misc tests (Loic Dachary, Yan, Zheng)
+* qa: parallelize make check (Loic Dachary)
+* qa: reorg fs quota tests (Greg Farnum)
+* qa: tolerate nearly-full disk for make check (Loic Dachary)
+* rados: fix put of /dev/null (Loic Dachary)
+* rados: fix usage (Jianpeng Ma)
+* rados: parse command-line arguments more strictly (#8983 Adam Crume)
+* rados: use copy-from operation for copy, cppool (Sage Weil)
+* radosgw-admin: add replicalog update command (Yehuda Sadeh)
+* rbd-fuse: clean up on shutdown (Josh Durgin)
+* rbd-fuse: fix memory leak (Adam Crume)
+* rbd-replay-many (Adam Crume)
+* rbd-replay: --anonymize flag to rbd-replay-prep (Adam Crume)
+* rbd: add 'merge-diff' function (MingXin Liu, Yunchuan Wen, Li Wang)
+* rbd: allow v2 striping parameters for clones and imports (Jason Dillaman)
+* rbd: fix 'rbd diff' for non-existent objects (Adam Crume)
+* rbd: fix buffer handling on image import (#10590 Jason Dillaman)
+* rbd: fix error when striping with format 1 (Sebastien Han)
+* rbd: fix export for image sizes over 2GB (Vicente Cheng)
+* rbd: fix formatted output of image features (Jason Dillaman)
+* rbd: leave exclusive locking off by default (Jason Dillaman)
+* rbd: update man page (Ilya Dryomov)
+* rbd: update init-rbdmap to fix dup mount point (Karel Striegel)
+* rbd: use IO hints for import, export, and bench operations (#10462 Jason Dillaman)
+* rbd: use rolling average for rbd bench-write throughput (Jason Dillaman)
+* rbd_recover_tool: RBD image recovery tool (Min Chen)
+* rgw: S3-style object versioning support (Yehuda Sadeh)
+* rgw: add location header when object is in another region (VRan Liu)
+* rgw: change multipart upload id magic (#10271 Yehuda Sadeh)
+* rgw: check keystone auth for S3 POST requests (#10062 Abhishek Lekshmanan)
+* rgw: check timestamp on s3 keystone auth (#10062 Abhishek Lekshmanan)
+* rgw: conditional PUT on ETag (#8562 Ray Lv)
+* rgw: create subuser if needed when creating user (#10103 Yehuda Sadeh)
+* rgw: decode http query params correction (#10271 Yehuda Sadeh)
+* rgw: don't overwrite bucket/object owner when setting ACLs (#10978 Yehuda Sadeh)
+* rgw: enable IPv6 for civetweb (#10965 Yehuda Sadeh)
+* rgw: extend replica log API (purge-all) (Yehuda Sadeh)
+* rgw: fail S3 POST if keystone not configured (#10688 Valery Tschopp, Yehuda Sadeh)
+* rgw: fix If-Modified-Since (VRan Liu)
+* rgw: fix XML header on get ACL request (#10106 Yehuda Sadeh)
+* rgw: fix bucket removal with data purge (Yehuda Sadeh)
+* rgw: fix content length check (#10701 Axel Dunkel, Yehuda Sadeh)
+* rgw: fix content-length update (#9576 Yehuda Sadeh)
+* rgw: fix disabling of max_size quota (#9907 Dong Lei)
+* rgw: fix error codes (#10334 #10329 Yehuda Sadeh)
+* rgw: fix incorrect len when len is 0 (#9877 Yehuda Sadeh)
+* rgw: fix object copy content type (#9478 Yehuda Sadeh)
+* rgw: fix partial GET in swift (#10553 Yehuda Sadeh)
+* rgw: fix replica log indexing (#8251 Yehuda Sadeh)
+* rgw: fix shutdown (#10472 Yehuda Sadeh)
+* rgw: fix swift metadata header name (Dmytro Iurchenko)
+* rgw: fix sysvinit script when rgw_socket_path is not defined (#11159 Yehuda Sadeh, Dan Mick)
+* rgw: fix user stats in get-user-info API (#9359 Ray Lv)
+* rgw: include XML ns on get ACL request (#10106 Yehuda Sadeh)
+* rgw: index swift keys appropriately (#10471 Yehuda Sadeh)
+* rgw: make sysvinit script set ulimit -n properly (Sage Weil)
+* rgw: misc fixes (#10307 Yehuda Sadeh)
+* rgw: only track cleanup for objects we write (#10311 Yehuda Sadeh)
+* rgw: pass civetweb configurables through (#10907 Yehuda Sadeh)
+* rgw: prevent illegal bucket policy that doesn't match placement rule (Yehuda Sadeh)
+* rgw: remove multipart entries from bucket index on abort (#10719 Yehuda Sadeh)
+* rgw: remove swift user manifest (DLO) hash calculation (#9973 Yehuda Sadeh)
+* rgw: respond with 204 to POST on containers (#10667 Yuan Zhou)
+* rgw: return timestamp on GET/HEAD (#8911 Yehuda Sadeh)
+* rgw: reuse fcgx connection struct (#10194 Yehuda Sadeh)
+* rgw: run radosgw as apache with systemd (#10125 Loic Dachary)
+* rgw: send explicit HTTP status string (Yehuda Sadeh)
+* rgw: set ETag on object copy (#9479 Yehuda Sadeh)
+* rgw: set length for keystone token validation request (#7796 Yehuda Sadeh, Mark Kirkwood)
+* rgw: support X-Storage-Policy header for Swift storage policy compat (Yehuda Sadeh)
+* rgw: support multiple host names (#7467 Yehuda Sadeh)
+* rgw: swift: dump container's custom metadata (#10665 Ahmad Faheem, Dmytro Iurchenko)
+* rgw: swift: support Accept header for response format (#10746 Dmytro Iurchenko)
+* rgw: swift: support for X-Remove-Container-Meta-{key} (#10475 Dmytro Iurchenko)
+* rgw: tweak error codes (#10329 #10334 Yehuda Sadeh)
+* rgw: update bucket index on attr changes, for multi-site sync (#5595 Yehuda Sadeh)
+* rgw: use \r\n for http headers (#9254 Yehuda Sadeh)
+* rgw: use gc for multipart abort (#10445 Aaron Bassett, Yehuda Sadeh)
+* rgw: use new watch/notify API (Yehuda Sadeh, Sage Weil)
+* rpm: misc fixes (Ken Dreyer)
+* rpm: move rgw logrotate to radosgw subpackage (Ken Dreyer)
+* systemd: better systemd unit files (Owen Synge)
+* sysvinit: fix race in 'stop' (#10389 Loic Dachary)
+* test: fix bufferlist tests (Jianpeng Ma)
+* tests: ability to run unit tests under docker (Loic Dachary)
+* tests: centos-6 dockerfile (#10755 Loic Dachary)
+* tests: improve docker-based tests (Loic Dachary)
+* tests: unit tests for shared_cache (Dong Yuan)
+* udev: fix rules for CentOS7/RHEL7 (Loic Dachary)
+* use clock_gettime instead of gettimeofday (Jianpeng Ma)
+* vstart.sh: set up environment for s3-tests (Luis Pabon)
+* vstart.sh: work with cmake (Yehuda Sadeh)
+
+
+
+
+
+
+v0.93
+=====
+
+This is the first release candidate for Hammer, and includes all of
+the features that will be present in the final release.  We welcome
+and encourage any and all testing in non-production clusters to identify
+any problems with functionality, stability, or performance before the
+final Hammer release.
+
+We suggest some caution in one area: librbd.  There is a lot of new
+functionality around object maps and locking that is disabled by
+default but may still affect stability for existing images.  We are
+continuing to shake out those bugs so that the final Hammer release
+(probably v0.94) will be rock solid.
+
+Major features since Giant include:
+
+* cephfs: journal scavenger repair tool (John Spray)
+* crush: new and improved straw2 bucket type (Sage Weil, Christina Anderson, Xiaoxi Chen)
+* doc: improved guidance for CephFS early adopters (John Spray)
+* librbd: add per-image object map for improved performance (Jason Dillaman)
+* librbd: copy-on-read (Min Chen, Li Wang, Yunchuan Wen, Cheng Cheng)
+* librados: fadvise-style IO hints (Jianpeng Ma)
+* mds: many many snapshot-related fixes (Yan, Zheng)
+* mon: new 'ceph osd df' command (Mykola Golub)
+* mon: new 'ceph pg ls ...' command (Xinxin Shu)
+* osd: improved performance for high-performance backends
+* osd: improved recovery behavior (Samuel Just)
+* osd: improved cache tier behavior with reads (Zhiqiang Wang)
+* rgw: S3-compatible bucket versioning support (Yehuda Sadeh)
+* rgw: large bucket index sharding (Guang Yang, Yehuda Sadeh)
+* RDMA "xio" messenger support (Matt Benjamin, Vu Pham)
+
+Upgrading
+---------
+
+* If you are upgrading from v0.92, you must stop all OSD daemons and flush their
+  journals (``ceph-osd -i NNN --flush-journal``) before upgrading.  There was
+  a transaction encoding bug in v0.92 that broke compatibility.  Upgrading from
+  v0.91 or anything earlier is safe.
+
+* No special restrictions when upgrading from firefly or giant.
+
+Notable Changes
+---------------
+
+* build: CMake support (Ali Maredia, Casey Bodley, Adam Emerson, Marcus Watts, Matt Benjamin)
+* ceph-disk: do not re-use partition if encryption is required (Loic Dachary)
+* ceph-disk: support LUKS for encrypted partitions (Andrew Bartlett, Loic Dachary)
+* ceph-fuse,libcephfs: add support for O_NOFOLLOW and O_PATH (Greg Farnum)
+* ceph-fuse,libcephfs: resend requests before completing cap reconnect (#10912 Yan, Zheng)
+* ceph-fuse: select kernel cache invalidation mechanism based on kernel version (Greg Farnum)
+* ceph-objectstore-tool: improved import (David Zafman)
+* ceph-objectstore-tool: misc improvements, fixes (#9870 #9871 David Zafman)
+* ceph: add 'ceph osd df [tree]' command (#10452 Mykola Golub)
+* ceph: fix 'ceph tell ...' command validation (#10439 Joao Eduardo Luis)
+* ceph: improve 'ceph osd tree' output (Mykola Golub)
+* cephfs-journal-tool: add recover_dentries function (#9883 John Spray)
+* common: add newline to flushed json output (Sage Weil)
+* common: filtering for 'perf dump' (John Spray)
+* common: fix Formatter factory breakage (#10547 Loic Dachary)
+* common: make json-pretty output prettier (Sage Weil)
+* crush: new and improved straw2 bucket type (Sage Weil, Christina Anderson, Xiaoxi Chen)
+* crush: update tries stats for indep rules (#10349 Loic Dachary)
+* crush: use larger choose_tries value for erasure code rulesets (#10353 Loic Dachary)
+* debian,rpm: move RBD udev rules to ceph-common (#10864 Ken Dreyer)
+* debian: split python-ceph into python-{rbd,rados,cephfs} (Boris Ranto)
+* doc: CephFS disaster recovery guidance (John Spray)
+* doc: CephFS for early adopters (John Spray)
+* doc: fix OpenStack Glance docs (#10478 Sebastien Han)
+* doc: misc updates (#9793 #9922 #10204 #10203 Travis Rhoden, Hazem Amara, Anis Ayari, Florian Coste, Andy Allan, Frank Yu, Baptiste Veuillez-Mainard, Yuan Zhou, Armando Segnini, Robert Jansen, Tyler Brekke, Viktor Suprun)
+* doc: replace cloudfiles with swiftclient Python Swift example (Tim Freund)
+* erasure-code: add mSHEC erasure code support (Takeshi Miyamae)
+* erasure-code: improved docs (#10340 Loic Dachary)
+* erasure-code: set max_size to 20 (#10363 Loic Dachary)
+* libcephfs,ceph-fuse: fix getting zero-length xattr (#10552 Yan, Zheng)
+* librados: add blacklist_add convenience method (Jason Dillaman)
+* librados: expose rados_{read|write}_op_assert_version in C API (Kim Vandry)
+* librados: fix pool name caching (#10458 Radoslaw Zarzynski)
+* librados: fix resource leak, misc bugs (#10425 Radoslaw Zarzynski)
+* librados: fix some watch/notify locking (Jason Dillaman, Josh Durgin)
+* libradosstriper: fix write_full when ENOENT (#10758 Sebastien Ponce)
+* librbd: CRC protection for RBD image map (Jason Dillaman)
+* librbd: add per-image object map for improved performance (Jason Dillaman)
+* librbd: add support for an "object map" indicating which objects exist (Jason Dillaman)
+* librbd: adjust internal locking (Josh Durgin, Jason Dillaman)
+* librbd: better handling of watch errors (Jason Dillaman)
+* librbd: coordinate maintenance operations through lock owner (Jason Dillaman)
+* librbd: copy-on-read (Min Chen, Li Wang, Yunchuan Wen, Cheng Cheng, Jason Dillaman)
+* librbd: enforce write ordering with a snapshot (Jason Dillaman)
+* librbd: fadvise-style hints; add misc hints for certain operations (Jianpeng Ma)
+* librbd: fix coverity false-positives (Jason Dillaman)
+* librbd: fix snap create races (Jason Dillaman)
+* librbd: flush AIO operations asynchronously (#10714 Jason Dillaman)
+* librbd: make async versions of long-running maintenance operations (Jason Dillaman)
+* librbd: mock tests (Jason Dillaman)
+* librbd: optionally blacklist clients before breaking locks (#10761 Jason Dillaman)
+* librbd: prevent copyup during shrink (Jason Dillaman)
+* mds: add cephfs-table-tool (John Spray)
+* mds: avoid sending traceless replies in most cases (Yan, Zheng)
+* mds: export dir asok command (John Spray)
+* mds: fix stray/purge perfcounters (#10388 John Spray)
+* mds: handle heartbeat_reset during shutdown (#10382 John Spray)
+* mds: many many snapshot-related fixes (Yan, Zheng)
+* mds: refactor, improve Session storage (John Spray)
+* misc coverity fixes (Danny Al-Gaaf)
+* mon: add noforward flag for some mon commands (Mykola Golub)
+* mon: disallow empty pool names (#10555 Wido den Hollander)
+* mon: do not deactivate last mds (#10862 John Spray)
+* mon: drop old ceph_mon_store_converter (Sage Weil)
+* mon: fix 'ceph pg dump_stuck degraded' (Xinxin Shu)
+* mon: fix 'profile osd' use of config-key function on mon (#10844 Joao Eduardo Luis)
+* mon: fix compatset initialization during mkfs (Joao Eduardo Luis)
+* mon: fix feature tracking during elections (Joao Eduardo Luis)
+* mon: fix mds gid/rank/state parsing (John Spray)
+* mon: ignore failure reports from before up_from (#10762 Dan van der Ster, Sage Weil)
+* mon: improved corrupt CRUSH map detection (Joao Eduardo Luis)
+* mon: include pg_temp count in osdmap summary (Sage Weil)
+* mon: log health summary to cluster log (#9440 Joao Eduardo Luis)
+* mon: make 'mds fail' idempotent (John Spray)
+* mon: make pg dump {sum,pgs,pgs_brief} work for format=plain (#5963 #6759 Mykola Golub)
+* mon: new pool safety flags nodelete, nopgchange, nosizechange (#9792 Mykola Golub)
+* mon: new, friendly 'ceph pg ls ...' command (Xinxin Shu)
+* mon: prevent MDS transition from STOPPING (#10791 Greg Farnum)
+* mon: propose all pending work in one transaction (Sage Weil)
+* mon: remove pg_temps for nonexistent pools (Joao Eduardo Luis)
+* mon: require mon_allow_pool_delete option to remove pools (Sage Weil)
+* mon: set globalid prealloc to larger value (Sage Weil)
+* mon: skip zeroed osd stats in get_rule_avail (#10257 Joao Eduardo Luis)
+* mon: validate min_size range (Jianpeng Ma)
+* msgr: async: bind threads to CPU cores, improved poll (Haomai Wang)
+* msgr: fix crc configuration (Mykola Golub)
+* msgr: misc unit tests (Haomai Wang)
+* msgr: xio: XioMessenger RDMA support (Casey Bodley, Vu Pham, Matt Benjamin)
+* osd, librados: fadvise-style librados hints (Jianpeng Ma)
+* osd, librados: fix xattr_cmp_u64 (Dongmao Zhang)
+* osd,mon: add 'norebalance' flag (Kefu Chai)
+* osd,mon: specify OSD features explicitly in MOSDBoot (#10911 Sage Weil)
+* osd: add option to prioritize heartbeat network traffic (Jian Wen)
+* osd: add support for the SHEC erasure-code algorithm (Takeshi Miyamae, Loic Dachary)
+* osd: allow recovery while below min_size (Samuel Just)
+* osd: allow recovery with fewer than min_size OSDs (Samuel Just)
+* osd: allow writes to degraded objects (Samuel Just)
+* osd: avoid publishing unchanged PG stats (Sage Weil)
+* osd: cache recent ObjectContexts (Dong Yuan)
+* osd: clear cache on interval change (Samuel Just)
+* osd: do not proxy reads unless target OSDs are new (#10788 Sage Weil)
+* osd: do not update digest on inconsistent object (#10524 Samuel Just)
+* osd: don't record digests for snapdirs (#10536 Samuel Just)
+* osd: fix OSDCap parsing on old (el6) boost::spirit (#10757 Kefu Chai)
+* osd: fix ObjectStore::Transaction encoding version (#10734 Samuel Just)
+* osd: fix auth object selection during repair (#10524 Samuel Just)
+* osd: fix bug in pending digest updates (#10840 Samuel Just)
+* osd: fix cancel_proxy_read_ops (Sage Weil)
+* osd: fix cleanup of interrupted pg deletion (#10617 Sage Weil)
+* osd: fix journal wrapping bug (#10883 David Zafman)
+* osd: fix leak in SnapTrimWQ (#10421 Kefu Chai)
+* osd: fix memstore free space calculation (Xiaoxi Chen)
+* osd: fix mixed-version peering issues (Samuel Just)
+* osd: fix object digest update bug (#10840 Samuel Just)
+* osd: fix ordering issue with new transaction encoding (#10534 Dong Yuan)
+* osd: fix past_interval generation (#10427 #10430 David Zafman)
+* osd: fix short read handling on push (#8121 David Zafman)
+* osd: fix watch timeout cache state update (#10784 David Zafman)
+* osd: force promotion of watch/notify ops (Zhiqiang Wang)
+* osd: improve idempotency detection across cache promotion/demotion (#8935 Sage Weil, Samuel Just)
+* osd: include activating peers in blocked_by (#10477 Sage Weil)
+* osd: jerasure and gf-complete updates from upstream (#10216 Loic Dachary)
+* osd: journal: check fsync/fdatasync result (Jianpeng Ma)
+* osd: journal: fix hang on shutdown (#10474 David Zafman)
+* osd: journal: fix header.committed_up_to (Xinze Chi)
+* osd: journal: initialize throttle (Ning Yao)
+* osd: journal: misc bug fixes (#6003 David Zafman, Samuel Just)
+* osd: misc cleanup (Xinze Chi, Yongyue Sun)
+* osd: new 'activating' state between peering and active (Sage Weil)
+* osd: preserve reqids for idempotency checks for promote/demote (Sage Weil, Zhiqiang Wang, Samuel Just)
+* osd: remove dead locking code (Xinxin Shu)
+* osd: restrict scrub to certain times of day (Xinze Chi)
+* osd: rocksdb: fix shutdown (Haomai Wang)
+* pybind: fix memory leak in librados bindings (Billy Olsen)
+* qa: fix mds tests (#10539 John Spray)
+* qa: ignore duplicates in rados ls (Josh Durgin)
+* qa: improve hadoop tests (Noah Watkins)
+* qa: reorg fs quota tests (Greg Farnum)
+* rados: fix usage (Jianpeng Ma)
+* radosgw-admin: add replicalog update command (Yehuda Sadeh)
+* rbd-fuse: clean up on shutdown (Josh Durgin)
+* rbd: add 'merge-diff' function (MingXin Liu, Yunchuan Wen, Li Wang)
+* rbd: fix buffer handling on image import (#10590 Jason Dillaman)
+* rbd: leave exclusive locking off by default (Jason Dillaman)
+* rbd: update init-rbdmap to fix dup mount point (Karel Striegel)
+* rbd: use IO hints for import, export, and bench operations (#10462 Jason Dillaman)
+* rbd_recover_tool: RBD image recovery tool (Min Chen)
+* rgw: S3-style object versioning support (Yehuda Sadeh)
+* rgw: check keystone auth for S3 POST requests (#10062 Abhishek Lekshmanan)
+* rgw: extend replica log API (purge-all) (Yehuda Sadeh)
+* rgw: fail S3 POST if keystone not configured (#10688 Valery Tschopp, Yehuda Sadeh)
+* rgw: fix XML header on get ACL request (#10106 Yehuda Sadeh)
+* rgw: fix bucket removal with data purge (Yehuda Sadeh)
+* rgw: fix replica log indexing (#8251 Yehuda Sadeh)
+* rgw: fix swift metadata header name (Dmytro Iurchenko)
+* rgw: remove multipart entries from bucket index on abort (#10719 Yehuda Sadeh)
+* rgw: respond with 204 to POST on containers (#10667 Yuan Zhou)
+* rgw: reuse fcgx connection struct (#10194 Yehuda Sadeh)
+* rgw: support multiple host names (#7467 Yehuda Sadeh)
+* rgw: swift: dump container's custom metadata (#10665 Ahmad Faheem, Dmytro Iurchenko)
+* rgw: swift: support Accept header for response format (#10746 Dmytro Iurchenko)
+* rgw: swift: support for X-Remove-Container-Meta-{key} (#10475 Dmytro Iurchenko)
+* rpm: move rgw logrotate to radosgw subpackage (Ken Dreyer)
+* tests: centos-6 dockerfile (#10755 Loic Dachary)
+* tests: unit tests for shared_cache (Dong Yuan)
+* vstart.sh: work with cmake (Yehuda Sadeh)
+
+
+
+v0.92
+=====
+
+This is the second-to-last chunk of new stuff before Hammer.  Big items
+include additional checksums on OSD objects, proxied reads in the
+cache tier, image locking in RBD, optimized OSD Transaction and
+replication messages, and a big pile of RGW and MDS bug fixes.
+
+Upgrading
+---------
+
+* The experimental 'keyvaluestore-dev' OSD backend has been renamed
+  'keyvaluestore' (for simplicity) and marked as experimental.  To
+  enable this untested feature and acknowledge that you understand
+  that it is untested and may destroy data, you need to add the
+  following to your ceph.conf::
+
+    enable experimental unrecoverable data corrupting features = keyvaluestore
+
+* The following librados C API function calls take a 'flags' argument whose value
+  is now correctly interpreted:
+
+     rados_write_op_operate()
+     rados_aio_write_op_operate()
+     rados_read_op_operate()
+     rados_aio_read_op_operate()
+
+  The flags were not being correctly translated from the librados constants to the
+  internal values.  Now they are.  Any code that is passing flags to these methods
+  should be audited to ensure that they are using the correct LIBRADOS_OP_FLAG_*
+  constants.
+
+* The 'rados' CLI 'copy' and 'cppool' commands now use the copy-from operation,
+  which means the latest CLI cannot run these commands against pre-firefly OSDs.
+
+* The librados watch/notify API now includes a watch_flush() operation to flush
+  the async queue of notify operations.  This should be called by any watch/notify
+  user prior to rados_shutdown(), as shown in the sketch below.
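+
+As a minimal C sketch of the flags and watch_flush() points above (the
+object name, read size, and the particular flag constant are illustrative
+assumptions only, and error handling is omitted)::
+
+    #include <rados/librados.h>
+
+    void audit_example(rados_t cluster, rados_ioctx_t io)
+    {
+        char buf[512];
+        size_t bytes_read = 0;
+        int prval = 0;
+
+        rados_read_op_t op = rados_read_op_create();
+        rados_read_op_read(op, 0, sizeof(buf), buf, &bytes_read, &prval);
+
+        /* Pass a named librados constant (here LIBRADOS_OPERATION_BALANCE_READS),
+         * never a raw internal value: as of v0.92 this argument is translated
+         * rather than passed through unchanged. */
+        rados_read_op_operate(op, io, "example-object",
+                              LIBRADOS_OPERATION_BALANCE_READS);
+        rados_read_op_release(op);
+
+        /* Watch/notify users should drain pending notify callbacks
+         * before tearing the cluster handle down. */
+        rados_watch_flush(cluster);
+        rados_shutdown(cluster);
+    }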
+
+Notable Changes
+---------------
+
+* add experimental features option (Sage Weil)
+* build: fix 'make check' races (#10384 Loic Dachary)
+* build: fix pkg names when libkeyutils is missing (Pankaj Garg, Ken Dreyer)
+* ceph: make 'ceph -s' show PG state counts in sorted order (Sage Weil)
+* ceph: make 'ceph tell mon.* version' work (Mykola Golub)
+* ceph-monstore-tool: fix/improve CLI (Joao Eduardo Luis)
+* ceph: show primary-affinity in 'ceph osd tree' (Mykola Golub)
+* common: add TableFormatter (Andreas Peters)
+* common: check syncfs() return code (Jianpeng Ma)
+* doc: do not suggest dangerous XFS nobarrier option (Dan van der Ster)
+* doc: misc updates (Nilamdyuti Goswami, John Wilkins)
+* install-deps.sh: do not require sudo when root (Loic Dachary)
+* libcephfs: fix dirfrag trimming (#10387 Yan, Zheng)
+* libcephfs: fix mount timeout (#10041 Yan, Zheng)
+* libcephfs: fix test (#10415 Yan, Zheng)
+* libcephfs: fix use-after-free on umount (#10412 Yan, Zheng)
+* libcephfs: include ceph and git version in client metadata (Sage Weil)
+* librados: add watch_flush() operation (Sage Weil, Haomai Wang)
+* librados: avoid memcpy on getxattr, read (Jianpeng Ma)
+* librados: create ioctx by pool id (Jason Dillaman)
+* librados: do notify completion in fast-dispatch (Sage Weil)
+* librados: remove shadowed variable (Kefu Chai)
+* librados: translate op flags from C APIs (Matthew Richards)
+* librbd: differentiate between R/O vs R/W features (Jason Dillaman)
+* librbd: exclusive image locking (Jason Dillaman)
+* librbd: fix write vs import race (#10590 Jason Dillaman)
+* librbd: gracefully handle deleted/renamed pools (#10270 Jason Dillaman)
+* mds: asok command for fetching subtree map (John Spray)
+* mds: constify MDSCacheObjects (John Spray)
+* misc: various valgrind fixes and cleanups (Danny Al-Gaaf)
+* mon: fix 'mds fail' for standby MDSs (John Spray)
+* mon: fix stashed monmap encoding (#5203 Xie Rui)
+* mon: implement 'fs reset' command (John Spray)
+* mon: respect down flag when promoting standbys (John Spray)
+* mount.ceph: fix spurious error message (#10351 Yan, Zheng)
+* msgr: async: many fixes, unit tests (Haomai Wang)
+* msgr: simple: retry binding to port on failure (#10029 Wido den Hollander)
+* osd: add fadvise flags to ObjectStore API (Jianpeng Ma)
+* osd: add get_latest_osdmap asok command (#9483 #9484 Mykola Golub)
+* osd: EIO on whole-object reads when checksum is wrong (Sage Weil)
+* osd: filejournal: don't cache journal when not using direct IO (Jianpeng Ma)
+* osd: fix ioprio option (Mykola Golub)
+* osd: fix scrub delay bug (#10693 Samuel Just)
+* osd: fix watch reconnect race (#10441 Sage Weil)
+* osd: handle no-op write with snapshot (#10262 Sage Weil)
+* osd: journal: fix journal zeroing when direct IO is enabled (Xie Rui)
+* osd: keyvaluestore: cleanup dead code (Ning Yao)
+* osd, mds: 'ops' as shorthand for 'dump_ops_in_flight' on asok (Sage Weil)
+* osd: memstore: fix size limit (Xiaoxi Chen)
+* osd: misc scrub fixes (#10017 Loic Dachary)
+* osd: new optimized encoding for ObjectStore::Transaction (Dong Yuan)
+* osd: optimize filter_snapc (Ning Yao)
+* osd: optimize WBThrottle map with unordered_map (Ning Yao)
+* osd: proxy reads during cache promote (Zhiqiang Wang)
+* osd: proxy read support (Zhiqiang Wang)
+* osd: remove legacy classic scrub code (Sage Weil)
+* osd: remove unused fields in MOSDSubOp (Xiaoxi Chen)
+* osd: replace MOSDSubOp messages with simpler, optimized MOSDRepOp (Xiaoxi Chen)
+* osd: store whole-object checksums on scrub, write_full (Sage Weil)
+* osd: verify kernel is new enough before using XFS extsize ioctl, enable by default (#9956 Sage Weil)
+* rados: use copy-from operation for copy, cppool (Sage Weil)
+* rgw: change multipart upload id magic (#10271 Yehuda Sadeh)
+* rgw: decode http query params correctly (#10271 Yehuda Sadeh)
+* rgw: fix content length check (#10701 Axel Dunkel, Yehuda Sadeh)
+* rgw: fix partial GET in swift (#10553 Yehuda Sadeh)
+* rgw: fix shutdown (#10472 Yehuda Sadeh)
+* rgw: include XML ns on get ACL request (#10106 Yehuda Sadeh)
+* rgw: misc fixes (#10307 Yehuda Sadeh)
+* rgw: only track cleanup for objects we write (#10311 Yehuda Sadeh)
+* rgw: tweak error codes (#10329 #10334 Yehuda Sadeh)
+* rgw: use gc for multipart abort (#10445 Aaron Bassett, Yehuda Sadeh)
+* sysvinit: fix race in 'stop' (#10389 Loic Dachary)
+* test: fix bufferlist tests (Jianpeng Ma)
+* tests: improve docker-based tests (Loic Dachary)
+
+
+v0.91
+=====
+
+We are quickly approaching the Hammer feature freeze but have a few
+more dev releases to go before we get there.  The headline items are
+subtree-based quota support in CephFS (ceph-fuse/libcephfs client
+support only for now), a rewrite of the watch/notify librados API used
+by RBD and RGW, OSDMap checksums to ensure that maps are always
+consistent inside the cluster, new API calls in librados and librbd
+for IO hinting modeled after posix_fadvise, and improved storage of
+per-PG state.
+
+We expect two more releases before the Hammer feature freeze (v0.93).
+
+Upgrading
+---------
+
+* The 'category' field for objects has been removed.  This was originally added
+  to track PG stat summations over different categories of objects for use by
+  radosgw.  It no longer has any known users and is prone to abuse because it
+  can lead to a pg_stat_t structure that is unbounded.  The librados API calls
+  that accept this field now ignore it, and the OSD no longer tracks the
+  per-category summations.
+
+* The output for 'rados df' has changed.  The 'category' level has been
+  eliminated, so there is now a single stat object per pool.  The structure of
+  the JSON output is different, and the plaintext output has one less column.
+
+* The 'rados create <objectname> [category]' optional category argument is no
+  longer supported or recognized.
+
+* rados.py's Rados class no longer has a __del__ method; it was causing
+  problems on interpreter shutdown and use of threads.  If your code has
+  Rados objects with limited lifetimes and you're concerned about locked
+  resources, call Rados.shutdown() explicitly.
+
+* There is a new version of the librados watch/notify API with vastly
+  improved semantics.  Any applications using this interface are
+  encouraged to migrate to the new API.  The old API calls are marked
+  as deprecated and will eventually be removed.
+
+* The librados rados_unwatch() call used to be safe to call on an
+  invalid handle.  The new version has undefined behavior when passed
+  a bogus value (for example, when rados_watch() returns an error and
+  handle is not defined); see the sketch below.
+
+* The structure of the formatted 'pg stat' command is changed for the
+  portion that counts states by name to avoid using the '+' character
+  (which appears in state names) as part of the XML token (it is not
+  legal).
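+
+A minimal C sketch of the handle discipline required by the new
+watch/notify API, using its v2 entry points (the object name and the
+no-op callbacks are illustrative assumptions; a real watcher would
+acknowledge notifies and handle errors)::
+
+    #include <rados/librados.h>
+
+    static void watch_cb(void *arg, uint64_t notify_id, uint64_t cookie,
+                         uint64_t notifier_id, void *data, size_t data_len)
+    {
+        /* a real application would process the notify payload here */
+    }
+
+    static void watch_err_cb(void *arg, uint64_t cookie, int err)
+    {
+        /* invoked on watch errors, e.g. if the session is lost */
+    }
+
+    void watch_example(rados_ioctx_t io)
+    {
+        uint64_t cookie;
+        int r = rados_watch2(io, "example-object", &cookie,
+                             watch_cb, watch_err_cb, NULL);
+        if (r < 0)
+            return;  /* do NOT unwatch: no valid handle was established */
+
+        /* ... */
+
+        rados_unwatch2(io, cookie);  /* only on a successfully established watch */
+    }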
+
+Notable Changes
+---------------
+
+* asyncmsgr: misc fixes (Haomai Wang)
+* buffer: add 'shareable' construct (Matt Benjamin)
+* build: aarch64 build fixes (Noah Watkins, Haomai Wang)
+* build: support for jemalloc (Shishir Gowda)
+* ceph-disk: allow journal partition re-use (#10146 Loic Dachary, Dan van der Ster)
+* ceph-disk: misc fixes (Christos Stavrakakis)
+* ceph-fuse: fix kernel cache trimming (#10277 Yan, Zheng)
+* ceph-objectstore-tool: many many improvements (David Zafman)
+* common: support new gperftools header locations (Ken Dreyer)
+* crush: straw bucket weight calculation fixes (#9998 Sage Weil)
+* doc: misc improvements (Nilamdyuti Goswami, John Wilkins, Chris Holcombe)
+* libcephfs,ceph-fuse: add 'status' asok (John Spray)
+* librados, osd: new watch/notify implementation (Sage Weil)
+* librados: drop 'category' feature (Sage Weil)
+* librados: fix pool deletion handling (#10372 Sage Weil)
+* librados: new fadvise API (Ma Jianpeng)
+* libradosstriper: fix remove() (Dongmao Zhang)
+* librbd: complete pending ops before closing image (#10299 Josh Durgin)
+* librbd: fadvise API (Ma Jianpeng)
+* mds: ENOSPC and OSDMap epoch barriers (#7317 John Spray)
+* mds: dirfrag buf fix (Yan, Zheng)
+* mds: disallow most commands on inactive MDSs (Greg Farnum)
+* mds: drop dentries, leases on deleted directories (#10164 Yan, Zheng)
+* mds: handle zero-size xattr (#10335 Yan, Zheng)
+* mds: subtree quota support (Yunchuan Wen)
+* memstore: free space tracking (John Spray)
+* misc cleanup (Danny Al-Gaaf, David Anderson)
+* mon: 'osd crush reweight-all' command (Sage Weil)
+* mon: allow full flag to be manually cleared (#9323 Sage Weil)
+* mon: delay failure injection (Joao Eduardo Luis)
+* mon: fix paxos timeouts (#10220 Joao Eduardo Luis)
+* mon: get canonical OSDMap from leader (#10422 Sage Weil)
+* msgr: fix RESETSESSION bug (#10080 Greg Farnum)
+* objectstore: deprecate collection attrs (Sage Weil)
+* osd, mon: add checksums to all OSDMaps (Sage Weil)
+* osd: allow deletion of objects with watcher (#2339 Sage Weil)
+* osd: allow sparse read for Push/Pull (Haomai Wang)
+* osd: cache reverse_nibbles hash value (Dong Yuan)
+* osd: drop upgrade support for pre-dumpling (Sage Weil)
+* osd: enable and use posix_fadvise (Sage Weil)
+* osd: erasure-code: enforce chunk size alignment (#10211 Loic Dachary)
+* osd: erasure-code: jerasure support for NEON (Loic Dachary)
+* osd: erasure-code: relax cauchy w restrictions (#10325 David Zhang, Loic Dachary)
+* osd: erasure-code: update gf-complete to latest upstream (Loic Dachary)
+* osd: fix WBThrottle perf counters (Haomai Wang)
+* osd: fix backfill bug (#10150 Samuel Just)
+* osd: fix occasional peering stalls (#10431 Sage Weil)
+* osd: fix scrub vs try-flush bug (#8011 Samuel Just)
+* osd: fix stderr with -f or -d (Dan Mick)
+* osd: misc FIEMAP fixes (Ma Jianpeng)
+* osd: optimize Finisher (Xinze Chi)
+* osd: store PG metadata in per-collection objects for better concurrency (Sage Weil)
+* pyrados: add object lock support (#6114 Mehdi Abaakouk)
+* pyrados: fix misnamed wait_* routines (#10104 Dan Mick)
+* pyrados: misc cleanups (Kefu Chai)
+* qa: add large auth ticket tests (Ilya Dryomov)
+* qa: many 'make check' improvements (Loic Dachary)
+* qa: misc tests (Loic Dachary, Yan, Zheng)
+* rgw: conditional PUT on ETag (#8562 Ray Lv)
+* rgw: fix error codes (#10334 #10329 Yehuda Sadeh)
+* rgw: index swift keys appropriately (#10471 Yehuda Sadeh)
+* rgw: prevent illegal bucket policy that doesn't match placement rule (Yehuda Sadeh)
+* rgw: run radosgw as apache with systemd (#10125 Loic Dachary)
+* rgw: support X-Storage-Policy header for Swift storage policy compat (Yehuda Sadeh)
+* rgw: use \r\n for http headers (#9254 Yehuda Sadeh)
+* rpm: misc fixes (Ken Dreyer)
+
+
+v0.90
+=====
+
+This is the last development release before Christmas.  There are some
+API cleanups for librados and librbd, and lots of bug fixes across the
+board for the OSD, MDS, RGW, and CRUSH.  The OSD also gets support for
+discard (potentially helpful on SSDs, although it is off by default), and there
+are several improvements to ceph-disk.
+
+The next two development releases will be getting a slew of new
+functionality for hammer.  Stay tuned!
+
+Upgrading
+---------
+
+* Previously, the formatted output of 'ceph pg stat -f ...' was a full
+  pg dump that included all metadata about all PGs in the system.  It
+  is now a concise summary of high-level PG stats, just like the
+  unformatted 'ceph pg stat' command.
+
+* All JSON dumps of floating point values were incorrectly surrounding the
+  value with quotes.  These quotes have been removed.  Any consumer of structured
+  JSON output that was consuming the floating point values was previously having
+  to interpret the quoted string and will most likely need to be fixed to take
+  the unquoted number.
+
+Notable Changes
+---------------
+
+* arch: fix NEON feature detection (#10185 Loic Dachary)
+* build: adjust build deps for yasm, virtualenv (Jianpeng Ma)
+* build: improve build dependency tooling (Loic Dachary)
+* ceph-disk: call partx/partprobe consistently (#9721 Loic Dachary)
+* ceph-disk: fix dmcrypt key permissions (Loic Dachary)
+* ceph-disk: fix umount race condition (#10096 Blaine Gardner)
+* ceph-disk: init=none option (Loic Dachary)
+* ceph-monstore-tool: fix shutdown (#10093 Loic Dachary)
+* ceph-objectstore-tool: fix import (#10090 David Zafman)
+* ceph-objectstore-tool: many improvements and tests (David Zafman)
+* ceph.spec: package rbd-replay-prep (Ken Dreyer)
+* common: add 'perf reset ...' admin command (Jianpeng Ma)
+* common: do not unlock rwlock on destruction (Federico Simoncelli)
+* common: fix block device discard check (#10296 Sage Weil)
+* common: remove broken CEPH_LOCKDEP option (Kefu Chai)
+* crush: fix tree bucket behavior (Rongze Zhu)
+* doc: add build-doc guidelines for Fedora and CentOS/RHEL (Nilamdyuti Goswami)
+* doc: enable rbd cache on openstack deployments (Sebastien Han)
+* doc: improved installation notes on CentOS/RHEL installs (John Wilkins)
+* doc: misc cleanups (Adam Spiers, Sebastien Han, Nilamdyuti Goswami, Ken Dreyer, John Wilkins)
+* doc: new man pages (Nilamdyuti Goswami)
+* doc: update release descriptions (Ken Dreyer)
+* doc: update sepia hardware inventory (Sandon Van Ness)
+* librados: only export public API symbols (Jason Dillaman)
+* libradosstriper: fix stat strtoll (Dongmao Zhang)
+* libradosstriper: fix trunc method (#10129 Sebastien Ponce)
+* librbd: fix list_children from invalid pool ioctxs (#10123 Jason Dillaman)
+* librbd: only export public API symbols (Jason Dillaman)
+* many coverity fixes (Danny Al-Gaaf)
+* mds: 'flush journal' admin command (John Spray)
+* mds: fix MDLog IO callback deadlock (John Spray)
+* mds: fix deadlock during journal probe vs purge (#10229 Yan, Zheng)
+* mds: fix race trimming log segments (Yan, Zheng)
+* mds: store backtrace for stray dir (Yan, Zheng)
+* mds: verify backtrace when fetching dirfrag (#9557 Yan, Zheng)
+* mon: add max pgs per osd warning (Sage Weil)
+* mon: fix *_ratio* units and types (Sage Weil)
+* mon: fix JSON dumps to dump floats as floats and not strings (Sage Weil)
+* mon: fix formatter 'pg stat' command output (Sage Weil)
+* msgr: async: several fixes (Haomai Wang)
+* msgr: simple: fix rare deadlock (Greg Farnum)
+* osd: batch pg log trim (Xinze Chi)
+* osd: clean up internal ObjectStore interface (Sage Weil)
+* osd: do not abort deep scrub on missing hinfo (#10018 Loic Dachary)
+* osd: fix ghobject_t formatted output to include shard (#10063 Loic Dachary)
+* osd: fix osd peer check on scrub messages (#9555 Sage Weil)
+* osd: fix pgls filter ops (#9439 David Zafman)
+* osd: flush snapshots from cache tier immediately (Sage Weil)
+* osd: keyvaluestore: fix getattr semantics (Haomai Wang)
+* osd: keyvaluestore: fix key ordering (#10119 Haomai Wang)
+* osd: limit in-flight read requests (Jason Dillaman)
+* osd: log when scrub or repair starts (Loic Dachary)
+* osd: support for discard for journal trim (Jianpeng Ma)
+* qa: fix osd create dup tests (#10083 Loic Dachary)
+* rgw: add location header when object is in another region (VRan Liu)
+* rgw: check timestamp on s3 keystone auth (#10062 Abhishek Lekshmanan)
+* rgw: make sysvinit script set ulimit -n properly (Sage Weil)
+* systemd: better systemd unit files (Owen Synge)
+* tests: ability to run unit tests under docker (Loic Dachary)
+
+
 v0.89
 =====
@@ -184,6 +2187,155 @@ Notable Changes
 * vstart.sh: set up environment for s3-tests (Luis Pabon)
 
+v0.87.2 Giant
+=============
+
+This is the second (and possibly final) point release for Giant.
+
+We recommend all v0.87.x Giant users upgrade to this release.
+
+Notable Changes
+---------------
+
+* ceph-objectstore-tool: only output unsupported features when incompatible (#11176 David Zafman)
+* common: do not implicitly unlock rwlock on destruction (Federico Simoncelli)
+* common: make wait timeout on empty queue configurable (#10818 Samuel Just)
+* crush: pick ruleset id that matches the rule id (Xiaoxi Chen)
+* crush: set_choose_tries = 100 for new erasure code rulesets (#10353 Loic Dachary)
+* librados: check initialized atomic safely (#9617 Josh Durgin)
+* librados: fix failed tick_event assert (#11183 Zhiqiang Wang)
+* librados: fix looping on skipped maps (#9986 Ding Dinghua)
+* librados: fix op submit with timeout (#10340 Samuel Just)
+* librados: pybind: fix memory leak (#10723 Billy Olsen)
+* librados: pybind: keep reference to callbacks (#10775 Josh Durgin)
+* librados: translate operation flags from C APIs (Matthew Richards)
+* libradosstriper: fix write_full on ENOENT (#10758 Sebastien Ponce)
+* libradosstriper: use strtoll instead of strtol (Dongmao Zhang)
+* mds: fix assertion caused by system time moving backwards (#11053 Yan, Zheng)
+* mon: allow injection of random delays on writes (Joao Eduardo Luis)
+* mon: do not trust small osd epoch cache values (#10787 Sage Weil)
+* mon: fail non-blocking flush if object is being scrubbed (#8011 Samuel Just)
+* mon: fix division by zero in stats dump (Joao Eduardo Luis)
+* mon: fix get_rule_avail when no osds (#10257 Joao Eduardo Luis)
+* mon: fix timeout rounds period (#10546 Joao Eduardo Luis)
+* mon: ignore osd failures before up_from (#10762 Dan van der Ster, Sage Weil)
+* mon: paxos: reset accept timeout before writing to store (#10220 Joao Eduardo Luis)
+* mon: return if fs exists on 'fs new' (Joao Eduardo Luis)
+* mon: use EntityName when expanding profiles (#10844 Joao Eduardo Luis)
+* mon: verify cross-service proposal preconditions (#10643 Joao Eduardo Luis)
+* mon: wait for osdmon to be writeable when requesting proposal (#9794 Joao Eduardo Luis)
+* mount.ceph: avoid spurious error message about /etc/mtab (#10351 Yan, Zheng)
+* msg/simple: allow RESETSESSION when we forget an endpoint (#10080 Greg Farnum)
+* msg/simple: discard delay queue before incoming queue (#9910 Sage Weil)
+* osd: clear_primary_state when leaving Primary (#10059 Samuel Just)
+* osd: do not ignore deleted pgs on startup (#10617 Sage Weil)
+* osd: fix FileJournal wrap to get header out first (#10883 David Zafman)
+* osd: fix PG leak in SnapTrimWQ (#10421 Kefu Chai)
+* osd: fix journalq population in do_read_entry (#6003 Samuel Just)
+* osd: fix operator== for op_queue_age_hit and fs_perf_stat (#10259 Samuel Just)
+* osd: fix rare assert after split (#10430 David Zafman)
+* osd: get pgid ancestor from last_map when building past intervals (#10430 David Zafman)
+* osd: include rollback_info_trimmed_to in {read,write}_log (#10157 Samuel Just)
+* osd: lock header_lock in DBObjectMap::sync (#9891 Samuel Just)
+* osd: requeue blocked op before flush it was blocked on (#10512 Sage Weil)
+* osd: tolerate missing object between list and attr get on backfill (#10150 Samuel Just)
+* osd: use correct atime for eviction decision (Xinze Chi)
+* rgw: flush XML header on get ACL request (#10106 Yehuda Sadeh)
+* rgw: index swift keys appropriately (#10471 Hemant Burman, Yehuda Sadeh)
+* rgw: send cancel for bucket index pending ops (#10770 Baijiaruo, Yehuda Sadeh)
+* rgw: swift: support X-Remove-Container-Meta-{key} (#10475 Dmytro Iurchenko)
+
+For more detailed information, see :download:`the complete changelog `.
+
+v0.87.1 Giant
+=============
+
+This is the first (and possibly final) point release for Giant.  Our focus
+on stability fixes will be directed towards Hammer and Firefly.
+
+We recommend that all v0.87 Giant users upgrade to this release.
+
+Upgrading
+---------
+
+* Due to a change in the Linux kernel version 3.18 and the limits of the FUSE
+  interface, ceph-fuse needs to be mounted as root on at least some systems.  See
+  issues #9997, #10277, and #10542 for details.
+
+Notable Changes
+---------------
+
+* build: disable stack-execute bit on assembler objects (#10114 Dan Mick)
+* build: support boost 1.57.0 (#10688 Ken Dreyer)
+* ceph-disk: fix dmcrypt file permissions (#9785 Loic Dachary)
+* ceph-disk: run partprobe after zap, behave with partx or partprobe (#9665 #9721 Loic Dachary)
+* cephfs-journal-tool: fix import for aged journals (#9977 John Spray)
+* cephfs-journal-tool: fix journal import (#10025 John Spray)
+* ceph-fuse: use remount to trim kernel dcache (#10277 Yan, Zheng)
+* common: add cctid meta variable (#6228 Adam Crume)
+* common: fix dump of shard for ghobject_t (#10063 Loic Dachary)
+* crush: fix bucket weight underflow (#9998 Pawel Sadowski)
+* erasure-code: enforce chunk size alignment (#10211 Loic Dachary)
+* erasure-code: regression test suite (#9420 Loic Dachary)
+* erasure-code: relax cauchy w restrictions (#10325 Loic Dachary)
+* libcephfs,ceph-fuse: allow xattr caps on inject_release_failure (#9800 John Spray)
+* libcephfs,ceph-fuse: fix cap flush tid comparison (#9869 Greg Farnum)
+* libcephfs,ceph-fuse: new flag to indicate sorted dcache (#9178 Yan, Zheng)
+* libcephfs,ceph-fuse: prune cache before reconnecting to MDS (Yan, Zheng)
+* librados: limit number of in-flight read requests (#9854 Jason Dillaman)
+* libradospy: fix thread shutdown (#8797 Dan Mick)
+* libradosstriper: fix locking issue in truncate (#10129 Sebastien Ponce)
+* librbd: complete pending ops before closing image (#10299 Jason Dillaman)
+* librbd: fix error path on image open failure (#10030 Jason Dillaman)
+* librbd: gracefully handle deleted/renamed pools (#10270 Jason Dillaman)
+* librbd: handle errors when creating ioctx while listing children (#10123 Jason Dillaman)
+* mds: fix compat version in MClientSession (#9945 John Spray)
+* mds: fix journaler write error handling (#10011 John Spray)
+* mds: fix locking for file size recovery (#10229 Yan, Zheng)
+* mds: handle heartbeat_reset during shutdown (#10382 John Spray)
+* mds: store backtrace for straydir (Yan, Zheng)
+* mon: allow tiers for FS pools (#10135 John Spray)
+* mon: fix caching of last_epoch_clean, osdmap trimming (#9987 Sage Weil)
+* mon: fix 'fs ls' on peons (#10288 John Spray)
+* mon: fix MDS health status from peons (#10151 John Spray)
+* mon: fix paxos off-by-one (#9301 Sage Weil)
+* msgr: simple: do not block on takeover while holding global lock (#9921 Greg Farnum)
+* osd: deep scrub must not abort if hinfo is missing (#10018 Loic Dachary)
+* osd: fix misdirected op detection (#9835 Sage Weil)
+* osd: fix past_interval display for acting (#9752 Loic Dachary)
+* osd: fix PG peering backoff when behind on osdmaps (#10431 Sage Weil)
+* osd: handle no-op write with snapshot case (#10262 Sage Weil)
+* osd: use fast-dispatch (Sage Weil, Greg Farnum)
+* rados: fix write to /dev/null (Loic Dachary)
+* radosgw-admin: create subuser when needed (#10103 Yehuda Sadeh)
+* rbd: avoid invalidating aio_write buffer during image import (#10590 Jason Dillaman)
+* rbd: fix export with images > 2GB (Vicente Cheng)
+* rgw: change multipart upload id magic (#10271 Georgios Dimitrakakis, Yehuda Sadeh)
+* rgw: check keystone auth for S3 POST (#10062 Abhishek Lekshmanan)
+* rgw: check timestamp for S3 keystone auth (#10062 Abhishek Lekshmanan)
+* rgw: fix partial GET with swift (#10553 Yehuda Sadeh)
+* rgw: fix quota disable (#9907 Dong Lei)
+* rgw: fix rare corruption of object metadata on put (#9576 Yehuda Sadeh)
+* rgw: fix S3 object copy content-type (#9478 Yehuda Sadeh)
+* rgw: headers end with \r\n (#9254 Benedikt Fraunhofer, Yehuda Sadeh)
+* rgw: remove swift user manifest DLO hash calculation (#9973 Yehuda Sadeh)
+* rgw: return correct len when len is 0 (#9877 Yehuda Sadeh)
+* rgw: return X-Timestamp field (#8911 Yehuda Sadeh)
+* rgw: run radosgw as apache with systemd (#10125)
+* rgw: send ETag on S3 object copy (#9479 Yehuda Sadeh)
+* rgw: send HTTP status reason explicitly in fastcgi (Yehuda Sadeh)
+* rgw: set length for keystone token validation (#7796 Mark Kirkwood, Yehuda Sadeh)
+* rgw: set ulimit -n on sysvinit before starting daemon (#9587 Sage Weil)
+* rgw: update bucket index on set_attrs (#5595 Yehuda Sadeh)
+* rgw: update swift subuser permission masks when authenticating (#9918 Yehuda Sadeh)
+* rgw: URL decode HTTP query params correctly (#10271 Georgios Dimitrakakis, Yehuda Sadeh)
+* rgw: use cached attrs while reading object attrs (#10307 Yehuda Sadeh)
+* rgw: use strict_strtoll for content length (#10701 Axel Dunkel, Yehuda Sadeh)
+
+For more detailed information, see :download:`the complete changelog `.
+
+
 
 v0.87 Giant
 ===========
@@ -1314,6 +3466,260 @@ Notable Changes
 * rgw: send user manifest header (Yehuda Sadeh)
+* test_librbd_fsx: test krbd as well as librbd (Ilya Dryomov)
+
+v0.80.10 Firefly
+================
+
+This is a bugfix release for Firefly.
+
+We recommend that all Firefly users upgrade.
+
+For more detailed information, see :download:`the complete changelog `.
+
+Notable Changes
+---------------
+
+* build/ops: ceph.spec.in: package mkcephfs on EL6 (`issue#11955 `_, `pr#4924 `_, Ken Dreyer)
+* build/ops: debian: ceph-test and rest-bench debug packages should require their respective binary packages (`issue#11673 `_, `pr#4766 `_, Ken Dreyer)
+* build/ops: run RGW as root (`issue#11453 `_, `pr#4638 `_, Ken Dreyer)
+* common: messages/MWatchNotify: include an error code in the message (`issue#9193 `_, `pr#3944 `_, Sage Weil)
+* common: Rados.shutdown() dies with Illegal instruction (core dumped) (`issue#10153 `_, `pr#3963 `_, Federico Simoncelli)
+* common: SimpleMessenger: allow RESETSESSION whenever we forget an endpoint (`issue#10080 `_, `pr#3915 `_, Greg Farnum)
+* common: WorkQueue: make wait timeout on empty queue configurable (`issue#10817 `_, `pr#3941 `_, Samuel Just)
+* crush: set_choose_tries = 100 for erasure code rulesets (`issue#10353 `_, `pr#3824 `_, Loic Dachary)
+* doc: backport ceph-disk man page to Firefly (`issue#10724 `_, `pr#3936 `_, Nilamdyuti Goswami)
+* doc: Fix ceph command manpage to match ceph -h (`issue#10676 `_, `pr#3996 `_, David Zafman)
+* fs: mount.ceph: avoid spurious error message (`issue#10351 `_, `pr#3927 `_, Yan, Zheng)
+* librados: Fix memory leak in python rados bindings (`issue#10723 `_, `pr#3935 `_, Josh Durgin)
+* librados: fix resources leakage in RadosClient::connect() (`issue#10425 `_, `pr#3828 `_, Radoslaw Zarzynski)
+* librados: Translate operation flags from C APIs (`issue#10497 `_, `pr#3930 `_, Matt Richards)
+* librbd: acquire cache_lock before refreshing parent (`issue#5488 `_, `pr#4206 `_, Jason Dillaman)
+* librbd: snap_remove should ignore -ENOENT errors (`issue#11113 `_, `pr#4245 `_, Jason Dillaman)
+* mds: fix assertion caused by system clock backwards (`issue#11053 `_, `pr#3970 `_, Yan, Zheng)
+* mon: ignore osd failures from before up_from (`issue#10762 `_, `pr#3937 `_, Sage Weil)
+* mon: MonCap: take EntityName instead when expanding profiles (`issue#10844 `_, `pr#3942 `_, Joao Eduardo Luis)
+* mon: Monitor: fix timecheck rounds period (`issue#10546 `_, `pr#3932 `_, Joao Eduardo Luis)
+* mon: OSDMonitor: do not trust small values in osd epoch cache (`issue#10787 `_, `pr#3823 `_, Sage Weil)
+* mon: OSDMonitor: fallback to json-pretty in case of invalid formatter (`issue#9538 `_, `pr#4475 `_, Loic Dachary)
+* mon: PGMonitor: several stats output error fixes (`issue#10257 `_, `pr#3826 `_, Joao Eduardo Luis)
+* objecter: fix map skipping (`issue#9986 `_, `pr#3952 `_, Ding Dinghua)
+* osd: cache tiering: fix the atime logic of the eviction (`issue#9915 `_, `pr#3949 `_, Zhiqiang Wang)
+* osd: cancel_pull: requeue waiters (`issue#11244 `_, `pr#4415 `_, Samuel Just)
+* osd: check that source OSD is valid for MOSDRepScrub (`issue#9555 `_, `pr#3947 `_, Sage Weil)
+* osd: DBObjectMap: lock header_lock on sync() (`issue#9891 `_, `pr#3948 `_, Samuel Just)
+* osd: do not ignore deleted pgs on startup (`issue#10617 `_, `pr#3933 `_, Sage Weil)
+* osd: ENOENT on clone (`issue#11199 `_, `pr#4385 `_, Samuel Just)
+* osd: erasure-code-profile set races with erasure-code-profile rm (`issue#11144 `_, `pr#4383 `_, Loic Dachary)
+* osd: FAILED assert(soid < scrubber.start || soid >= scrubber.end) (`issue#11156 `_, `pr#4185 `_, Samuel Just)
+* osd: FileJournal: fix journalq population in do_read_entry() (`issue#6003 `_, `pr#3960 `_, Samuel Just)
+* osd: fix negative degraded objects during backfilling (`issue#7737 `_, `pr#4021 `_, Guang Yang)
+* osd: get the current atime of the object in cache pool for eviction (`issue#9985 `_, `pr#3950 `_, Sage Weil)
+* osd: load_pgs: we need to handle the case where an upgrade from earlier versions which ignored non-existent pgs resurrects a pg with a prehistoric osdmap (`issue#11429 `_, `pr#4556 `_, Samuel Just)
+* osd: ObjectStore: Don't use largest_data_off to calc data_align. (`issue#10014 `_, `pr#3954 `_, Jianpeng Ma)
+* osd: osd_types: op_queue_age_hist and fs_perf_stat should be in osd_stat_t::o... (`issue#10259 `_, `pr#3827 `_, Samuel Just)
+* osd: PG::actingset should be used when checking the number of acting OSDs for... (`issue#11454 `_, `pr#4453 `_, Guang Yang)
+* osd: PG::all_unfound_are_queried_or_lost for non-existent osds (`issue#10976 `_, `pr#4416 `_, Mykola Golub)
+* osd: PG: always clear_primary_state (`issue#10059 `_, `pr#3955 `_, Samuel Just)
+* osd: PGLog.h: 279: FAILED assert(log.log.size() == log_keys_debug.size()) (`issue#10718 `_, `pr#4382 `_, Samuel Just)
+* osd: PGLog: include rollback_info_trimmed_to in (read|write)_log (`issue#10157 `_, `pr#3964 `_, Samuel Just)
+* osd: pg stuck stale after create with activation delay (`issue#11197 `_, `pr#4384 `_, Samuel Just)
+* osd: ReplicatedPG: fail a non-blocking flush if the object is being scrubbed (`issue#8011 `_, `pr#3943 `_, Samuel Just)
+* osd: ReplicatedPG::on_change: clean up callbacks_for_degraded_object (`issue#8753 `_, `pr#3940 `_, Samuel Just)
+* osd: ReplicatedPG::scan_range: an object can disappear between the list and t... (`issue#10150 `_, `pr#3962 `_, Samuel Just)
+* osd: requeue blocked op before flush it was blocked on (`issue#10512 `_, `pr#3931 `_, Sage Weil)
+* rgw: check for timestamp for s3 keystone auth (`issue#10062 `_, `pr#3958 `_, Abhishek Lekshmanan)
+* rgw: civetweb should use unique request id (`issue#11720 `_, `pr#4780 `_, Orit Wasserman)
+* rgw: don't allow negative / invalid content length (`issue#11890 `_, `pr#4829 `_, Yehuda Sadeh)
+* rgw: fail s3 POST auth if keystone not configured (`issue#10698 `_, `pr#3966 `_, Yehuda Sadeh)
+* rgw: flush xml header on get acl request (`issue#10106 `_, `pr#3961 `_, Yehuda Sadeh)
+* rgw: generate new tag for object when setting object attrs (`issue#11256 `_, `pr#4571 `_, Yehuda Sadeh)
+* rgw: generate the "Date" HTTP header for civetweb. (`issue#11871,11891 `_, `pr#4851 `_, Radoslaw Zarzynski)
+* rgw: keystone token cache does not work correctly (`issue#11125 `_, `pr#4414 `_, Yehuda Sadeh)
+* rgw: merge manifests correctly when there's prefix override (`issue#11622 `_, `pr#4697 `_, Yehuda Sadeh)
+* rgw: send appropriate op to cancel bucket index pending operation (`issue#10770 `_, `pr#3938 `_, Yehuda Sadeh)
+* rgw: shouldn't need to disable rgw_socket_path if frontend is configured (`issue#11160 `_, `pr#4275 `_, Yehuda Sadeh)
+* rgw: Swift API. Dump container's custom metadata. (`issue#10665 `_, `pr#3934 `_, Dmytro Iurchenko)
+* rgw: Swift API. Support for X-Remove-Container-Meta-{key} header. (`issue#10475 `_, `pr#3929 `_, Dmytro Iurchenko)
+* rgw: use correct objv_tracker for bucket instance (`issue#11416 `_, `pr#4379 `_, Yehuda Sadeh)
+* tests: force checkout of submodules (`issue#11157 `_, `pr#4079 `_, Loic Dachary)
+* tools: Backport ceph-objectstore-tool changes to firefly (`issue#12327 `_, `pr#3866 `_, David Zafman)
+* tools: ceph-objectstore-tool: Output only unsupported features when incompatible (`issue#11176 `_, `pr#4126 `_, David Zafman)
+* tools: ceph-objectstore-tool: Use exit status 11 for incompatible import attemp... (`issue#11139 `_, `pr#4129 `_, David Zafman)
+* tools: Fix do_autogen.sh so that -L is allowed (`issue#11303 `_, `pr#4247 `_, Alfredo Deza)
+
+v0.80.9 Firefly
+===============
+
+This is a bugfix release for firefly.  It fixes a performance
+regression in librbd, an important CRUSH misbehavior (see below), and
+several RGW bugs.  We have also backported support for flock/fcntl
+locks to ceph-fuse and libcephfs.
+
+We recommend that all Firefly users upgrade.
+
+For more detailed information, see :download:`the complete changelog `.
+
+Adjusting CRUSH maps
+--------------------
+
+* This point release fixes several issues with CRUSH that trigger
+  excessive data migration when adjusting OSD weights.  These are most
+  obvious when a very small weight change (e.g., a change from 0 to
+  .01) triggers a large amount of movement, but the same set of bugs
+  can also lead to excessive (though less noticeable) movement in
+  other cases.
+
+  However, because the bug may already have affected your cluster,
+  fixing it may trigger movement *back* to the more correct location.
+  For this reason, you must manually opt-in to the fixed behavior.
+
+  In order to set the new tunable to correct the behavior::
+
+     ceph osd crush set-tunable straw_calc_version 1
+
+  Note that this change will have no immediate effect.  However, from
+  this point forward, any 'straw' bucket in your CRUSH map that is
+  adjusted will get non-buggy internal weights, and that transition
+  may trigger some rebalancing.
+
+  You can estimate how much rebalancing will eventually be necessary
+  on your cluster with::
+
+     ceph osd getcrushmap -o /tmp/cm
+     crushtool -i /tmp/cm --num-rep 3 --test --show-mappings > /tmp/a 2>&1
+     crushtool -i /tmp/cm --set-straw-calc-version 1 -o /tmp/cm2
+     crushtool -i /tmp/cm2 --reweight -o /tmp/cm2
+     crushtool -i /tmp/cm2 --num-rep 3 --test --show-mappings > /tmp/b 2>&1
+     wc -l /tmp/a                          # num total mappings
+     diff -u /tmp/a /tmp/b | grep -c ^+    # num changed mappings
+
+  Divide the number of changed mappings by the total number of mappings to
+  get the fraction of data that will move (for example, 1200 changed out of
+  15000 total mappings is 8%).  We've found that most clusters are under 10%.
+
+  You can force all of this rebalancing to happen at once with::
+
+     ceph osd crush reweight-all
+
+  Otherwise, it will happen at some unknown point in the future when
+  CRUSH weights are next adjusted.
+
+Notable Changes
+---------------
+
+* ceph-fuse: flock, fcntl lock support (Yan, Zheng, Greg Farnum)
+* crush: fix straw bucket weight calculation, add straw_calc_version tunable (#10095 Sage Weil)
+* crush: fix tree bucket (Rongze Zhu)
+* crush: fix underflow of tree weights (Loic Dachary, Sage Weil)
+* crushtool: add --reweight (Sage Weil)
+* librbd: complete pending operations before closing image (#10299 Jason Dillaman)
+* librbd: fix read caching performance regression (#9854 Jason Dillaman)
+* librbd: gracefully handle deleted/renamed pools (#10270 Jason Dillaman)
+* mon: fix dump of chooseleaf_vary_r tunable (Sage Weil)
+* osd: fix PG ref leak in snaptrimmer on peering (#10421 Kefu Chai)
+* osd: handle no-op write with snapshot (#10262 Sage Weil)
+* radosgw-admin: create subuser when creating user (#10103 Yehuda Sadeh)
+* rgw: change multipart upload id magic (#10271 Georgios Dimitrakakis, Yehuda Sadeh)
+* rgw: don't overwrite bucket/object owner when setting ACLs (#10978 Yehuda Sadeh)
+* rgw: enable IPv6 for embedded civetweb (#10965 Yehuda Sadeh)
+* rgw: fix partial swift GET (#10553 Yehuda Sadeh)
+* rgw: fix quota disable (#9907 Dong Lei)
+* rgw: index swift keys appropriately (#10471 Hemant Burman, Yehuda Sadeh)
+* rgw: make setattrs update bucket index (#5595 Yehuda Sadeh)
+* rgw: pass civetweb configurables (#10907 Yehuda Sadeh)
+* rgw: remove swift user manifest (DLO) hash calculation (#9973 Yehuda Sadeh)
+* rgw: return correct len for 0-len objects (#9877 Yehuda Sadeh)
+* rgw: S3 object copy content-type fix (#9478 Yehuda Sadeh)
+* rgw: send ETag on S3 object copy (#9479 Yehuda Sadeh)
+* rgw: send HTTP status reason explicitly in fastcgi (Yehuda Sadeh)
+* rgw: set ulimit -n from sysvinit (el6) init script (#9587 Sage Weil)
+* rgw: update swift subuser permission masks when authenticating (#9918 Yehuda Sadeh)
+* rgw: URL decode query params correctly (#10271 Georgios Dimitrakakis, Yehuda Sadeh)
+* rgw: use attrs when reading object attrs (#10307 Yehuda Sadeh)
+* rgw: use \r\n for http headers (#9254 Benedikt Fraunhofer, Yehuda Sadeh)
+
+
+v0.80.8 Firefly
+===============
+
+This is a long-awaited bugfix release for firefly.  It has several
+important (but relatively rare) OSD peering fixes, fixes for performance
+issues when snapshots are trimmed, several RGW fixes, a paxos corner case
+fix, and some packaging updates.
+
+We recommend that all v0.80.x firefly users upgrade when it is
+convenient to do so.
+
+For more detailed information, see :download:`the complete changelog `.
+
+Notable Changes
+---------------
+
+* build: remove stack-execute bit from assembled code sections (#10114 Dan Mick)
+* ceph-disk: fix dmcrypt key permissions (#9785 Loic Dachary)
+* ceph-disk: fix keyring location (#9653 Loic Dachary)
+* ceph-disk: make partition checks more robust (#9721 #9665 Loic Dachary)
+* ceph: cleanly shut down librados context on shutdown (#8797 Dan Mick)
+* common: add $cctid config metavariable (#6228 Adam Crume)
+* crush: align rule and ruleset ids (#9675 Xiaoxi Chen)
+* crush: fix negative weight bug during create_or_move_item (#9998 Pawel Sadowski)
+* crush: fix potential buffer overflow in erasure rules (#9492 Johnu George)
+* debian: fix python-ceph -> ceph file movement (Sage Weil)
+* libcephfs,ceph-fuse: fix flush tid wraparound bug (#9869 Greg Farnum, Yan, Zheng)
+* libcephfs: close fd before umount (#10415 Yan, Zheng)
+* librados: fix crash from C API when read timeout is enabled (#9582 Sage Weil)
+* librados: handle reply race with pool deletion (#10372 Sage Weil)
+* librbd: cap memory utilization for read requests (Jason Dillaman)
+* librbd: do not close a closed parent image on failure (#10030 Jason Dillaman)
+* librbd: fix diff tests (#10002 Josh Durgin)
+* librbd: protect list_children from invalid pools (#10123 Jason Dillaman)
+* make check improvements (Loic Dachary)
+* mds: fix ctime updates (#9514 Greg Farnum)
+* mds: fix journal import tool (#10025 John Spray)
+* mds: fix rare NULL deref in cap flush handler (Greg Farnum)
+* mds: handle unknown lock messages (Yan, Zheng)
+* mds: store backtrace for straydir (Yan, Zheng)
+* mon: abort startup if disk is full (#9502 Joao Eduardo Luis)
+* mon: add paxos instrumentation (Sage Weil)
+* mon: fix double-free in rare OSD startup path (Sage Weil)
+* mon: fix osdmap trimming (#9987 Sage Weil)
+* mon: fix paxos corner cases (#9301 #9053 Sage Weil)
+* osd: cancel callback on blacklisted watchers (#8315 Samuel Just)
+* osd: cleanly abort set-alloc-hint operations during upgrade (#9419 David Zafman)
+* osd: clear rollback PG metadata on PG deletion (#9293 Samuel Just)
+* osd: do not abort deep scrub if hinfo is missing (#10018 Loic Dachary)
+* osd: erasure-code regression tests (Loic Dachary)
+* osd: fix distro metadata reporting for SUSE (#8654 Danny Al-Gaaf)
+* osd: fix full OSD checks during backfill (#9574 Samuel Just)
+* osd: fix ioprio parsing (#9677 Loic Dachary)
+* osd: fix journal direct-io shutdown (#9073 Mark Kirkwood, Ma Jianpeng, Somnath Roy)
+* osd: fix journal dump (Ma Jianpeng)
+* osd: fix occasional stall during peering or activation (Sage Weil)
+* osd: fix past_interval display bug (#9752 Loic Dachary)
+* osd: fix rare crash triggered by admin socket dump_ops_in_flight (#9916 Dong Lei)
+* osd: fix snap trimming performance issues (#9487 #9113 Samuel Just, Sage Weil, Dan van der Ster, Florian Haas)
+* osd: fix snapdir handling on cache eviction (#8629 Sage Weil)
+* osd: handle map gaps in map advance code (Sage Weil)
+* osd: handle undefined CRUSH results in interval check (#9718 Samuel Just)
+* osd: include shard in JSON dump of ghobject (#10063 Loic Dachary)
+* osd: make backfill reservation denial handling more robust (#9626 Samuel Just)
+* osd: make misdirected op checks handle EC + primary affinity (#9835 Samuel Just, Sage Weil)
+* osd: mount XFS with inode64 by default (Sage Weil)
+* osd: other misc bugs (#9821 #9875 Samuel Just)
+* rgw: add .log to default log path (#9353 Alexandre Marangone)
+* rgw: clean up fcgi request context (#10194 Yehuda Sadeh)
+* rgw: convert header underscores to dashes (#9206 Yehuda Sadeh)
underscores to dashes (#9206 Yehuda Sadeh)
+* rgw: copy object data if copy target is in different pool (#9039 Yehuda Sadeh)
+* rgw: don't try to authenticate CORS preflight request (#8718 Robert Hubbard, Yehuda Sadeh)
+* rgw: fix civetweb URL decoding (#8621 Yehuda Sadeh)
+* rgw: fix hash calculation during PUT (Yehuda Sadeh)
+* rgw: fix misc bugs (#9089 #9201 Yehuda Sadeh)
+* rgw: fix object tail test (#9226 Sylvain Munaut, Yehuda Sadeh)
+* rgw: make sysvinit script run rgw under systemd context as needed (#10125 Loic Dachary)
+* rgw: separate civetweb log from rgw log (Yehuda Sadeh)
+* rgw: set length for keystone token validations (#7796 Mark Kirkwood, Yehuda Sadeh)
+* rgw: subuser creation fixes (#8587 Yehuda Sadeh)
+* rpm: misc packaging improvements (Sandon Van Ness, Dan Mick, Erik Logthenberg, Boris Ranto)
+* rpm: use standard udev rules for CentOS7/RHEL7 (#9747 Loic Dachary)
+
 v0.80.7 Firefly
 ===============
 
@@ -3389,6 +5795,51 @@ Notable Changes
 
 * sysvinit: add condrestart command (Dan van der Ster)
 
+v0.67.12 "Dumpling" (draft)
+===========================
+
+This stable update for Dumpling fixes a few longstanding issues with
+backfill in the OSD that can lead to stalled IOs. There is also a fix
+for memory utilization for reads in librbd when caching is enabled,
+as well as several other small fixes across the rest of the system.
+
+Dumpling users who have encountered IO stalls during backfill and who
+do not expect to upgrade to Firefly soon should upgrade. Everyone
+else should upgrade to Firefly already. This is likely to be the last stable
+release for the 0.67.x Dumpling series.
+
+
+Notable Changes
+---------------
+
+* buffer: fix buffer rebuild alignment corner case (#6614 #6003 Loic Dachary, Samuel Just)
+* ceph-disk: reprobe partitions after zap (#9665 #9721 Loic Dachary)
+* ceph-disk: use partx instead of partprobe when appropriate (Loic Dachary)
+* common: add $cctid meta variable (#6228 Adam Crume)
+* crush: fix get_full_location_ordered (Sage Weil)
+* crush: pick ruleset id that matches rule_id (#9675 Xiaoxi Chen)
+* libcephfs: fix tid wrap bug (#9869 Greg Farnum)
+* libcephfs: get osd location on -1 should return EINVAL (Sage Weil)
+* librados: fix race condition with C API and op timeouts (#9582 Sage Weil)
+* librbd: constrain max number of in-flight read requests (#9854 Jason Dillaman)
+* librbd: enforce cache size on read requests (Jason Dillaman)
+* librbd: fix invalid close in image open failure path (#10030 Jason Dillaman)
+* librbd: fix read hang on sparse files (Jason Dillaman)
+* librbd: gracefully handle deleted/renamed pools (#10270 #10122 Jason Dillaman)
+* librbd: protect list_children from invalid child pool ioctxs (#10123 Jason Dillaman)
+* mds: fix ctime updates from clients without dirty caps (#9514 Greg Farnum)
+* mds: fix rare NULL dereference in cap update path (Greg Farnum)
+* mds: fix assertion caused by system clock going backwards (#11053 Yan, Zheng)
+* mds: store backtrace on straydir (Yan, Zheng)
+* osd: fix journal committed_thru update after replay (#6756 Samuel Just)
+* osd: fix memory leak, busy loop on snap trim (#9113 Samuel Just)
+* osd: fix misc peering, recovery bugs (#10168 Samuel Just)
+* osd: fix purged_snap field on backfill start (#9487 Sage Weil, Samuel Just)
+* osd: handle no-op write with snapshot corner case (#10262 Sage Weil, Loic Dachary)
+* osd: respect RWORDERED rados flag (Sage Weil)
+* osd: several backfill fixes and refactors (Samuel Just, David Zafman)
+* rgw: send http status reason explicitly in fastcgi (Yehuda Sadeh)
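+
+For clusters hit by the backfill-related IO stalls mentioned above, a
+common mitigation while the fixes roll out is to throttle recovery.
+The sketch below is illustrative only; the option names are taken from
+the contemporary OSD configuration and should be verified against your
+release::
+
+    # limit concurrent backfills and active recovery ops per OSD
+    ceph tell osd.* injectargs '--osd-max-backfills 1 --osd-recovery-max-active 1'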
+ v0.67.11 "Dumpling" =================== diff --git a/doc/releases.rst b/doc/releases.rst new file mode 100644 index 0000000000000..dc11a6a7cad0f --- /dev/null +++ b/doc/releases.rst @@ -0,0 +1,258 @@ +============= +Ceph Releases +============= + +Timeline +-------- + ++----------------------------+-----------+-----------+-----------+-----------+-----------+ +| |`Dumpling`_|`Emperor`_ |`Firefly`_ |`Giant`_ |`Hammer`_ | +| |LTS |Stable |LTS |Stable |LTS | ++----------------------------+-----------+-----------+-----------+-----------+-----------+ +| First release | August | November | May | October | April | +| | 2013 | 2013 | 2014 | 2014 | 2015 | ++----------------------------+-----------+-----------+-----------+-----------+-----------+ +| Estimated retirement | March | | January | | November | +| | 2015 | | 2016 | | 2016 | ++----------------------------+-----------+-----------+-----------+-----------+-----------+ +| Actual retirement | May | May | | April | | +| | 2015 | 2014 | | 2015 | | ++----------------------------+-----------+-----------+-----------+-----------+-----------+ + ++----------------+-----------+-----------+-----------+-----------+-----------+-----------+ +| |Development|`Dumpling`_|`Emperor`_ |`Firefly`_ |`Giant`_ |`Hammer`_ | +| |Testing |LTS |Stable |LTS |Stable |LTS | ++----------------+-----------+-----------+-----------+-----------+-----------+-----------+ +| July 2015 |`9.0.2`_ | | |`0.80.10`_ | | | ++----------------+-----------+-----------+-----------+-----------+-----------+-----------+ +| June 2015 |`9.0.1`_ | | | | |`0.94.2`_ | ++----------------+-----------+-----------+-----------+-----------+-----------+-----------+ +| May 2015 |`9.0.0`_ | | | | | | ++----------------+-----------+-----------+-----------+-----------+-----------+-----------+ +| April 2015 | | | | |`0.87.2`_ |`0.94.1`_ | +| +-----------+-----------+-----------+-----------+-----------+-----------+ +| | | | | | |`0.94`_ | ++----------------+-----------+-----------+-----------+-----------+-----------+-----------+ +| March 2015 | | | |`0.80.9`_ | | | ++----------------+-----------+-----------+-----------+-----------+-----------+-----------+ +| February 2015 |`0.93`_ | | | |`0.87.1`_ | | +| +-----------+-----------+-----------+-----------+-----------+-----------+ +| |`0.92`_ | | | | | | ++----------------+-----------+-----------+-----------+-----------+-----------+-----------+ +| January 2015 |`0.91`_ | | |`0.80.8`_ | | | ++----------------+-----------+-----------+-----------+-----------+-----------+-----------+ +| December 2014 |`0.90`_ | | | | | | +| +-----------+-----------+-----------+-----------+-----------+-----------+ +| |`0.89`_ | | | | | | ++----------------+-----------+-----------+-----------+-----------+-----------+-----------+ +| November 2014 |`0.88`_ | | | | | | ++----------------+-----------+-----------+-----------+-----------+-----------+-----------+ +| October 2014 |`0.86`_ | | |`0.80.7`_ |`0.87`_ | | +| +-----------+-----------+-----------+-----------+-----------+-----------+ +| | | | |`0.80.6`_ | | | ++----------------+-----------+-----------+-----------+-----------+-----------+-----------+ +| September 2014 |`0.85`_ |`0.67.11`_ | | | | | ++----------------+-----------+-----------+-----------+-----------+-----------+-----------+ +| August 2014 |`0.84`_ |`0.67.10`_ | | | | | ++----------------+-----------+-----------+-----------+-----------+-----------+-----------+ +| July 2014 |`0.83`_ | | |`0.80.5`_ | | | +| 
+-----------+-----------+-----------+-----------+-----------+-----------+ +| | | | |`0.80.4`_ | | | +| +-----------+-----------+-----------+-----------+-----------+-----------+ +| | | | |`0.80.3`_ | | | +| +-----------+-----------+-----------+-----------+-----------+-----------+ +| | | | |`0.80.2`_ | | | ++----------------+-----------+-----------+-----------+-----------+-----------+-----------+ +| June 2014 |`0.82`_ | | | | | | +| +-----------+-----------+-----------+-----------+-----------+-----------+ +| |`0.81`_ | | | | | | ++----------------+-----------+-----------+-----------+-----------+-----------+-----------+ +| May 2014 | |`0.67.9`_ | |`0.80.1`_ | | | +| +-----------+-----------+-----------+-----------+-----------+-----------+ +| | |`0.67.8`_ | |`0.80`_ | | | ++----------------+-----------+-----------+-----------+-----------+-----------+-----------+ +| April 2014 |`0.79`_ | | | | | | ++----------------+-----------+-----------+-----------+-----------+-----------+-----------+ +| March 2014 |`0.78`_ | | | | | | ++----------------+-----------+-----------+-----------+-----------+-----------+-----------+ +| February 2014 |`0.77`_ |`0.67.7`_ | | | | | +| +-----------+-----------+-----------+-----------+-----------+-----------+ +| | |`0.67.6`_ | | | | | ++----------------+-----------+-----------+-----------+-----------+-----------+-----------+ +| January 2014 |`0.76`_ | | | | | | +| +-----------+-----------+-----------+-----------+-----------+-----------+ +| |`0.75`_ | | | | | | ++----------------+-----------+-----------+-----------+-----------+-----------+-----------+ +| December 2013 |`0.74`_ |`0.67.5`_ |`0.72.2`_ | | | | +| +-----------+-----------+-----------+-----------+-----------+-----------+ +| |`0.73`_ | | | | | | ++----------------+-----------+-----------+-----------+-----------+-----------+-----------+ +| November 2013 | | |`0.72.1`_ | | | | +| +-----------+-----------+-----------+-----------+-----------+-----------+ +| | | |`0.72`_ | | | | ++----------------+-----------+-----------+-----------+-----------+-----------+-----------+ +| October 2013 |`0.71`_ |`0.67.4`_ | | | | | +| +-----------+-----------+-----------+-----------+-----------+-----------+ +| |`0.70`_ | | | | | | ++----------------+-----------+-----------+-----------+-----------+-----------+-----------+ +| September 2013 |`0.69`_ | | | | | | +| +-----------+-----------+-----------+-----------+-----------+-----------+ +| |`0.68`_ |`0.67.3`_ | | | | | ++----------------+-----------+-----------+-----------+-----------+-----------+-----------+ +| August 2013 | |`0.67.2`_ | | | | | +| +-----------+-----------+-----------+-----------+-----------+-----------+ +| | |`0.67.1`_ | | | | | +| +-----------+-----------+-----------+-----------+-----------+-----------+ +| | |`0.67`_ | | | | | ++----------------+-----------+-----------+-----------+-----------+-----------+-----------+ + +.. _9.0.2: ../release-notes#v9-0-2 +.. _9.0.1: ../release-notes#v9-0-1 +.. _9.0.0: ../release-notes#v9-0-0 + +.. _0.94.2: ../release-notes#v0-94-2-hammer +.. _0.94.1: ../release-notes#v0-94-1-hammer +.. _0.94: ../release-notes#v0-94-hammer +.. _Hammer: ../release-notes#v0-94-hammer + +.. _0.93: ../release-notes#v0-93 +.. _0.92: ../release-notes#v0-92 +.. _0.91: ../release-notes#v0-91 +.. _0.90: ../release-notes#v0-90 +.. _0.89: ../release-notes#v0-89 +.. _0.88: ../release-notes#v0-88 + +.. _0.87.2: ../release-notes#v0-87-2-giant +.. _0.87.1: ../release-notes#v0-87-1-giant +.. _0.87: ../release-notes#v0-87-giant +.. 
_Giant: ../release-notes#v0-87-giant
+
+.. _0.86: ../release-notes#v0-86
+.. _0.85: ../release-notes#v0-85
+.. _0.84: ../release-notes#v0-84
+.. _0.83: ../release-notes#v0-83
+.. _0.82: ../release-notes#v0-82
+.. _0.81: ../release-notes#v0-81
+
+.. _0.80.10: ../release-notes#v0-80-10-firefly
+.. _0.80.9: ../release-notes#v0-80-9-firefly
+.. _0.80.8: ../release-notes#v0-80-8-firefly
+.. _0.80.7: ../release-notes#v0-80-7-firefly
+.. _0.80.6: ../release-notes#v0-80-6-firefly
+.. _0.80.5: ../release-notes#v0-80-5-firefly
+.. _0.80.4: ../release-notes#v0-80-4-firefly
+.. _0.80.3: ../release-notes#v0-80-3-firefly
+.. _0.80.2: ../release-notes#v0-80-2-firefly
+.. _0.80.1: ../release-notes#v0-80-1-firefly
+.. _0.80: ../release-notes#v0-80-firefly
+.. _Firefly: ../release-notes#v0-80-firefly
+
+.. _0.79: ../release-notes#v0-79
+.. _0.78: ../release-notes#v0-78
+.. _0.77: ../release-notes#v0-77
+.. _0.76: ../release-notes#v0-76
+.. _0.75: ../release-notes#v0-75
+.. _0.74: ../release-notes#v0-74
+.. _0.73: ../release-notes#v0-73
+
+.. _0.72.2: ../release-notes#v0-72-2-emperor
+.. _0.72.1: ../release-notes#v0-72-1-emperor
+.. _0.72: ../release-notes#v0-72-emperor
+.. _Emperor: ../release-notes#v0-72-emperor
+
+.. _0.71: ../release-notes#v0-71
+.. _0.70: ../release-notes#v0-70
+.. _0.69: ../release-notes#v0-69
+.. _0.68: ../release-notes#v0-68
+
+.. _0.67.11: ../release-notes#v0-67-11-dumpling
+.. _0.67.10: ../release-notes#v0-67-10-dumpling
+.. _0.67.9: ../release-notes#v0-67-9-dumpling
+.. _0.67.8: ../release-notes#v0-67-8-dumpling
+.. _0.67.7: ../release-notes#v0-67-7-dumpling
+.. _0.67.6: ../release-notes#v0-67-6-dumpling
+.. _0.67.5: ../release-notes#v0-67-5-dumpling
+.. _0.67.4: ../release-notes#v0-67-4-dumpling
+.. _0.67.3: ../release-notes#v0-67-3-dumpling
+.. _0.67.2: ../release-notes#v0-67-2-dumpling
+.. _0.67.1: ../release-notes#v0-67-1-dumpling
+.. _0.67: ../release-notes#v0-67-dumpling
+.. _Dumpling: ../release-notes#v0-67-dumpling
+
+Understanding the release cycle
+-------------------------------
+
+The development release cycle is two to four weeks long. Each cycle
+freezes the master development branch and applies `integration and
+upgrade tests `_ for the
+duration of one cycle before it is released and the next release's
+code is frozen for testing. Once released, there is no effort to
+backport fixes; developer focus is on the next development release,
+which is usually only a few weeks away.
+
+There are three to four stable releases a year. Each stable release
+will receive a name (e.g., 'Firefly') and bug fix backports at least
+until the next stable release is out.
+
+Every other stable release is an LTS (Long Term Stable) and will
+receive updates until the next two LTS are published. For instance,
+Dumpling is retired when Hammer is published, Firefly is retired when
+Jewel is published, etc. The rationale is that backports to an LTS
+(Dumpling for instance) are expected to happen until the next LTS is
+published (Firefly is the LTS following Dumpling), to fix bugs and
+possibly backport important features. After the next LTS is published,
+these backports are still expected to fix bugs, with a focus on whatever
+can prevent upgrades to the next LTS (in our example, fixes to Dumpling
+were published after Firefly was released and until Hammer was
+published, primarily to ensure Dumpling clusters can smoothly migrate
+to Firefly).
+
+* LTS : until the next two LTS are published
+* Stable release : until the next stable release is published
+* Development / testing release : no backports
+
+For each stable release:
+
+* `Integration and upgrade tests
+  `_ are run on a regular basis
+  and `their results `_ analyzed by Ceph
+  developers.
+* `Issues `_
+  fixed in the development branch are scheduled to be backported to the
+  release.
+* When an issue found in the release is `reported
+  `_, it will be
+  triaged by Ceph developers.
+* The `stable releases and backport team `_
+  publishes ``point releases`` including fixes that have been backported to the release.
+
+In the timeline, the lifetime of an LTS is calculated to be
+approximately 18 months after the month of the first release. For
+instance, Dumpling was published in August 2013, and 18 months starting
+September 2013 ends in February 2015; therefore Dumpling should be
+retired by March 2015. The lifetime of a release may vary because it
+depends on how quickly the stable releases are published. For instance,
+although Dumpling's theoretical retirement was March 2015, it was
+extended to May 2015.
+
+Release number conventions
+---------------------------
+
+The first Ceph release, back in January 2008, was 0.1. That made sense at
+the time. The versioning scheme did not change until April 2015,
+when 0.94.1 (the first Hammer point release) was published. To avoid reaching
+0.99 (and 0.100 or 1.00?) we have a new strategy.
+
+* x.0.z - development releases (for early testers and the brave at heart)
+* x.1.z - release candidates (for test clusters, brave users)
+* x.2.z - stable/bugfix releases (for users)
+
+``x`` will start at 9 for Infernalis (``I`` is the 9th letter), making
+our first development release of the 9th release cycle 9.0.0.
+Subsequent development releases will be 9.0.1, 9.0.2, etc.
+
+After a couple of months we'll have a 9.1.0 (and maybe a 9.1.1) release
+candidate.
+
+A few weeks after that we'll have the Infernalis release 9.2.0, followed
+by stable bug fix updates 9.2.1, 9.2.2, etc., and then begin work on the
+Jewel (10.y.z) release.
diff --git a/doc/start/documenting-ceph.rst b/doc/start/documenting-ceph.rst
index 3aba99c70f58d..d57192e8d1c2c 100644
--- a/doc/start/documenting-ceph.rst
+++ b/doc/start/documenting-ceph.rst
@@ -446,10 +446,13 @@ Push the Change
 
 Once you have one or more commits, you must push them from the local copy of the
 repository to ``github``. A graphical tool like ``git-gui`` provides a user
-interface for pushing to the repository. ::
+interface for pushing to the repository. If you created a branch previously::
 
-	git push
+	git push origin wip-doc-{your-branch-name}
+
+Otherwise::
 
+	git push
 
 Make a Pull Request
diff --git a/doc/start/get-involved.rst b/doc/start/get-involved.rst
index 3e8110e51f74f..cfe3f4d64ddeb 100644
--- a/doc/start/get-involved.rst
+++ b/doc/start/get-involved.rst
@@ -69,10 +69,6 @@ These are exciting times in the Ceph community! Get involved!
 |                      | at http://github.com. See `Ceph Source Code`_   |                                               |
 |                      | for details on cloning from github.             |                                               |
 +----------------------+-------------------------------------------------+-----------------------------------------------+
-| **Support**          | If you have a very specific problem, an         | http://inktank.com                            |
-|                      | immediate need, or if your deployment requires  |                                               |
-|                      | significant help, consider commercial support_.
| |
-+----------------------+-------------------------------------------------+-----------------------------------------------+
@@ -91,6 +87,5 @@ These are exciting times in the Ceph community! Get involved!
 .. _Mailing list archives: http://lists.ceph.com/
 .. _Blog: http://ceph.com/community/blog/
 .. _Tracker: http://tracker.ceph.com/
-.. _Support: http://ceph.com/help/professional/
 .. _Ceph Source Code: http://github.com/ceph/ceph
 
diff --git a/doc/start/quick-ceph-deploy.rst b/doc/start/quick-ceph-deploy.rst
index e73b6ffe6dd50..335b450351660 100644
--- a/doc/start/quick-ceph-deploy.rst
+++ b/doc/start/quick-ceph-deploy.rst
@@ -90,35 +90,20 @@ configuration details, perform the following steps using ``ceph-deploy``.
    to re-install Ceph.
 
-#. Add the initial monitor(s) and gather the keys (new in
-   ``ceph-deploy`` v1.1.3). ::
+#. Add the initial monitor(s) and gather the keys::
 
 	ceph-deploy mon create-initial
 
-   **Note:** In earlier versions of ``ceph-deploy``, you must create the
-   initial monitor(s) and gather keys in two discrete steps. First, create
-   the monitor. ::
-
-	ceph-deploy mon create {ceph-node}
-
-   For example::
-
-	ceph-deploy mon create node1
-
-   Then, gather the keys. ::
-
-	ceph-deploy gatherkeys {ceph-node}
-
-   For example::
-
-	ceph-deploy gatherkeys node1
-
    Once you complete the process, your local directory should have the following
    keyrings:
 
    - ``{cluster-name}.client.admin.keyring``
    - ``{cluster-name}.bootstrap-osd.keyring``
    - ``{cluster-name}.bootstrap-mds.keyring``
+   - ``{cluster-name}.bootstrap-rgw.keyring``
+
+.. note:: The bootstrap-rgw keyring is only created during installation of clusters
+   running Hammer or newer.
 
 #. Add two OSDs. For fast setup, this quick start uses a directory rather
@@ -289,6 +274,38 @@ For example::
    with multiple metadata servers.
 
+Add an RGW Instance
+-------------------
+
+To use the :term:`Ceph Object Gateway` component of Ceph, you must deploy an
+instance of :term:`RGW`. Execute the following to create a new instance of
+RGW::
+
+  ceph-deploy rgw create {gateway-node}
+
+For example::
+
+  ceph-deploy rgw create node1
+
+.. note:: This functionality is new with the **Hammer** release, and also with
+  ``ceph-deploy`` v1.5.23.
+
+By default, the :term:`RGW` instance will listen on port 7480. This can be
+changed by editing ceph.conf on the node running the :term:`RGW` as follows:
+
+.. code-block:: ini
+
+  [client]
+  rgw frontends = civetweb port=80
+
+To use an IPv6 address, use:
+
+.. code-block:: ini
+
+  [client]
+  rgw frontends = civetweb port=[::]:80
+
+
 Adding Monitors
 ---------------
@@ -300,11 +317,11 @@ of monitors (i.e., 1, 2:3, 3:4, 3:5, 4:6, etc.) to form a quorum. Add two
 Ceph Monitors to your cluster. ::
 
-	ceph-deploy mon create {ceph-node}
+	ceph-deploy mon add {ceph-node}
 
 For example::
 
-	ceph-deploy mon create node2 node3
+	ceph-deploy mon add node2 node3
 
 Once you have added your new Ceph Monitors, Ceph will begin synchronizing the
 monitors and form a quorum. You can check the quorum status by executing
diff --git a/doc/start/quick-rbd.rst b/doc/start/quick-rbd.rst
index 39890426f67b1..a7fb0d9b33459 100644
--- a/doc/start/quick-rbd.rst
+++ b/doc/start/quick-rbd.rst
@@ -45,7 +45,7 @@ Install Ceph
 
    The ``ceph-deploy`` utility copies the keyring to the ``/etc/ceph``
    directory. Ensure that the keyring file has appropriate read permissions
-   (e.g., ``sudo chmod + r /etc/ceph/ceph.client.admin.keyring``).
+   (e.g., ``sudo chmod +r /etc/ceph/ceph.client.admin.keyring``).
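+
+   Before mapping an image, it may help to confirm that the copied
+   keyring actually works. This verification step is a sketch added for
+   illustration (it is not part of the original quick start); ``ceph -s``
+   run from the client node should print the cluster status::
+
+	ssh ceph-client sudo ceph -s
+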
Configure a Block Device
@@ -57,7 +57,7 @@ Configure a Block Device
 
 #. On the ``ceph-client`` node, map the image to a block device. ::
 
-	sudo rbd map foo --pool rbd --name client.admin [-m {mon-IP}] [-k /path/to/ceph.client.admin.keyring]
+	sudo rbd map foo --name client.admin [-m {mon-IP}] [-k /path/to/ceph.client.admin.keyring]
 
 #. Use the block device by creating a file system on the ``ceph-client``
    node. ::
diff --git a/doc/start/quick-rgw.rst b/doc/start/quick-rgw.rst
index 1de018e0d5c25..5f9cfc1bea480 100644
--- a/doc/start/quick-rgw.rst
+++ b/doc/start/quick-rgw.rst
@@ -2,10 +2,27 @@ Quick Ceph Object Storage
 ===========================
 
-At this time, ``ceph-deploy`` does not provide a rapid installation for
-:term:`Ceph Object Storage`. To install a :term:`Ceph Object Gateway`,
-see `Install Ceph Object Gateway`_. To configure a Ceph Object Gateway,
-see `Configuring Ceph Object Gateway`_.
+To use the :term:`Ceph Object Storage` Quick Start guide, you must have executed the
+procedures in the `Storage Cluster Quick Start`_ guide first. Make sure that you
+have at least one :term:`RGW` instance running.
 
-.. _Install Ceph Object Gateway: ../../install/install-ceph-gateway
-.. _Configuring Ceph Object Gateway: ../../radosgw/config
\ No newline at end of file
+Configure new RGW instance
+==========================
+
+The :term:`RGW` instance created by the `Storage Cluster Quick Start`_ will run using
+the embedded CivetWeb webserver. ``ceph-deploy`` will create the instance and start
+it automatically with default parameters.
+
+To administer the :term:`RGW` instance, see details in the
+`RGW Admin Guide`_.
+
+Additional details may be found in the `Configuring Ceph Object Gateway`_ guide, but
+the steps specific to Apache are no longer needed.
+
+.. note:: Deploying RGW using ``ceph-deploy`` and using the CivetWeb webserver instead
+  of Apache is new functionality as of the **Hammer** release.
+
+
+.. _Storage Cluster Quick Start: ../quick-ceph-deploy
+.. _RGW Admin Guide: ../../radosgw/admin
+.. _Configuring Ceph Object Gateway: ../../radosgw/config
diff --git a/doc/start/quick-start-preflight.rst b/doc/start/quick-start-preflight.rst
index 5af42769877ef..ff55bf2a8664a 100644
--- a/doc/start/quick-start-preflight.rst
+++ b/doc/start/quick-start-preflight.rst
@@ -63,7 +63,7 @@ following steps:
    Paste the following example code. Replace ``{ceph-release}`` with
    the recent major release of Ceph (e.g., ``firefly``). Replace
    ``{distro}`` with your Linux distribution (e.g., ``el6`` for CentOS 6,
-   ``el7`` for CentOS 7, ``rhel6.5`` for
+   ``el7`` for CentOS 7, ``rhel6`` for
   Red Hat 6.5, ``rhel7`` for Red Hat 7, and ``fc19`` or ``fc20`` for Fedora
   19 or Fedora 20). Finally, save the contents to the
   ``/etc/yum.repos.d/ceph.repo`` file. ::
@@ -166,10 +166,9 @@ Enable Password-less SSH
 ------------------------
 
 Since ``ceph-deploy`` will not prompt for a password, you must generate
-SSH keys on the admin node and distribute the public key to each Ceph node.
-
-.. note:: ``ceph-deploy`` v1.1.3 and later releases will attempt to generate
-   the SSH keys for initial monitors.
+SSH keys on the admin node and distribute the public key to each Ceph
+node. ``ceph-deploy`` will attempt to generate the SSH keys for initial
+monitors.
 
 #. Generate the SSH keys, but do not use ``sudo`` or the ``root``
    user. Leave the passphrase empty::
@@ -240,7 +239,7 @@ Open Required Ports
 -------------------
 
 Ceph Monitors communicate using port ``6789`` by default.
Ceph OSDs communicate
-in a port range of ``6800:7810`` by default. See the `Network Configuration
+in a port range of ``6800:7300`` by default. See the `Network Configuration
 Reference`_ for details. Ceph OSDs can use multiple network connections to
 communicate with clients, monitors, other OSDs for replication, and other OSDs
 for heartbeats.
@@ -250,12 +249,12 @@ strict. You may need to adjust your firewall settings to allow inbound requests
 so that clients in your network can communicate with daemons on your Ceph nodes.
 
 For ``firewalld`` on RHEL 7, add port ``6789`` for Ceph Monitor nodes and ports
-``6800:7100`` for Ceph OSDs to the public zone and ensure that you make the
+``6800:7300`` for Ceph OSDs to the public zone and ensure that you make the
 setting permanent so that it is enabled on reboot. For example::
 
 	sudo firewall-cmd --zone=public --add-port=6789/tcp --permanent
 
-For ``iptables``, add port ``6789`` for Ceph Monitors and ports ``6800:7100``
+For ``iptables``, add port ``6789`` for Ceph Monitors and ports ``6800:7300``
 for Ceph OSDs. For example::
 
 	sudo iptables -A INPUT -i {iface} -p tcp -s {ip-address}/{netmask} --dport 6789 -j ACCEPT
@@ -295,6 +294,22 @@ To configure SELinux persistently (recommended if SELinux is an issue), modify
 the configuration file at ``/etc/selinux/config``.
 
+Priorities/Preferences
+----------------------
+
+Ensure that your package manager has priority/preferences packages installed and
+enabled. On CentOS, you may need to install EPEL. On RHEL, you may need to
+enable optional repositories. ::
+
+  sudo yum install yum-plugin-priorities
+
+For example, on RHEL 7 server, execute the following to install
+``yum-plugin-priorities`` and enable the ``rhel-7-server-optional-rpms``
+repository::
+
+  sudo yum install yum-plugin-priorities --enablerepo=rhel-7-server-optional-rpms
+
+
 Summary
 =======
diff --git a/etc/sysconfig/SuSEfirewall2.d/services/ceph-mon b/etc/sysconfig/SuSEfirewall2.d/services/ceph-mon
new file mode 100644
index 0000000000000..7a28e73a74025
--- /dev/null
+++ b/etc/sysconfig/SuSEfirewall2.d/services/ceph-mon
@@ -0,0 +1,5 @@
+## Name: Ceph MON
+## Description: Open port for Ceph Monitor
+
+# space separated list of allowed TCP ports
+TCP="6789"
diff --git a/etc/sysconfig/SuSEfirewall2.d/services/ceph-osd-mds b/etc/sysconfig/SuSEfirewall2.d/services/ceph-osd-mds
new file mode 100644
index 0000000000000..0109fde41be05
--- /dev/null
+++ b/etc/sysconfig/SuSEfirewall2.d/services/ceph-osd-mds
@@ -0,0 +1,5 @@
+## Name: Ceph OSD/MDS
+## Description: Open ports for Ceph OSDs and Metadata Servers (max: 166 per node)
+
+# space separated list of allowed TCP ports
+TCP="6800:7300"
diff --git a/examples/librados/hello_world.cc b/examples/librados/hello_world.cc
index 0e6a7acb182e7..cb5476ffc8427 100644
--- a/examples/librados/hello_world.cc
+++ b/examples/librados/hello_world.cc
@@ -132,7 +132,7 @@ int main(int argc, const char **argv)
   /*
    * now that we have the data to write, let's send it to an object.
-   * We'll use the asynchronous interface for simplicity.
+   * We'll use the synchronous interface for simplicity.
*/ ret = io_ctx.write_full(object_name, bl); if (ret < 0) { diff --git a/install-deps.sh b/install-deps.sh index 4785d00501371..5ad41c69cd989 100755 --- a/install-deps.sh +++ b/install-deps.sh @@ -1,8 +1,8 @@ -#!/bin/bash +#!/bin/bash -e # # Ceph distributed storage system # -# Copyright (C) 2014 Red Hat +# Copyright (C) 2014, 2015 Red Hat # # Author: Loic Dachary # @@ -14,36 +14,134 @@ DIR=/tmp/install-deps.$$ trap "rm -fr $DIR" EXIT mkdir -p $DIR +if test $(id -u) != 0 ; then + SUDO=sudo +fi +export LC_ALL=C # the following is vulnerable to i18n + +if test -f /etc/redhat-release ; then + $SUDO yum install -y redhat-lsb-core +fi + +if type apt-get > /dev/null 2>&1 ; then + $SUDO apt-get install -y lsb-release +fi + +if type zypper > /dev/null 2>&1 ; then + $SUDO zypper --gpg-auto-import-keys --non-interactive install openSUSE-release lsb-release +fi case $(lsb_release -si) in Ubuntu|Debian|Devuan) - sudo apt-get install -y dpkg-dev + $SUDO apt-get install -y dpkg-dev + if ! test -r debian/control ; then + echo debian/control is not a readable file + exit 1 + fi touch $DIR/status packages=$(dpkg-checkbuilddeps --admindir=$DIR debian/control 2>&1 | \ perl -p -e 's/.*Unmet build dependencies: *//;' \ -e 's/build-essential:native/build-essential/;' \ + -e 's/\s*\|\s*/\|/g;' \ -e 's/\(.*?\)//g;' \ -e 's/ +/\n/g;' | sort) case $(lsb_release -sc) in - squeeze) - packages=$(echo $packages | perl -pe 's/\w*babeltrace\w*//g') + squeeze|wheezy) + packages=$(echo $packages | perl -pe 's/[-\w]*babeltrace[-\w]*//g') + backports="-t $(lsb_release -sc)-backports" ;; esac - sudo apt-get install -y $packages + packages=$(echo $packages) # change newlines into spaces + $SUDO env DEBIAN_FRONTEND=noninteractive apt-get install $backports -y $packages || exit 1 ;; -CentOS|Fedora|SUSE*|RedHatEnterpriseServer) +CentOS|Fedora|RedHatEnterpriseServer) case $(lsb_release -si) in - SUSE*) - sudo zypper -y yum-utils + Fedora) + $SUDO yum install -y yum-utils ;; - *) - sudo yum install -y yum-utils + CentOS|RedHatEnterpriseServer) + $SUDO yum install -y yum-utils + MAJOR_VERSION=$(lsb_release -rs | cut -f1 -d.) + if test $(lsb_release -si) == RedHatEnterpriseServer ; then + $SUDO yum install subscription-manager + $SUDO subscription-manager repos --enable=rhel-$MAJOR_VERSION-server-optional-rpms + fi + $SUDO yum-config-manager --add-repo https://dl.fedoraproject.org/pub/epel/$MAJOR_VERSION/x86_64/ + $SUDO yum install --nogpgcheck -y epel-release + $SUDO rpm --import /etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-$MAJOR_VERSION + $SUDO rm -f /etc/yum.repos.d/dl.fedoraproject.org* ;; esac sed -e 's/@//g' < ceph.spec.in > $DIR/ceph.spec - sudo yum-builddep -y $DIR/ceph.spec + $SUDO yum-builddep -y $DIR/ceph.spec 2>&1 | tee $DIR/yum-builddep.out + ! grep -q -i error: $DIR/yum-builddep.out || exit 1 + ;; +*SUSE*) + sed -e 's/@//g' < ceph.spec.in > $DIR/ceph.spec + $SUDO zypper --non-interactive install $(rpmspec -q --buildrequires $DIR/ceph.spec) || exit 1 ;; *) echo "$(lsb_release -si) is unknown, dependencies will have to be installed manually." 
;; esac + +function populate_wheelhouse() { + local install=$1 + shift + + # Ubuntu-12.04 and Python 2.7.3 require this line + pip --timeout 300 $install 'distribute >= 0.7.3' || return 1 + # although pip comes with virtualenv, having a recent version + # of pip matters when it comes to using wheel packages + pip --timeout 300 $install 'setuptools >= 0.8' 'pip >= 7.0' 'wheel >= 0.24' || return 1 + if test $# != 0 ; then + pip --timeout 300 $install $@ || return 1 + fi +} + +function activate_virtualenv() { + local top_srcdir=$1 + local interpreter=$2 + local env_dir=$top_srcdir/install-deps-$interpreter + + if ! test -d $env_dir ; then + virtualenv --python $interpreter $env_dir + . $env_dir/bin/activate + if ! populate_wheelhouse install ; then + rm -rf $env_dir + return 1 + fi + fi + . $env_dir/bin/activate +} + +# use pip cache if possible but do not store it outside of the source +# tree +# see https://pip.pypa.io/en/stable/reference/pip_install.html#caching +mkdir -p install-deps-cache +top_srcdir=$(pwd) +export XDG_CACHE_HOME=$top_srcdir/install-deps-cache +wip_wheelhouse=wheelhouse-wip + +# +# preload python modules so that tox can run without network access +# +find . -name tox.ini | while read ini ; do + ( + cd $(dirname $ini) + require=$(ls *requirements.txt 2>/dev/null | sed -e 's/^/-r /') + if test "$require" && ! test -d wheelhouse ; then + for interpreter in python2.7 python3 ; do + type $interpreter > /dev/null 2>&1 || continue + activate_virtualenv $top_srcdir $interpreter || exit 1 + populate_wheelhouse "wheel -w $wip_wheelhouse" $require || exit 1 + done + mv $wip_wheelhouse wheelhouse + fi + ) +done + +for interpreter in python2.7 python3 ; do + rm -rf $top_srcdir/install-deps-$interpreter +done +rm -rf $XDG_CACHE_HOME diff --git a/m4/ax_arm.m4 b/m4/ax_arm.m4 index 2ccc9a977f823..37ea0aaf1d16a 100644 --- a/m4/ax_arm.m4 +++ b/m4/ax_arm.m4 @@ -13,13 +13,27 @@ AC_DEFUN([AX_ARM_FEATURES], fi ;; aarch64*) + AX_CHECK_COMPILE_FLAG(-march=armv8-a, ax_cv_support_armv8=yes, []) + if test x"$ax_cv_support_armv8" = x"yes"; then + ARM_ARCH_FLAGS="-march=armv8-a" + ARM_DEFINE_FLAGS="-DARCH_AARCH64" + fi AX_CHECK_COMPILE_FLAG(-march=armv8-a+simd, ax_cv_support_neon_ext=yes, []) if test x"$ax_cv_support_neon_ext" = x"yes"; then + ARM_ARCH_FLAGS="$ARM_ARCH_FLAGS+simd" + ARM_DEFINE_FLAGS="$ARM_DEFINE_FLAGS -DARM_NEON" ARM_NEON_FLAGS="-march=armv8-a+simd -DARCH_AARCH64 -DARM_NEON" - AC_SUBST(ARM_NEON_FLAGS) - ARM_FLAGS="$ARM_FLAGS $ARM_NEON_FLAGS" AC_DEFINE(HAVE_NEON,,[Support NEON instructions]) + AC_SUBST(ARM_NEON_FLAGS) + fi + AX_CHECK_COMPILE_FLAG(-march=armv8-a+crc, ax_cv_support_crc_ext=yes, []) + if test x"$ax_cv_support_crc_ext" = x"yes"; then + ARM_ARCH_FLAGS="$ARM_ARCH_FLAGS+crc" + ARM_CRC_FLAGS="-march=armv8-a+crc -DARCH_AARCH64" + AC_DEFINE(HAVE_ARMV8_CRC,,[Support ARMv8 CRC instructions]) + AC_SUBST(ARM_CRC_FLAGS) fi + ARM_FLAGS="$ARM_ARCH_FLAGS $ARM_DEFINE_FLAGS" ;; esac diff --git a/make-debs.sh b/make-debs.sh new file mode 100755 index 0000000000000..076829bb327e8 --- /dev/null +++ b/make-debs.sh @@ -0,0 +1,101 @@ +#!/bin/bash +# +# Copyright (C) 2015 Red Hat +# +# Author: Loic Dachary +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. 
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Library Public License for more details.
+#
+set -xe
+
+base=${1:-/tmp/release}
+codename=$(lsb_release -sc)
+releasedir=$base/$(lsb_release -si)/WORKDIR
+rm -fr $releasedir
+mkdir -p $releasedir
+#
+# remove all files not under git so they are not
+# included in the distribution.
+#
+git clean -dxf
+#
+# git describe provides a version that is
+# a) human readable
+# b) is unique for each commit
+# c) compares higher than any previous commit
+# d) contains the short hash of the commit
+#
+vers=$(git describe --match "v*" | sed s/^v//)
+#
+# creating the distribution tarball requires some configure
+# options (otherwise parts of the source tree will be left out).
+#
+./autogen.sh
+./configure --with-rocksdb --with-ocf \
+    --with-nss --with-debug --enable-cephfs-java \
+    --with-lttng --with-babeltrace
+#
+# use distdir= to set the name of the top level directory of the
+# tarball to match the desired version
+#
+make distdir=ceph-$vers dist
+#
+# rename the tarball to match debian conventions and extract it
+#
+mv ceph-$vers.tar.gz $releasedir/ceph_$vers.orig.tar.gz
+tar -C $releasedir -zxf $releasedir/ceph_$vers.orig.tar.gz
+#
+# copy the debian directory over and remove -dbg packages
+# because they are large and take time to build
+#
+cp -a debian $releasedir/ceph-$vers/debian
+cd $releasedir
+perl -ni -e 'print if(!(/^Package: .*-dbg$/../^$/))' ceph-$vers/debian/control
+perl -pi -e 's/--dbg-package.*//' ceph-$vers/debian/rules
+#
+# always set the debian version to 1 which is ok because the debian
+# directory is included in the sources and the upstream version will
+# change each time it is modified.
+#
+dvers="$vers-1"
+#
+# update the changelog to match the desired version
+#
+cd ceph-$vers
+chvers=$(head -1 debian/changelog | perl -ne 's/.*\(//; s/\).*//; print')
+if [ "$chvers" != "$dvers" ]; then
+   DEBEMAIL="contact@ceph.com" dch -D $codename --force-distribution -b -v "$dvers" "new version"
+fi
+#
+# create the packages
+# a) with ccache to speed things up when building repeatedly
+# b) do not sign the packages
+# c) use half of the available processors
+#
+: ${NPROC:=$(($(nproc) / 2))}
+if test $NPROC -gt 1 ; then
+    j=-j${NPROC}
+fi
+PATH=/usr/lib/ccache:$PATH dpkg-buildpackage $j -uc -us
+cd ../..
+mkdir -p $codename/conf
+cat > $codename/conf/distributions < $codename/version
diff --git a/make_dist.sh b/make_dist.sh
new file mode 100755
index 0000000000000..044ad8316ff60
--- /dev/null
+++ b/make_dist.sh
@@ -0,0 +1,36 @@
+#!/bin/sh -e
+
+if [ ! -d .git ]; then
+    echo "no .git present.  run this from the base dir of the git checkout."
+    exit 1
+fi
+
+version=$1
+[ -z "$version" ] && version=`git describe --match 'v*' | sed 's/^v//'`
+outfile="ceph-$version"
+
+echo "version $version"
+
+# update submodules
+echo "updating submodules..."
+force=$(if git submodule usage 2>&1 | grep --quiet 'update.*--force'; then echo --force ; fi)
+if ! git submodule sync || ! git submodule update $force --init --recursive; then
+    echo "Error: could not initialize submodule projects"
+    echo "  Network connectivity might be required."
+    exit 1
+fi
+
+# clean out old cruft...
+echo "cleanup..."
+rm -f $outfile.tar $outfile.tar.gz
+
+# build new tarball
+echo "building tarball..."
+bin/git-archive-all.sh --prefix ceph-$version/ \
+	--verbose \
+	--ignore corpus \
+	$outfile.tar
+echo "compressing..."
+bzip2 -9 $outfile.tar + +echo "done." diff --git a/man/.gitignore b/man/.gitignore index 5fc607b9e2fba..b60d7fc395a29 100644 --- a/man/.gitignore +++ b/man/.gitignore @@ -1 +1,3 @@ /Makefile +/*.8 +/doctrees diff --git a/man/CMakeLists.txt b/man/CMakeLists.txt new file mode 100644 index 0000000000000..ea0ef47609c94 --- /dev/null +++ b/man/CMakeLists.txt @@ -0,0 +1,31 @@ +install(FILES + ceph-osd.8 + ceph-mds.8 + ceph-mon.8 + ceph-fuse.8 + ceph-syn.8 + crushtool.8 + osdmaptool.8 + monmaptool.8 + ceph-conf.8 + ceph-run.8 + ceph.8 + mount.ceph.8 + ceph-create-keys.8 + radosgw.8 + radosgw-admin.8 + ceph-authtool.8 + rados.8 + librados-config.8 + rbd.8 + ceph-clsinfo.8 + ceph-debugpack.8 + cephfs.8 + ceph-dencoder.8 + ceph-rest-api.8 + ceph-rbdnamer.8 + ceph-post-file.8 + rbd-fuse.8 + rbd-replay.8 + rbd-replay-prep.8 + DESTINATION ${CEPH_MAN_DIR}/man8) diff --git a/man/Makefile-client.am b/man/Makefile-client.am new file mode 100644 index 0000000000000..14200f5a8ce22 --- /dev/null +++ b/man/Makefile-client.am @@ -0,0 +1,39 @@ +dist_man_MANS += \ + ceph-syn.8 \ + ceph-conf.8 \ + ceph.8 \ + ceph-authtool.8 \ + rados.8 \ + rbd.8 \ + ceph-post-file.8 \ + ceph-dencoder.8 + +if WITH_RADOS +dist_man_MANS += \ + librados-config.8 +endif + +if WITH_RBD +dist_man_MANS += \ + ceph-rbdnamer.8 \ + rbd-replay.8 \ + rbd-replay-many.8 \ + rbd-replay-prep.8 +endif + +if WITH_CEPHFS +dist_man_MANS += \ + cephfs.8 +endif + +if WITH_FUSE +dist_man_MANS += \ + rbd-fuse.8 \ + ceph-fuse.8 +endif + +if WITH_RADOSGW +dist_man_MANS += \ + radosgw.8 \ + radosgw-admin.8 +endif diff --git a/man/Makefile-server.am b/man/Makefile-server.am new file mode 100644 index 0000000000000..6739c44e3221b --- /dev/null +++ b/man/Makefile-server.am @@ -0,0 +1,29 @@ +dist_man_MANS += \ + ceph-deploy.8 \ + crushtool.8 \ + ceph-run.8 \ + mount.ceph.8 \ + ceph-create-keys.8 \ + ceph-rest-api.8 \ + ceph-debugpack.8 \ + ceph_selinux.8 + +if WITH_MON +dist_man_MANS += \ + ceph-mon.8 \ + monmaptool.8 +endif + +if WITH_OSD +dist_man_MANS += \ + ceph-clsinfo.8 \ + ceph-detect-init.8 \ + ceph-disk.8 \ + ceph-osd.8 \ + osdmaptool.8 +endif + +if WITH_MDS +dist_man_MANS += \ + ceph-mds.8 +endif diff --git a/man/Makefile.am b/man/Makefile.am index 14bab49f17e09..eb8edcbad7d25 100644 --- a/man/Makefile.am +++ b/man/Makefile.am @@ -1,33 +1,31 @@ AUTOMAKE_OPTIONS = gnu -dist_man_MANS = \ - ceph-disk.8 \ - ceph-osd.8 \ - ceph-mds.8 \ - ceph-mon.8 \ - ceph-fuse.8 \ - ceph-syn.8 \ - crushtool.8 \ - osdmaptool.8 \ - monmaptool.8 \ - ceph-conf.8 \ - ceph-run.8 \ - ceph.8 \ - mount.ceph.8 \ - radosgw.8 \ - radosgw-admin.8 \ - ceph-authtool.8 \ - rados.8 \ - librados-config.8 \ - rbd.8 \ - ceph-clsinfo.8 \ - ceph-debugpack.8 \ - cephfs.8 \ - ceph-dencoder.8 \ - ceph-rest-api.8 \ - ceph-rbdnamer.8 \ - ceph-post-file.8 \ - rbd-fuse.8 \ - rbd-replay.8 \ - rbd-replay-many.8 \ - rbd-replay-prep.8 +EXTRA_DIST = conf.py + +dist_man_MANS = + +if WITH_MAN_PAGES +if ENABLE_CLIENT +include Makefile-client.am +endif + +if ENABLE_SERVER +include Makefile-server.am +endif + +# prevent `make` from running in parallel, sphinx runs better in batch mode. 
+.PHONY: sphinx-build.stamp + +$(dist_man_MANS): sphinx-build.stamp + +# in a tree populated from dist tarball, the $(top_srcdir)/doc is not included +sphinx-build.stamp: + if [ -d $(top_srcdir)/doc/man ] ; then \ + ${SPHINX_BUILD} -b man -d doctrees -c $(top_srcdir)/man $(top_srcdir)/doc/man $(top_builddir)/man; \ + fi + +clean-local: + @rm -rf doctrees + +MAINTAINERCLEANFILES = $(dist_man_MANS) +endif diff --git a/man/ceph-authtool.8 b/man/ceph-authtool.8 deleted file mode 100644 index b28ed34dda73f..0000000000000 --- a/man/ceph-authtool.8 +++ /dev/null @@ -1,299 +0,0 @@ -.\" Man page generated from reStructuredText. -. -.TH "CEPH-AUTHTOOL" "8" "November 30, 2014" "dev" "Ceph" -.SH NAME -ceph-authtool \- ceph keyring manipulation tool -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -.SH SYNOPSIS -.nf -\fBceph\-authtool\fP \fIkeyringfile\fP [ \-l | \-\-list ] [ \-C | \-\-create\-keyring -] [ \-p | \-\-print ] [ \-n | \-\-name \fIentityname\fP ] [ \-\-gen\-key ] [ \-a | -\-\-add\-key \fIbase64_key\fP ] [ \-\-caps \fIcapfile\fP ] -.fi -.sp -.SH DESCRIPTION -.sp -\fBceph\-authtool\fP is a utility to create, view, and modify a Ceph keyring -file. A keyring file stores one or more Ceph authentication keys and -possibly an associated capability specification. Each key is -associated with an entity name, of the form -\fB{client,mon,mds,osd}.name\fP\&. -.sp -\fBWARNING\fP Ceph provides authentication and protection against -man\-in\-the\-middle attacks once secret keys are in place. However, -data over the wire is not encrypted, which may include the messages -used to configure said keys. The system is primarily intended to be -used in trusted environments. -.SH OPTIONS -.INDENT 0.0 -.TP -.B \-l, \-\-list -will list all keys and capabilities present in the keyring -.UNINDENT -.INDENT 0.0 -.TP -.B \-p, \-\-print -will print an encoded key for the specified entityname. 
This is -suitable for the \fBmount \-o secret=\fP argument -.UNINDENT -.INDENT 0.0 -.TP -.B \-C, \-\-create\-keyring -will create a new keyring, overwriting any existing keyringfile -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-gen\-key -will generate a new secret key for the specified entityname -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-add\-key -will add an encoded key to the keyring -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-cap subsystem capability -will set the capability for given subsystem -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-caps capsfile -will set all of capabilities associated with a given key, for all subsystems -.UNINDENT -.SH CAPABILITIES -.sp -The subsystem is the name of a Ceph subsystem: \fBmon\fP, \fBmds\fP, or -\fBosd\fP\&. -.sp -The capability is a string describing what the given user is allowed -to do. This takes the form of a comma separated list of allow -clauses with a permission specifier containing one or more of rwx for -read, write, and execute permission. The \fBallow *\fP grants full -superuser permissions for the given subsystem. -.sp -For example: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -# can read, write, and execute objects -osd = "allow rwx" - -# can access mds server -mds = "allow" - -# can modify cluster state (i.e., is a server daemon) -mon = "allow rwx" -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -A librados user restricted to a single pool might look like: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -mon = "allow r" - -osd = "allow rw pool foo" -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -A client using rbd with read access to one pool and read/write access to another: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -mon = "allow r" - -osd = "allow class\-read object_prefix rbd_children, allow pool templates r class\-read, allow pool vms rwx" -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -A client mounting the file system with minimal permissions would need caps like: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -mds = "allow" - -osd = "allow rw pool data" - -mon = "allow r" -.ft P -.fi -.UNINDENT -.UNINDENT -.SH OSD CAPABILITIES -.sp -In general, an osd capability follows the grammar: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -osdcap := grant[,grant...] -grant := allow (match capspec | capspec match) -match := [pool[=] | object_prefix ] -capspec := * | [r][w][x] [class\-read] [class\-write] -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -The capspec determines what kind of operations the entity can perform: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -r = read access to objects -w = write access to objects -x = can call any class method (same as class\-read class\-write) -class\-read = can call class methods that are reads -class\-write = can call class methods that are writes -* = equivalent to rwx, plus the ability to run osd admin commands, - i.e. ceph osd tell ... -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -The match criteria restrict a grant based on the pool being accessed. -Grants are additive if the client fulfills the match condition. For -example, if a client has the osd capabilities: "allow r object_prefix -prefix, allow w pool foo, allow x pool bar", then it has rw access to -pool foo, rx access to pool bar, and r access to objects whose -names begin with \(aqprefix\(aq in any pool. -.SH CAPS FILE FORMAT -.sp -The caps file format consists of zero or more key/value pairs, one per -line. The key and value are separated by an \fB=\fP, and the value must -be quoted (with \fB\(aq\fP or \fB"\fP) if it contains any whitespace. 
The key -is the name of the Ceph subsystem (\fBosd\fP, \fBmds\fP, \fBmon\fP), and the -value is the capability string (see above). -.SH EXAMPLE -.sp -To create a new keyring containing a key for client.foo: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -ceph\-authtool \-C \-n client.foo \-\-gen\-key keyring -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -To associate some capabilities with the key (namely, the ability to -mount a Ceph filesystem): -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -ceph\-authtool \-n client.foo \-\-cap mds \(aqallow\(aq \-\-cap osd \(aqallow rw pool=data\(aq \-\-cap mon \(aqallow r\(aq keyring -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -To display the contents of the keyring: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -ceph\-authtool \-l keyring -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -When mounting a Ceph file system, you can grab the appropriately encoded secret key with: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -mount \-t ceph serverhost:/ mountpoint \-o name=foo,secret=\(gaceph\-authtool \-p \-n client.foo keyring\(ga -.ft P -.fi -.UNINDENT -.UNINDENT -.SH AVAILABILITY -.sp -\fBceph\-authtool\fP is part of the Ceph distributed storage system. Please -refer to the Ceph documentation at \fI\%http://ceph.com/docs\fP for more -information. -.SH SEE ALSO -.sp -\fBceph\fP(8) -.SH COPYRIGHT -2010-2014, Inktank Storage, Inc. and contributors. Licensed under Creative Commons BY-SA -.\" Generated by docutils manpage writer. -. diff --git a/man/ceph-clsinfo.8 b/man/ceph-clsinfo.8 deleted file mode 100644 index 4fb82748a37dc..0000000000000 --- a/man/ceph-clsinfo.8 +++ /dev/null @@ -1,96 +0,0 @@ -.\" Man page generated from reStructuredText. -. -.TH "CEPH-CLSINFO" "8" "January 12, 2014" "dev" "Ceph" -.SH NAME -ceph-clsinfo \- show class object information -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -.SH SYNOPSIS -.nf -\fBceph\-clsinfo\fP [ \fIoptions\fP ] ... \fIfilename\fP -.fi -.sp -.SH DESCRIPTION -.sp -\fBceph\-clsinfo\fP can show name, version, and architecture information -about a specific class object. 
-.SH OPTIONS -.INDENT 0.0 -.TP -.B \-n, \-\-name -Shows the class name -.UNINDENT -.INDENT 0.0 -.TP -.B \-v, \-\-version -Shows the class version -.UNINDENT -.INDENT 0.0 -.TP -.B \-a, \-\-arch -Shows the class architecture -.UNINDENT -.SH AVAILABILITY -.sp -\fBceph\-clsinfo\fP is part of the Ceph distributed storage system. Please -refer to the Ceph documentation at \fI\%http://ceph.com/docs\fP for more -information. -.SH SEE ALSO -.sp -\fBceph\fP(8) -.SH COPYRIGHT -2010-2014, Inktank Storage, Inc. and contributors. Licensed under Creative Commons BY-SA -.\" Generated by docutils manpage writer. -. diff --git a/man/ceph-conf.8 b/man/ceph-conf.8 deleted file mode 100644 index 754f7fc953f4d..0000000000000 --- a/man/ceph-conf.8 +++ /dev/null @@ -1,159 +0,0 @@ -.\" Man page generated from reStructuredText. -. -.TH "CEPH-CONF" "8" "January 12, 2014" "dev" "Ceph" -.SH NAME -ceph-conf \- ceph conf file tool -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -.SH SYNOPSIS -.nf -\fBceph\-conf\fP \-c \fIconffile\fP \-\-list\-all\-sections -\fBceph\-conf\fP \-c \fIconffile\fP \-L -\fBceph\-conf\fP \-c \fIconffile\fP \-l \fIprefix\fP -\fBceph\-conf\fP \fIkey\fP \-s \fIsection1\fP ... -\fBceph\-conf\fP [\-s \fIsection\fP ] \-\-lookup \fIkey\fP -\fBceph\-conf\fP [\-s \fIsection\fP ] \fIkey\fP -.fi -.sp -.SH DESCRIPTION -.sp -\fBceph\-conf\fP is a utility for getting information about a ceph -configuration file. As with most Ceph programs, you can specify which -Ceph configuration file to use with the \fB\-c\fP flag. -.SH ACTIONS -.sp -\fBceph\-conf\fP will perform one of the following actions: -.sp -\-\-list\-all\-sections or \-L prints out a list of all the section names in the configuration -file. -.sp -\-\-list\-sections or \-l prints out a list of all the sections that begin -with a given prefix. For example, \-\-list\-sections mon would list all -sections beginning with mon. -.sp -\-\-lookup will search the configuration for a given value. By default, the sections that -are searched are determined by the Ceph name that we are using. The Ceph name defaults to -client.admin. It can be specified with \-\-name. 
-.sp -For example, if we specify \-\-name osd.0, the following sections will be searched: -[osd.0], [osd], [global] -.sp -You can specify additional sections to search with \-\-section or \-s. These additional -sections will be searched before the sections that would normally be searched. As always, -the first matching entry we find will be returned. -.sp -Note: \-\-lookup is the default action. If no other actions are given on the command line, -we will default to doing a lookup. -.SH EXAMPLES -.sp -To find out what value osd 0 will use for the "osd data" option: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -ceph\-conf \-c foo.conf \-\-name osd.0 \-\-lookup "osd data" -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -To find out what value will mds a use for the "log file" option: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -ceph\-conf \-c foo.conf \-\-name mds.a "log file" -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -To list all sections that begin with osd: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -ceph\-conf \-c foo.conf \-l osd -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -To list all sections: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -ceph\-conf \-c foo.conf \-L -.ft P -.fi -.UNINDENT -.UNINDENT -.SH AVAILABILITY -.sp -\fBceph\-conf\fP is part of the Ceph distributed storage system. Please refer -to the Ceph documentation at \fI\%http://ceph.com/docs\fP for more -information. -.SH SEE ALSO -.sp -\fBceph\fP(8), -.SH COPYRIGHT -2010-2014, Inktank Storage, Inc. and contributors. Licensed under Creative Commons BY-SA -.\" Generated by docutils manpage writer. -. diff --git a/man/ceph-debugpack.8 b/man/ceph-debugpack.8 deleted file mode 100644 index 9fc016ff6ff31..0000000000000 --- a/man/ceph-debugpack.8 +++ /dev/null @@ -1,95 +0,0 @@ -.\" Man page generated from reStructuredText. -. -.TH "CEPH-DEBUGPACK" "8" "January 12, 2014" "dev" "Ceph" -.SH NAME -ceph-debugpack \- ceph debug packer utility -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -.SH SYNOPSIS -.nf -\fBceph\-debugpack\fP [ \fIoptions\fP ] \fIfilename.tar.gz\fP -.fi -.sp -.SH DESCRIPTION -.sp -\fBceph\-debugpack\fP will build a tarball containing various items that are -useful for debugging crashes. The resulting tarball can be shared with -Ceph developers when debugging a problem. 
-.sp -The tarball will include the binaries for ceph\-mds, ceph\-osd, ceph\-mon, and radosgw, any -log files, the ceph.conf configuration file, any core files we can -find, and (if the system is running) dumps of the current cluster state -as reported by \(aqceph report\(aq. -.SH OPTIONS -.INDENT 0.0 -.TP -.B \-c ceph.conf, \-\-conf=ceph.conf -Use \fIceph.conf\fP configuration file instead of the default -\fB/etc/ceph/ceph.conf\fP to determine monitor addresses during -startup. -.UNINDENT -.SH AVAILABILITY -.sp -\fBceph\-debugpack\fP is part of the Ceph distributed storage system. Please -refer to the Ceph documentation at \fI\%http://ceph.com/docs\fP for more -information. -.SH SEE ALSO -.sp -\fBceph\fP(8) -\fBceph\-post\-file\fP(8) -.SH COPYRIGHT -2010-2014, Inktank Storage, Inc. and contributors. Licensed under Creative Commons BY-SA -.\" Generated by docutils manpage writer. -. diff --git a/man/ceph-dencoder.8 b/man/ceph-dencoder.8 deleted file mode 100644 index 51b2704535cfb..0000000000000 --- a/man/ceph-dencoder.8 +++ /dev/null @@ -1,193 +0,0 @@ -.\" Man page generated from reStructuredText. -. -.TH "CEPH-DENCODER" "8" "January 12, 2014" "dev" "Ceph" -.SH NAME -ceph-dencoder \- ceph encoder/decoder utility -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -.SH SYNOPSIS -.nf -\fBceph\-dencoder\fP [commands...] -.fi -.sp -.SH DESCRIPTION -.sp -\fBceph\-dencoder\fP is a utility to encode, decode, and dump ceph data -structures. It is used for debugging and for testing inter\-version -compatibility. -.sp -\fBceph\-dencoder\fP takes a simple list of commands and performs them -in order. -.SH COMMANDS -.INDENT 0.0 -.TP -.B version -Print the version string for the \fBceph\-dencoder\fP binary. -.UNINDENT -.INDENT 0.0 -.TP -.B import -Read a binary blob of encoded data from the given file. It will be -placed in an in\-memory buffer. -.UNINDENT -.INDENT 0.0 -.TP -.B export -Write the contents of the current in\-memory buffer to the given -file. -.UNINDENT -.INDENT 0.0 -.TP -.B list_types -List the data types known to this build of \fBceph\-dencoder\fP\&. -.UNINDENT -.INDENT 0.0 -.TP -.B type -Select the given type for future \fBencode\fP or \fBdecode\fP operations.
-.UNINDENT -.INDENT 0.0 -.TP -.B decode -Decode the contents of the in\-memory buffer into an instance of the -previously selected type. If there is an error, report it. -.UNINDENT -.INDENT 0.0 -.TP -.B encode -Encode the contents of the in\-memory instance of the previously -selected type to the in\-memory buffer. -.UNINDENT -.INDENT 0.0 -.TP -.B dump_json -Print a JSON\-formatted description of the in\-memory object. -.UNINDENT -.INDENT 0.0 -.TP -.B count_tests -Print the number of built\-in test instances of the previously -selected type that \fBceph\-dencoder\fP is able to generate. -.UNINDENT -.INDENT 0.0 -.TP -.B select_test -Select the given built\-in test instance as the in\-memory instance -of the type. -.UNINDENT -.INDENT 0.0 -.TP -.B get_features -Print the decimal value of the feature set supported by this version -of \fBceph\-dencoder\fP\&. Each bit represents a feature. These correspond to -CEPH_FEATURE_* defines in src/include/ceph_features.h. -.UNINDENT -.INDENT 0.0 -.TP -.B set_features -Set the feature bits provided to \fBencode\fP to \fIf\fP\&. This allows -you to encode objects such that they can be understood by old -versions of the software (for those types that support it). -.UNINDENT -.SH EXAMPLE -.sp -Say you want to examine an attribute on an object stored by \fBceph\-osd\fP\&. You can do: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -$ cd /mnt/osd.12/current/2.b_head -$ attr \-l foo_bar_head_EFE6384B -Attribute "ceph.snapset" has a 31 byte value for foo_bar_head_EFE6384B -Attribute "ceph._" has a 195 byte value for foo_bar_head_EFE6384B -$ attr foo_bar_head_EFE6384B \-g ceph._ \-q > /tmp/a -$ ceph\-dencoder type object_info_t import /tmp/a decode dump_json -{ "oid": { "oid": "foo", - "key": "bar", - "snapid": \-2, - "hash": 4024842315, - "max": 0}, - "locator": { "pool": 2, - "preferred": \-1, - "key": "bar"}, - "category": "", - "version": "9\(aq1", - "prior_version": "0\(aq0", - "last_reqid": "client.4116.0:1", - "size": 1681, - "mtime": "2012\-02\-21 08:58:23.666639", - "lost": 0, - "wrlock_by": "unknown.0.0:0", - "snaps": [], - "truncate_seq": 0, - "truncate_size": 0, - "watchers": {}} -.ft P -.fi -.UNINDENT -.UNINDENT -.SH AVAILABILITY -.sp -\fBceph\-dencoder\fP is part of the Ceph distributed storage system. Please -refer to the Ceph documentation at \fI\%http://ceph.com/docs\fP for more -information. -.SH SEE ALSO -.sp -\fBceph\fP(8) -.SH COPYRIGHT -2010-2014, Inktank Storage, Inc. and contributors. Licensed under Creative Commons BY-SA -.\" Generated by docutils manpage writer. -. diff --git a/man/ceph-disk.8 b/man/ceph-disk.8 deleted file mode 100644 index 32d98c5360e39..0000000000000 --- a/man/ceph-disk.8 +++ /dev/null @@ -1,272 +0,0 @@ -.\" Man page generated from reStructuredText. -. -.TH "CEPH-DISK" "8" "November 26, 2014" "dev" "Ceph" -.SH NAME -ceph-disk \- Ceph disk preparation and activation utility for OSD -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. 
RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -.SH SYNOPSIS -.nf -\fBceph\-disk\fP \fBprepare\fP [\-\-cluster \fIclustername\fP] [\-\-cluster\-uuid \fIuuid\fP] -[\-\-fs\-type \fIxfs|ext4|btrfs\fP] [\fIdata\-path\fP] [\fIjournal\-path\fP] -.fi -.sp -.nf -\fBceph\-disk\fP \fBactivate\fP [\fIdata\-path\fP] [\-\-activate\-key \fIpath\fP] -.fi -.sp -.nf -\fBceph\-disk\fP \fBactivate\-all\fP -.fi -.sp -.nf -\fBceph\-disk\fP \fBlist\fP -.fi -.sp -.SH DESCRIPTION -.sp -\fBceph\-disk\fP is a utility that can prepare and activate a disk, partition or -directory as a ceph OSD. It is run directly or triggered by \fBceph\-deploy\fP -or udev. -.sp -It actually automates the multiple steps involved in manual creation and start -of an OSD into 2 steps of preparing and activating the OSD by using the -subcommands \fBprepare\fP and \fBactivate\fP\&. -.SH SUBCOMMANDS -.sp -\fBprepare\fP: Prepare a directory, disk or drive for a ceph OSD. It creates a GPT -partition, marks the partition with ceph type uuid, creates a file system, marks -the file system as ready for ceph consumption, uses entire partition and adds a -new partition to the journal disk. It is run directly or triggered by -\fBceph\-deploy\fP\&. -.sp -Usage: ceph\-disk prepare \-\-cluster [cluster\-name] \-\-cluster\-uuid [uuid] \-\-fs\-type -[ext4|xfs|btrfs] [data\-path] [journal\-path] -.sp -Other options like \-\-osd\-uuid, \-\-journal\-uuid, \-\-zap\-disk, \-\-data\-dir, \-\-data\-dev, -\-\-journal\-file, \-\-journal\-dev, \-\-dmcrypt and \-\-dmcrypt\-key\-dir can also be used -with the subcommand. -.sp -\fBactivate\fP: Activate the ceph OSD. It mounts the volume in a temporary -location, allocates an OSD id (if needed), remounts in the correct location -/var/lib/ceph/osd/$cluster\-$id and starts ceph\-osd. It is triggered by udev -when it sees the OSD GPT partition type or on ceph service start with -\(aqceph disk activate\-all\(aq. It is also run directly or triggered by \fBceph\-deploy\fP\&. -.sp -Usage: ceph\-disk activate [PATH] -.sp -Here, [PATH] is path to block device or directory. -.sp -An additional option [\-\-activate\-key PATH] has to be used with this subcommand -when a copy of /var/lib/ceph/bootstrap\-osd/{cluster}.keyring isn\(aqt present in the -OSD node. -.sp -Usage: ceph\-disk activate [PATH] [\-\-activate\-key PATH] -.sp -Another option \-\-mark\-init can also be used with this subcommand. -.sp -\fBactivate\-journal\fP: Activate an OSD via it\(aqs journal device. udev triggers -\(aqceph\-disk activate\-journal \(aq based on the partition type. -.sp -Usage: ceph\-disk activate\-journal [DEV] -.sp -Here, [DEV] is the path to journal block device. 
-.sp -Other options, like \-\-activate\-key and \-\-mark\-init, can also be used with this -subcommand. -.sp -Usage: ceph\-disk activate\-journal [\-\-activate\-key PATH] [\-\-mark\-init INITSYSTEM] -[DEV] -.sp -\fBactivate\-all\fP: Activate all tagged OSD partitions. activate\-all relies on -/dev/disk/by\-parttype\-uuid/$typeuuid.$uuid to find all partitions. Special udev -rules are installed to create these links. It is triggered on ceph service start -or run directly. -.sp -Usage: ceph\-disk activate\-all -.sp -Other options, like \-\-activate\-key and \-\-mark\-init, can also be used with this -subcommand. -.sp -Usage: ceph\-disk activate\-all [\-\-activate\-key PATH] [\-\-mark\-init INITSYSTEM] -.sp -\fBlist\fP: List disk partitions and ceph OSDs. It is run directly or triggered -by \fBceph\-deploy\fP\&. -.sp -Usage: ceph\-disk list -.sp -\fBsuppress\-activate\fP: Suppress activate on a device (prefix). -Mark devices that you want to suppress activate with a file like -/var/lib/ceph/tmp/suppress\-activate.sdb where the last bit is -the sanitized device name (/dev/X without the /dev/ prefix). A -function is_suppressed() checks for and matches a prefix (/dev/). -This means suppressing sdb will stop activate on sdb1, sdb2, etc. -.sp -Usage: ceph\-disk suppress\-activate [PATH] -.sp -Here, [PATH] is path to block device or directory. -.sp -\fBunsuppress\-activate\fP: Stop suppressing activate on a device (prefix). -.sp -Usage: ceph\-disk unsuppress\-activate [PATH] -.sp -Here, [PATH] is path to block device or directory. -.sp -\fBzap\fP: Zap/erase/destroy a device\(aqs partition table and contents. -It actually uses \(aqsgdisk\(aq and its option \(aq\-\-zap\-all\(aq to destroy both -GPT and MBR data structures so that the disk becomes suitable for -repartitioning. \(aqsgdisk\(aq then uses \(aq\-\-mbrtogpt\(aq to convert the MBR or -BSD disklabel disk to a GPT disk. The \fBprepare\fP subcommand can now be -executed which will create a new GPT partition. It is also run directly -or triggered by \fBceph\-deploy\fP\&. -.sp -Usage: ceph\-disk zap [DEV] -.sp -Here, [DEV] is path to block device. -.SH OPTIONS -.INDENT 0.0 -.TP -.B \-\-prepend\-to\-path PATH -Prepend PATH to $PATH for backward compatibility (default /usr/bin). -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-statedir PATH -Directory in which ceph configuration is preserved (default /usr/lib/ceph). -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-sysconfdir PATH -Directory in which ceph configuration files are found (default /etc/ceph). -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-cluster -Provide name of the ceph cluster in which the OSD is being prepared. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-cluster\-uuid -Provide uuid of the ceph cluster in which the OSD is being prepared. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-fs\-type -Provide the filesystem type for the OSD, e.g. \(aqxfs/ext4/btrfs\(aq. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-osd\-uuid -Unique OSD uuid to assign to the disk. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-journal\-uuid -Unique uuid to assign to the journal. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-zap\-disk -Destroy the partition table and content of a disk. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-data\-dir -Verify that [data\-path] is a directory. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-data\-dev -Verify that [data\-path] is a block device. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-journal\-file -Verify that the journal is a file. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-journal\-dev -Verify that the journal is a block device.
-.UNINDENT -.INDENT 0.0 -.TP -.B \-\-dmcrypt -Encrypt [data\-path] and/or journal devices with dm\-crypt. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-dmcrypt\-key\-dir -Directory where dm\-crypt keys are stored. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-activate\-key -Use when a copy of /var/lib/ceph/bootstrap\-osd/{cluster}.keyring isn\(aqt -present in the OSD node. Follow the option with the path to the keyring. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-mark\-init -Provide init system to manage the OSD directory. -.UNINDENT -.SH AVAILABILITY -.sp -\fBceph\-disk\fP is a part of the Ceph distributed storage system. Please refer to -the Ceph documentation at \fI\%http://ceph.com/docs\fP for more information. -.SH COPYRIGHT -2010-2014, Inktank Storage, Inc. and contributors. Licensed under Creative Commons BY-SA -.\" Generated by docutils manpage writer. -. diff --git a/man/ceph-fuse.8 b/man/ceph-fuse.8 deleted file mode 100644 index f41b7b911a4c9..0000000000000 --- a/man/ceph-fuse.8 +++ /dev/null @@ -1,120 +0,0 @@ -.\" Man page generated from reStructuredText. -. -.TH "CEPH-FUSE" "8" "January 12, 2014" "dev" "Ceph" -.SH NAME -ceph-fuse \- FUSE-based client for ceph -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -.SH SYNOPSIS -.nf -\fBceph\-fuse\fP [ \-m \fImonaddr\fP:\fIport\fP ] \fImountpoint\fP [ \fIfuse options\fP ] -.fi -.sp -.SH DESCRIPTION -.sp -\fBceph\-fuse\fP is a FUSE (File system in USErspace) client for the Ceph -distributed file system. It will mount a ceph file system (specified -via the \-m option or described by ceph.conf; see below) at the -specified mount point. -.sp -The file system can be unmounted with: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -fusermount \-u mountpoint -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -or by sending \fBSIGINT\fP to the \fBceph\-fuse\fP process. -.SH OPTIONS -.sp -Any options not recognized by ceph\-fuse will be passed on to libfuse. -.INDENT 0.0 -.TP -.B \-d -Detach from console and daemonize after startup. -.UNINDENT -.INDENT 0.0 -.TP -.B \-c ceph.conf, \-\-conf=ceph.conf -Use \fIceph.conf\fP configuration file instead of the default -\fB/etc/ceph/ceph.conf\fP to determine monitor addresses during startup.
-.UNINDENT -.INDENT 0.0 -.TP -.B \-m monaddress[:port] -Connect to specified monitor (instead of looking through ceph.conf). -.UNINDENT -.INDENT 0.0 -.TP -.B \-r root_directory -Use root_directory as the mounted root, rather than the full Ceph tree. -.UNINDENT -.SH AVAILABILITY -.sp -\fBceph\-fuse\fP is part of the Ceph distributed storage system. Please refer to -the Ceph documentation at \fI\%http://ceph.com/docs\fP for more information. -.SH SEE ALSO -.sp -fusermount(8), -\fBceph\fP(8) -.SH COPYRIGHT -2010-2014, Inktank Storage, Inc. and contributors. Licensed under Creative Commons BY-SA -.\" Generated by docutils manpage writer. -. diff --git a/man/ceph-mds.8 b/man/ceph-mds.8 deleted file mode 100644 index 398b0102debd9..0000000000000 --- a/man/ceph-mds.8 +++ /dev/null @@ -1,121 +0,0 @@ -.\" Man page generated from reStructuredText. -. -.TH "CEPH-MDS" "8" "January 12, 2014" "dev" "Ceph" -.SH NAME -ceph-mds \- ceph metadata server daemon -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -.SH SYNOPSIS -.nf -\fBceph\-mds\fP \-i \fIname\fP [[ \-\-hot\-standby [\fIrank\fP] ]|[\-\-journal_check \fIrank\fP]] -.fi -.sp -.SH DESCRIPTION -.sp -\fBceph\-mds\fP is the metadata server daemon for the Ceph distributed file -system. One or more instances of ceph\-mds collectively manage the file -system namespace, coordinating access to the shared OSD cluster. -.sp -Each ceph\-mds daemon instance should have a unique name. The name is used -to identify daemon instances in the ceph.conf. -.sp -Once the daemon has started, the monitor cluster will normally assign -it a logical rank, or put it in a standby pool to take over for -another daemon that crashes. Some of the specified options can cause -other behaviors. -.sp -If you specify hot\-standby or journal\-check, you must either specify -the rank on the command line, or specify one of the -mds_standby_for_[rank|name] parameters in the config. The command -line specification overrides the config, and specifying the rank -overrides specifying the name. -.SH OPTIONS -.INDENT 0.0 -.TP -.B \-f, \-\-foreground -Foreground: do not daemonize after startup (run in foreground). Do -not generate a pid file. Useful when run via \fBceph\-run\fP(8). 
-.UNINDENT -.INDENT 0.0 -.TP -.B \-d -Debug mode: like \fB\-f\fP, but also send all log output to stderr. -.UNINDENT -.INDENT 0.0 -.TP -.B \-c ceph.conf, \-\-conf=ceph.conf -Use \fIceph.conf\fP configuration file instead of the default -\fB/etc/ceph/ceph.conf\fP to determine monitor addresses during -startup. -.UNINDENT -.INDENT 0.0 -.TP -.B \-m monaddress[:port] -Connect to specified monitor (instead of looking through -\fBceph.conf\fP). -.UNINDENT -.SH AVAILABILITY -.sp -\fBceph\-mds\fP is part of the Ceph distributed storage system. Please refer to the Ceph documentation at -\fI\%http://ceph.com/docs\fP for more information. -.SH SEE ALSO -.sp -\fBceph\fP(8), -\fBceph\-mon\fP(8), -\fBceph\-osd\fP(8) -.SH COPYRIGHT -2010-2014, Inktank Storage, Inc. and contributors. Licensed under Creative Commons BY-SA -.\" Generated by docutils manpage writer. -. diff --git a/man/ceph-mon.8 b/man/ceph-mon.8 deleted file mode 100644 index 64e3bf9c1f323..0000000000000 --- a/man/ceph-mon.8 +++ /dev/null @@ -1,137 +0,0 @@ -.\" Man page generated from reStructuredText. -. -.TH "CEPH-MON" "8" "January 12, 2014" "dev" "Ceph" -.SH NAME -ceph-mon \- ceph monitor daemon -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -.SH SYNOPSIS -.nf -\fBceph\-mon\fP \-i \fImonid\fP [ \-\-mon\-data \fImondatapath\fP ] -.fi -.sp -.SH DESCRIPTION -.sp -\fBceph\-mon\fP is the cluster monitor daemon for the Ceph distributed -file system. One or more instances of \fBceph\-mon\fP form a Paxos -part\-time parliament cluster that provides extremely reliable and -durable storage of cluster membership, configuration, and state. -.sp -The \fImondatapath\fP refers to a directory on a local file system storing -monitor data. It is normally specified via the \fBmon data\fP option in -the configuration file. -.SH OPTIONS -.INDENT 0.0 -.TP -.B \-f, \-\-foreground -Foreground: do not daemonize after startup (run in foreground). Do -not generate a pid file. Useful when run via \fBceph\-run\fP(8). -.UNINDENT -.INDENT 0.0 -.TP -.B \-d -Debug mode: like \fB\-f\fP, but also send all log output to stderr. -.UNINDENT -.INDENT 0.0 -.TP -.B \-c ceph.conf, \-\-conf=ceph.conf -Use \fIceph.conf\fP configuration file instead of the default -\fB/etc/ceph/ceph.conf\fP to determine monitor addresses during -startup.
-.UNINDENT -.INDENT 0.0 -.TP -.B \-\-mkfs -Initialize the \fBmon data\fP directory with seed information to form -an initial ceph file system or to join an existing monitor -cluster. Three pieces of information must be provided: -.INDENT 7.0 -.IP \(bu 2 -The cluster fsid. This can come from a monmap (\fB\-\-monmap \fP) or -explicitly via \fB\-\-fsid \fP\&. -.IP \(bu 2 -A list of monitors and their addresses. This list of monitors -can come from a monmap (\fB\-\-monmap \fP), the \fBmon host\fP -configuration value (in \fIceph.conf\fP or via \fB\-m -host1,host2,...\fP), or \fBmon addr\fP lines in \fIceph.conf\fP\&. If this -monitor is to be part of the initial monitor quorum for a new -Ceph cluster, then it must be included in the initial list, -matching either the name or address of a monitor in the list. -When matching by address, either the \fBpublic addr\fP or \fBpublic -subnet\fP options may be used. -.IP \(bu 2 -The monitor secret key \fBmon.\fP\&. This must be included in the -keyring provided via \fB\-\-keyring \fP\&. -.UNINDENT -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-keyring -Specify a keyring for use with \fB\-\-mkfs\fP\&. -.UNINDENT -.SH AVAILABILITY -.sp -\fBceph\-mon\fP is part of the Ceph distributed storage system. Please refer -to the Ceph documentation at \fI\%http://ceph.com/docs\fP for more -information. -.SH SEE ALSO -.sp -\fBceph\fP(8), -\fBceph\-mds\fP(8), -\fBceph\-osd\fP(8) -.SH COPYRIGHT -2010-2014, Inktank Storage, Inc. and contributors. Licensed under Creative Commons BY-SA -.\" Generated by docutils manpage writer. -. diff --git a/man/ceph-osd.8 b/man/ceph-osd.8 deleted file mode 100644 index 24ec595c08153..0000000000000 --- a/man/ceph-osd.8 +++ /dev/null @@ -1,169 +0,0 @@ -.\" Man page generated from reStructuredText. -. -.TH "CEPH-OSD" "8" "January 12, 2014" "dev" "Ceph" -.SH NAME -ceph-osd \- ceph object storage daemon -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -.SH SYNOPSIS -.nf -\fBceph\-osd\fP \-i \fIosdnum\fP [ \-\-osd\-data \fIdatapath\fP ] [ \-\-osd\-journal -\fIjournal\fP ] [ \-\-mkfs ] [ \-\-mkjournal ] [ \-\-mkkey ] -.fi -.sp -.SH DESCRIPTION -.sp -\fBceph\-osd\fP is the object storage daemon for the Ceph distributed file -system.
It is responsible for storing objects on a local file system -and providing access to them over the network. -.sp -The datapath argument should be a directory on a btrfs file system -where the object data resides. The journal is optional, and is only -useful performance\-wise when it resides on a different disk than -datapath with low latency (ideally, an NVRAM device). -.SH OPTIONS -.INDENT 0.0 -.TP -.B \-f, \-\-foreground -Foreground: do not daemonize after startup (run in foreground). Do -not generate a pid file. Useful when run via \fBceph\-run\fP(8). -.UNINDENT -.INDENT 0.0 -.TP -.B \-d -Debug mode: like \fB\-f\fP, but also send all log output to stderr. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-osd\-data osddata -Use object store at \fIosddata\fP\&. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-osd\-journal journal -Journal updates to \fIjournal\fP\&. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-mkfs -Create an empty object repository. This also initializes the journal -(if one is defined). -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-mkkey -Generate a new secret key. This is normally used in combination -with \fB\-\-mkfs\fP as it is more convenient than generating a key by -hand with \fBceph\-authtool\fP(8). -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-mkjournal -Create a new journal file to match an existing object repository. -This is useful if the journal device or file is wiped out due to a -disk or file system failure. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-flush\-journal -Flush the journal to permanent store. This runs in the foreground -so you know when it\(aqs completed. This can be useful if you want to -resize the journal or need to otherwise destroy it: this guarantees -you won\(aqt lose data. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-get\-cluster\-fsid -Print the cluster fsid (uuid) and exit. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-get\-osd\-fsid -Print the OSD\(aqs fsid and exit. The OSD\(aqs uuid is generated at -\-\-mkfs time and is thus unique to a particular instantiation of -this OSD. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-get\-journal\-fsid -Print the journal\(aqs uuid. The journal fsid is set to match the OSD -fsid at \-\-mkfs time. -.UNINDENT -.INDENT 0.0 -.TP -.B \-c ceph.conf, \-\-conf=ceph.conf -Use \fIceph.conf\fP configuration file instead of the default -\fB/etc/ceph/ceph.conf\fP for runtime configuration options. -.UNINDENT -.INDENT 0.0 -.TP -.B \-m monaddress[:port] -Connect to specified monitor (instead of looking through -\fBceph.conf\fP). -.UNINDENT -.SH AVAILABILITY -.sp -\fBceph\-osd\fP is part of the Ceph distributed storage system. Please refer to -the Ceph documentation at \fI\%http://ceph.com/docs\fP for more information. -.SH SEE ALSO -.sp -\fBceph\fP(8), -\fBceph\-mds\fP(8), -\fBceph\-mon\fP(8), -\fBceph\-authtool\fP(8) -.SH COPYRIGHT -2010-2014, Inktank Storage, Inc. and contributors. Licensed under Creative Commons BY-SA -.\" Generated by docutils manpage writer. -. diff --git a/man/ceph-post-file.8 b/man/ceph-post-file.8 deleted file mode 100644 index a0378215cb0e3..0000000000000 --- a/man/ceph-post-file.8 +++ /dev/null @@ -1,130 +0,0 @@ -.\" Man page generated from reStructuredText. -. -.TH "CEPH-POST-FILE" "8" "January 12, 2014" "dev" "Ceph" -.SH NAME -ceph-post-file \- post files for ceph developers -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. 
RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -.SH SYNOPSIS -.nf -\fBceph\-post\-file\fP [\-d \fIdescription\fP] [\-u \fIuser\fP] \fIfile or dir\fP ... -.fi -.sp -.SH DESCRIPTION -.sp -\fBceph\-post\-file\fP will upload files or directories to ceph.com for -later analysis by Ceph developers. -.sp -Each invocation uploads files or directories to a separate directory -with a unique tag. That tag can be passed to a developer or -referenced in a bug report (\fI\%http://tracker.ceph.com/\fP). Once the -upload completes, the directory is marked non\-readable and -non\-writeable to prevent access or modification by other users. -.SH WARNING -.sp -Basic measures are taken to make posted data visible only to -developers with access to ceph.com infrastructure. However, users -should think twice and/or take appropriate precautions before -posting potentially sensitive data (for example, logs or data -directories that contain Ceph secrets). -.SH OPTIONS -.INDENT 0.0 -.TP -.B \-d \fIdescription\fP, \-\-description \fIdescription\fP -Add a short description for the upload. This is a good opportunity -to reference a bug number. There is no default value. -.UNINDENT -.INDENT 0.0 -.TP -.B \-u \fIuser\fP -Set the user metadata for the upload. This defaults to \(gawhoami\(ga@\(gahostname \-f\(ga\&. -.UNINDENT -.SH EXAMPLES -.sp -To upload a single log: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -ceph\-post\-file /var/log/ceph/ceph\-mon.\(gahostname\(ga.log -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -To upload several directories: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -ceph\-post\-file \-d \(aqmon data directories\(aq /var/log/ceph/mon/* -.ft P -.fi -.UNINDENT -.UNINDENT -.SH AVAILABILITY -.sp -\fBceph\-post\-file\fP is part of the Ceph distributed storage system. Please refer to -the Ceph documentation at \fI\%http://ceph.com/docs\fP for more information. -.SH SEE ALSO -.sp -\fBceph\fP(8), -\fBceph\-debugpack\fP(8) -.SH COPYRIGHT -2010-2014, Inktank Storage, Inc. and contributors. Licensed under Creative Commons BY-SA -.\" Generated by docutils manpage writer. -. diff --git a/man/ceph-rbdnamer.8 b/man/ceph-rbdnamer.8 deleted file mode 100644 index dab2c10655403..0000000000000 --- a/man/ceph-rbdnamer.8 +++ /dev/null @@ -1,92 +0,0 @@ -.\" Man page generated from reStructuredText. -. -.TH "CEPH-RBDNAMER" "8" "January 12, 2014" "dev" "Ceph" -.SH NAME -ceph-rbdnamer \- udev helper to name RBD devices -. -.nr rst2man-indent-level 0 -. 
-.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -.SH SYNOPSIS -.nf -\fBceph\-rbdnamer\fP \fInum\fP -.fi -.sp -.SH DESCRIPTION -.sp -\fBceph\-rbdnamer\fP prints the pool and image name for the given RBD devices -to stdout. It is used by \fIudev\fP (using a rule like the one below) to -set up a device symlink. -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -KERNEL=="rbd[0\-9]*", PROGRAM="/usr/bin/ceph\-rbdnamer %n", SYMLINK+="rbd/%c{1}/%c{2}" -.ft P -.fi -.UNINDENT -.UNINDENT -.SH AVAILABILITY -.sp -\fBceph\-rbdnamer\fP is part of the Ceph distributed storage system. Please -refer to the Ceph documentation at \fI\%http://ceph.com/docs\fP for more -information. -.SH SEE ALSO -.sp -\fBrbd\fP(8), -\fBceph\fP(8) -.SH COPYRIGHT -2010-2014, Inktank Storage, Inc. and contributors. Licensed under Creative Commons BY-SA -.\" Generated by docutils manpage writer. -. diff --git a/man/ceph-rest-api.8 b/man/ceph-rest-api.8 deleted file mode 100644 index 0d443f650ad21..0000000000000 --- a/man/ceph-rest-api.8 +++ /dev/null @@ -1,208 +0,0 @@ -.\" Man page generated from reStructuredText. -. -.TH "CEPH-REST-API" "8" "January 12, 2014" "dev" "Ceph" -.SH NAME -ceph-rest-api \- ceph RESTlike administration server -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. 
RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -.SH SYNOPSIS -.nf -\fBceph\-rest\-api\fP [ \-c \fIconffile\fP ] [\-\-cluster \fIclustername\fP ] [ \-n \fIname\fP ] [\-i \fIid\fP ] -.fi -.sp -.SH DESCRIPTION -.sp -\fBceph\-rest\-api\fP is a WSGI application that can run as a -standalone web service or run under a web server that supports -WSGI. It provides much of the functionality of the \fBceph\fP -command\-line tool through an HTTP\-accessible interface. -.SH OPTIONS -.INDENT 0.0 -.TP -.B \-c/\-\-conf conffile -names the ceph.conf file to use for configuration. If \-c is not -specified, the default depends on the state of the \-\-cluster option -(default \(aqceph\(aq; see below). The configuration file is searched -for in this order: -.INDENT 7.0 -.IP \(bu 2 -$CEPH_CONF -.IP \(bu 2 -/etc/ceph/${cluster}.conf -.IP \(bu 2 -~/.ceph/${cluster}.conf -.IP \(bu 2 -${cluster}.conf (in the current directory) -.UNINDENT -.sp -so you can also pass this option in the environment as CEPH_CONF. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-cluster clustername -set \fIclustername\fP for use in the $cluster metavariable, for -locating the ceph.conf file. The default is \(aqceph\(aq. -.UNINDENT -.INDENT 0.0 -.TP -.B \-n/\-\-name name -specifies the client \(aqname\(aq, which is used to find the -client\-specific configuration options in the config file, and -also is the name used for authentication when connecting -to the cluster (the entity name appearing in ceph auth list output, -for example). The default is \(aqclient.restapi\(aq. -.UNINDENT -.INDENT 0.0 -.TP -.B \-i/\-\-id id -specifies the client \(aqid\(aq, which will form the clientname -as \(aqclient.\(aq if clientname is not set. If \-n/\-name is -set, that takes precedence. -.sp -Also, global Ceph options are supported. -.UNINDENT -.SH CONFIGURATION PARAMETERS -.sp -Supported configuration parameters include: -.INDENT 0.0 -.IP \(bu 2 -\fBkeyring\fP the keyring file holding the key for \(aqclientname\(aq -.IP \(bu 2 -\fBpublic addr\fP ip:port to listen on (default 0.0.0.0:5000) -.IP \(bu 2 -\fBlog file\fP (usual Ceph default) -.IP \(bu 2 -\fBrestapi base url\fP the base URL to answer requests on (default /api/v0.1) -.IP \(bu 2 -\fBrestapi log level\fP critical, error, warning, info, debug (default warning) -.UNINDENT -.sp -Configuration parameters are searched in the standard order: -first in the section named \(aq\(aq, then \(aqclient\(aq, then \(aqglobal\(aq. -.sp - is either supplied by \-n/\-\-name, "client." where - is supplied by \-i/\-\-id, or \(aqclient.restapi\(aq if neither option -is present. -.sp -A single\-threaded server will run on \fBpublic addr\fP if the ceph\-rest\-api -executed directly; otherwise, configuration is specified by the enclosing -WSGI web server. -.SH COMMANDS -.sp -Commands are submitted with HTTP GET requests (for commands that -primarily return data) or PUT (for commands that affect cluster state). -HEAD and OPTIONS are also supported. Standard HTTP status codes -are returned. -.sp -For commands that return bulk data, the request can include -Accept: application/json or Accept: application/xml to select the -desired structured output, or you may use a .json or .xml addition -to the requested PATH. Parameters are supplied as query parameters -in the request; for parameters that take more than one value, repeat -the key=val construct. 
For instance, to remove OSDs 2 and 3, -send a PUT request to \fBosd/rm?ids=2&ids=3\fP\&. -.SH DISCOVERY -.sp -Human\-readable discovery of supported commands and parameters, along -with a small description of each command, is provided when the requested -path is incomplete/partially matching. Requesting / will redirect to -the value of \fBrestapi base url\fP, and that path will give a full list -of all known commands. -For example, requesting \fBapi/vX.X/mon\fP will return the list of API calls for -monitors \- \fBapi/vX.X/osd\fP will return the list of API calls for OSD and so on. -.sp -The command set is very similar to the commands -supported by the \fBceph\fP tool. One notable exception is that the -\fBceph pg \fP style of commands is supported here -as \fBtell//command?args\fP\&. -.SH DEPLOYMENT AS WSGI APPLICATION -.sp -When deploying as WSGI application (say, with Apache/mod_wsgi, -or nginx/uwsgi, or gunicorn, etc.), use the \fBceph_rest_api.py\fP module -(\fBceph\-rest\-api\fP is a thin layer around this module). The standalone web -server is of course not used, so address/port configuration is done in -the WSGI server. Use a python .wsgi module or the equivalent to call -\fBapp = generate_app(conf, cluster, clientname, clientid, args)\fP where: -.INDENT 0.0 -.IP \(bu 2 -conf is as \-c/\-\-conf above -.IP \(bu 2 -cluster is as \-\-cluster above -.IP \(bu 2 -clientname, \-n/\-\-name -.IP \(bu 2 -clientid, \-i/\-\-id, and -.IP \(bu 2 -args are any other generic Ceph arguments -.UNINDENT -.sp -When app is returned, it will have attributes \(aqceph_addr\(aq and \(aqceph_port\(aq -set to what the address and port are in the Ceph configuration; -those may be used for the server, or ignored. -.sp -Any errors reading configuration or connecting to the cluster cause an -exception to be raised; see your WSGI server documentation for how to -see those messages in case of problem. -.SH AVAILABILITY -.sp -\fBceph\-rest\-api\fP is part of the Ceph distributed storage system. Please refer to the Ceph documentation at -\fI\%http://ceph.com/docs\fP for more information. -.SH SEE ALSO -.sp -\fBceph\fP(8) -.SH COPYRIGHT -2010-2014, Inktank Storage, Inc. and contributors. Licensed under Creative Commons BY-SA -.\" Generated by docutils manpage writer. -. diff --git a/man/ceph-run.8 b/man/ceph-run.8 deleted file mode 100644 index 0baa632b51547..0000000000000 --- a/man/ceph-run.8 +++ /dev/null @@ -1,89 +0,0 @@ -.\" Man page generated from reStructuredText. -. -.TH "CEPH-RUN" "8" "January 12, 2014" "dev" "Ceph" -.SH NAME -ceph-run \- restart daemon on core dump -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. 
nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -.SH SYNOPSIS -.nf -\fBceph\-run\fP \fIcommand\fP ... -.fi -.sp -.SH DESCRIPTION -.sp -\fBceph\-run\fP is a simple wrapper that will restart a daemon if it exits -with a signal indicating it crashed and possibly core dumped (that is, -signals 3, 4, 5, 6, 8, or 11). -.sp -The command should run the daemon in the foreground. For Ceph daemons, -that means the \fB\-f\fP option. -.SH OPTIONS -.sp -None -.SH AVAILABILITY -.sp -\fBceph\-run\fP is part of the Ceph distributed storage system. Please refer to -the Ceph documentation at \fI\%http://ceph.com/docs\fP for more information. -.SH SEE ALSO -.sp -\fBceph\fP(8), -\fBceph\-mon\fP(8), -\fBceph\-mds\fP(8), -\fBceph\-osd\fP(8) -.SH COPYRIGHT -2010-2014, Inktank Storage, Inc. and contributors. Licensed under Creative Commons BY-SA -.\" Generated by docutils manpage writer. -. diff --git a/man/ceph-syn.8 b/man/ceph-syn.8 deleted file mode 100644 index 6b12a09be28eb..0000000000000 --- a/man/ceph-syn.8 +++ /dev/null @@ -1,148 +0,0 @@ -.\" Man page generated from reStructuredText. -. -.TH "CEPH-SYN" "8" "January 12, 2014" "dev" "Ceph" -.SH NAME -ceph-syn \- ceph synthetic workload generator -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -.SH SYNOPSIS -.nf -\fBceph\-syn\fP [ \-m \fImonaddr\fP:\fIport\fP ] \-\-syn \fIcommand\fP \fI\&...\fP -.fi -.sp -.SH DESCRIPTION -.sp -\fBceph\-syn\fP is a simple synthetic workload generator for the Ceph -distributed file system. It uses the userspace client library to -generate simple workloads against a currently running file system. The -file system need not be mounted via ceph\-fuse(8) or the kernel client. -.sp -One or more \fB\-\-syn\fP command arguments specify the particular -workload, as documented below. -.SH OPTIONS -.INDENT 0.0 -.TP -.B \-d -Detach from console and daemonize after startup. 
-.UNINDENT -.INDENT 0.0 -.TP -.B \-c ceph.conf, \-\-conf=ceph.conf -Use \fIceph.conf\fP configuration file instead of the default -\fB/etc/ceph/ceph.conf\fP to determine monitor addresses during -startup. -.UNINDENT -.INDENT 0.0 -.TP -.B \-m monaddress[:port] -Connect to specified monitor (instead of looking through -\fBceph.conf\fP). -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-num_client num -Run num different clients, each in a separate thread. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-syn workloadspec -Run the given workload. May be specified as many times as -needed. Workloads will normally run sequentially. -.UNINDENT -.SH WORKLOADS -.sp -Each workload should be preceded by \fB\-\-syn\fP on the command -line. This is not a complete list. -.INDENT 0.0 -.TP -.B \fBmksnap\fP \fIpath\fP \fIsnapname\fP -Create a snapshot called \fIsnapname\fP on \fIpath\fP\&. -.TP -.B \fBrmsnap\fP \fIpath\fP \fIsnapname\fP -Delete snapshot called \fIsnapname\fP on \fIpath\fP\&. -.TP -.B \fBrmfile\fP \fIpath\fP -Delete/unlink \fIpath\fP\&. -.TP -.B \fBwritefile\fP \fIsizeinmb\fP \fIblocksize\fP -Create a file, named after our client id, that is \fIsizeinmb\fP MB by -writing \fIblocksize\fP chunks. -.TP -.B \fBreadfile\fP \fIsizeinmb\fP \fIblocksize\fP -Read a file, named after our client id, that is \fIsizeinmb\fP MB by -reading \fIblocksize\fP chunks. -.TP -.B \fBrw\fP \fIsizeinmb\fP \fIblocksize\fP -Write a file, then read it back, as above. -.TP -.B \fBmakedirs\fP \fInumsubdirs\fP \fInumfiles\fP \fIdepth\fP -Create a hierarchy of directories that is \fIdepth\fP levels deep. Give -each directory \fInumsubdirs\fP subdirectories and \fInumfiles\fP files. -.TP -.B \fBwalk\fP -Recursively walk the file system (like find). -.UNINDENT -.SH AVAILABILITY -.sp -\fBceph\-syn\fP is part of the Ceph distributed storage system. Please refer to -the Ceph documentation at \fI\%http://ceph.com/docs\fP for more information. -.SH SEE ALSO -.sp -\fBceph\fP(8), -\fBceph\-fuse\fP(8) -.SH COPYRIGHT -2010-2014, Inktank Storage, Inc. and contributors. Licensed under Creative Commons BY-SA -.\" Generated by docutils manpage writer. -. diff --git a/man/ceph.8 b/man/ceph.8 deleted file mode 100644 index 9bb903c07c0b5..0000000000000 --- a/man/ceph.8 +++ /dev/null @@ -1,162 +0,0 @@ -.\" Man page generated from reStructuredText. -. -.TH "CEPH" "8" "January 12, 2014" "dev" "Ceph" -.SH NAME -ceph \- ceph file system control utility -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. 
RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -.SH SYNOPSIS -.nf -\fBceph\fP [ \-m \fImonaddr\fP ] [ \-w | \fIcommand\fP ... ] -.fi -.sp -.SH DESCRIPTION -.sp -\fBceph\fP is a control utility for communicating with the monitor -cluster of a running Ceph distributed storage system. -.sp -There are three basic modes of operation. -.SS Interactive mode -.sp -To start in interactive mode, no arguments are necessary. Control\-d or -\(aqquit\(aq will exit. -.SS Watch mode -.sp -Watch mode shows cluster state changes as they occur. For example: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -ceph \-w -.ft P -.fi -.UNINDENT -.UNINDENT -.SS Command line mode -.sp -Finally, to send a single instruction to the monitor cluster (and wait -for a response), the command can be specified on the command line. -.SH OPTIONS -.INDENT 0.0 -.TP -.B \-i infile -will specify an input file to be passed along as a payload with the -command to the monitor cluster. This is only used for specific -monitor commands. -.UNINDENT -.INDENT 0.0 -.TP -.B \-o outfile -will write any payload returned by the monitor cluster with its -reply to outfile. Only specific monitor commands (e.g. osd getmap) -return a payload. -.UNINDENT -.INDENT 0.0 -.TP -.B \-c ceph.conf, \-\-conf=ceph.conf -Use ceph.conf configuration file instead of the default -/etc/ceph/ceph.conf to determine monitor addresses during startup. -.UNINDENT -.INDENT 0.0 -.TP -.B \-m monaddress[:port] -Connect to specified monitor (instead of looking through ceph.conf). -.UNINDENT -.SH EXAMPLES -.sp -To grab a copy of the current OSD map: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -ceph \-m 1.2.3.4:6789 osd getmap \-o osdmap -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -To get a dump of placement group (PG) state: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -ceph pg dump \-o pg.txt -.ft P -.fi -.UNINDENT -.UNINDENT -.SH MONITOR COMMANDS -.sp -A more complete summary of commands understood by the monitor cluster can be found in the -online documentation, at -.INDENT 0.0 -.INDENT 3.5 -\fI\%http://ceph.com/docs/master/rados/operations/control\fP -.UNINDENT -.UNINDENT -.SH AVAILABILITY -.sp -\fBceph\fP is part of the Ceph distributed storage system. Please refer to the Ceph documentation at -\fI\%http://ceph.com/docs\fP for more information. -.SH SEE ALSO -.sp -\fBceph\fP(8), -.SH COPYRIGHT -2010-2014, Inktank Storage, Inc. and contributors. Licensed under Creative Commons BY-SA -.\" Generated by docutils manpage writer. -. diff --git a/man/ceph_selinux.8 b/man/ceph_selinux.8 new file mode 100644 index 0000000000000..de74807c8ed87 --- /dev/null +++ b/man/ceph_selinux.8 @@ -0,0 +1,324 @@ +.TH "ceph_selinux" "8" "15-06-17" "ceph" "SELinux Policy ceph" +.SH "NAME" +ceph_selinux \- Security Enhanced Linux Policy for the ceph processes +.SH "DESCRIPTION" + +Security-Enhanced Linux secures the ceph processes via flexible mandatory access control. + +The ceph processes execute with the ceph_t SELinux type. You can check if you have these processes running by executing the \fBps\fP command with the \fB\-Z\fP qualifier. + +For example: + +.B ps -eZ | grep ceph_t + + +.SH "ENTRYPOINTS" + +The ceph_t SELinux type can be entered via the \fBceph_exec_t\fP file type. 
+ +The default entrypoint paths for the ceph_t domain are the following: + +/usr/bin/ceph-mon, /usr/bin/ceph-mds, /usr/bin/ceph-osd +.SH PROCESS TYPES +SELinux defines process types (domains) for each process running on the system. +.PP +You can see the context of a process using the \fB\-Z\fP option to \fBps\fP\&. +.PP +Policy governs the access confined processes have to files. +SELinux ceph policy is very flexible, allowing users to set up their ceph processes in as secure a manner as possible. +.PP +The following process types are defined for ceph: + +.EX +.B ceph_t +.EE +.PP +Note: +.B semanage permissive -a ceph_t +can be used to make the process type ceph_t permissive. SELinux does not deny access to permissive process types, but the AVC (SELinux denials) messages are still generated. + +.SH BOOLEANS +SELinux policy is customizable based on least access required. ceph policy is extremely flexible and has several booleans that allow you to manipulate the policy and run ceph with the tightest access possible. + + +.PP +If you want to allow users to resolve user passwd entries directly from ldap rather than using an sssd server, you must turn on the authlogin_nsswitch_use_ldap boolean. Disabled by default. + +.EX +.B setsebool -P authlogin_nsswitch_use_ldap 1 + +.EE + +.PP +If you want to allow all daemons to write corefiles to /, you must turn on the daemons_dump_core boolean. Disabled by default. + +.EX +.B setsebool -P daemons_dump_core 1 + +.EE + +.PP +If you want to enable cluster mode for daemons, you must turn on the daemons_enable_cluster_mode boolean. Disabled by default. + +.EX +.B setsebool -P daemons_enable_cluster_mode 1 + +.EE + +.PP +If you want to allow all daemons to use tcp wrappers, you must turn on the daemons_use_tcp_wrapper boolean. Disabled by default. + +.EX +.B setsebool -P daemons_use_tcp_wrapper 1 + +.EE + +.PP +If you want to allow all daemons the ability to read/write terminals, you must turn on the daemons_use_tty boolean. Disabled by default. + +.EX +.B setsebool -P daemons_use_tty 1 + +.EE + +.PP +If you want to deny any process from ptracing or debugging any other processes, you must turn on the deny_ptrace boolean. Disabled by default. + +.EX +.B setsebool -P deny_ptrace 1 + +.EE + +.PP +If you want to allow all domains to use other domains\(aq file descriptors, you must turn on the domain_fd_use boolean. Enabled by default. + +.EX +.B setsebool -P domain_fd_use 1 + +.EE + +.PP +If you want to allow all domains to have the kernel load modules, you must turn on the domain_kernel_load_modules boolean. Disabled by default. + +.EX +.B setsebool -P domain_kernel_load_modules 1 + +.EE + +.PP +If you want to allow all domains to execute in fips_mode, you must turn on the fips_mode boolean. Enabled by default. + +.EX +.B setsebool -P fips_mode 1 + +.EE + +.PP +If you want to enable reading of urandom for all domains, you must turn on the global_ssp boolean. Disabled by default. + +.EX +.B setsebool -P global_ssp 1 + +.EE + +.PP +If you want to allow confined applications to run with kerberos, you must turn on the kerberos_enabled boolean. Enabled by default. + +.EX +.B setsebool -P kerberos_enabled 1 + +.EE + +.PP +If you want to allow the system to run with NIS, you must turn on the nis_enabled boolean. Disabled by default. + +.EX +.B setsebool -P nis_enabled 1 + +.EE + +.PP +If you want to allow confined applications to use nscd shared memory, you must turn on the nscd_use_shm boolean. Enabled by default.
+ +.EX +.B setsebool -P nscd_use_shm 1 + +.EE + +.SH "MANAGED FILES" + +The SELinux process type ceph_t can manage files labeled with the following file types. The paths listed are the default paths for these file types. Note: the process's UID still needs to have DAC permissions. + +.br +.B ceph_log_t + + /var/log/ceph(/.*)? +.br + +.br +.B ceph_var_lib_t + + /var/lib/ceph(/.*)? +.br + +.br +.B ceph_var_run_t + + /var/run/ceph(/.*)? +.br + +.br +.B cluster_conf_t + + /etc/cluster(/.*)? +.br + +.br +.B cluster_var_lib_t + + /var/lib/pcsd(/.*)? +.br + /var/lib/cluster(/.*)? +.br + /var/lib/openais(/.*)? +.br + /var/lib/pengine(/.*)? +.br + /var/lib/corosync(/.*)? +.br + /usr/lib/heartbeat(/.*)? +.br + /var/lib/heartbeat(/.*)? +.br + /var/lib/pacemaker(/.*)? +.br + +.br +.B cluster_var_run_t + + /var/run/crm(/.*)? +.br + /var/run/cman_.* +.br + /var/run/rsctmp(/.*)? +.br + /var/run/aisexec.* +.br + /var/run/heartbeat(/.*)? +.br + /var/run/cpglockd\.pid +.br + /var/run/corosync\.pid +.br + /var/run/rgmanager\.pid +.br + /var/run/cluster/rgmanager\.sk +.br + +.br +.B root_t + + / +.br + /initrd +.br + +.SH FILE CONTEXTS +SELinux requires files to have an extended attribute to define the file type. +.PP +You can see the context of a file using the \fB\-Z\fP option to \fBls\fP +.PP +Policy governs the access confined processes have to these files. +SELinux ceph policy is very flexible, allowing users to set up their ceph processes in as secure a manner as possible. +.PP + +.PP +.B STANDARD FILE CONTEXT + +SELinux defines the file context types for ceph. If you want to store files with these types in different paths, you need to execute the semanage command to specify the alternate labeling and then use restorecon to put the labels on disk. + +.B semanage fcontext -a -t ceph_var_run_t '/srv/myceph_content(/.*)?' +.br +.B restorecon -R -v /srv/myceph_content + +Note: SELinux often uses regular expressions to specify labels that match multiple files. + +.I The following file types are defined for ceph: + + +.EX +.PP +.B ceph_exec_t +.EE + +- Set files with the ceph_exec_t type, if you want to transition an executable to the ceph_t domain. + +.br +.TP 5 +Paths: +/usr/bin/ceph-mon, /usr/bin/ceph-mds, /usr/bin/ceph-osd + +.EX +.PP +.B ceph_initrc_exec_t +.EE + +- Set files with the ceph_initrc_exec_t type, if you want to transition an executable to the ceph_initrc_t domain. + + +.EX +.PP +.B ceph_log_t +.EE + +- Set files with the ceph_log_t type, if you want to treat the data as ceph log data, usually stored under the /var/log directory. + + +.EX +.PP +.B ceph_var_lib_t +.EE + +- Set files with the ceph_var_lib_t type, if you want to store the ceph files under the /var/lib directory. + + +.EX +.PP +.B ceph_var_run_t +.EE + +- Set files with the ceph_var_run_t type, if you want to store the ceph files under the /run or /var/run directory. + + +.PP +Note: File context can be temporarily modified with the chcon command. If you want to permanently change the file context, you need to use the +.B semanage fcontext +command. This will modify the SELinux labeling database. You will need to use +.B restorecon +to apply the labels. + +.SH "COMMANDS" +.B semanage fcontext +can also be used to manipulate default file context mappings. +.PP +.B semanage permissive +can also be used to manipulate whether or not a process type is permissive. +.PP +.B semanage module +can also be used to enable/disable/install/remove policy modules.
+ +.B semanage boolean +can also be used to manipulate the booleans + +.PP +.B system-config-selinux +is a GUI tool available to customize SELinux policy settings. + +.SH AUTHOR +This manual page was auto-generated using +.B "sepolicy manpage". + +.SH "SEE ALSO" +selinux(8), ceph(8), semanage(8), restorecon(8), chcon(1), sepolicy(8) +, setsebool(8) \ No newline at end of file diff --git a/man/cephfs.8 b/man/cephfs.8 deleted file mode 100644 index 2fa558390a35a..0000000000000 --- a/man/cephfs.8 +++ /dev/null @@ -1,144 +0,0 @@ -.\" Man page generated from reStructuredText. -. -.TH "CEPHFS" "8" "January 12, 2014" "dev" "Ceph" -.SH NAME -cephfs \- ceph file system options utility -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -.SH SYNOPSIS -.nf -\fBcephfs\fP [ \fIpath\fP \fIcommand\fP \fIoptions\fP ] -.fi -.sp -.SH DESCRIPTION -.sp -\fBcephfs\fP is a control utility for accessing and manipulating file -layout and location data in the Ceph distributed storage system. -.sp -Choose one of the following three commands: -.INDENT 0.0 -.IP \(bu 2 -\fBshow_layout\fP View the layout information on a file or directory -.IP \(bu 2 -\fBset_layout\fP Set the layout information on a file or directory -.IP \(bu 2 -\fBshow_location\fP View the location information on a file -.UNINDENT -.SH OPTIONS -.sp -Your applicable options differ depending on whether you are setting or viewing layout/location. -.SS Viewing options: -.INDENT 0.0 -.TP -.B \-l \-\-offset -Specify an offset for which to retrieve location data -.UNINDENT -.SS Setting options: -.INDENT 0.0 -.TP -.B \-u \-\-stripe_unit -Set the size of each stripe -.UNINDENT -.INDENT 0.0 -.TP -.B \-c \-\-stripe_count -Set the number of objects to stripe across -.UNINDENT -.INDENT 0.0 -.TP -.B \-s \-\-object_size -Set the size of the objects to stripe across -.UNINDENT -.INDENT 0.0 -.TP -.B \-p \-\-pool -Set the pool (by numeric value, not name!) to use -.UNINDENT -.INDENT 0.0 -.TP -.B \-o \-\-osd -Set the preferred OSD to use as the primary -.UNINDENT -.SH LIMITATIONS -.sp -When setting layout data, the specified object size must evenly divide -by the specified stripe unit. Any parameters you don\(aqt set -explicitly are left at the system defaults. 
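To illustrate these rules, a hypothetical session against a freshly created, still-empty file on a mounted CephFS (the mount point and sizes are made up; the 4 MB object size is an exact multiple of the 1 MB stripe unit, satisfying the divisibility requirement above):

    # set the layout before any data is written, then confirm it
    cephfs /mnt/ceph/newfile set_layout -u 1048576 -c 8 -s 4194304
    cephfs /mnt/ceph/newfile show_layout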
-.sp -Obviously setting the layout of a file and a directory means different -things. Setting the layout of a file specifies exactly how to place -the individual file. This must be done before writing \fIany\fP data to -it. Truncating a file does not allow you to change the layout either. -.sp -Setting the layout of a directory sets the "default layout", which is -used to set the file layouts on any files subsequently created in the -directory (or any subdirectory). Pre\-existing files do not have their -layouts changed. -.sp -You\(aqll notice that the layout information allows you to specify a -preferred OSD for placement. This feature is unsupported and ignored -in modern versions of the Ceph servers; do not use it. -.SH AVAILABILITY -.sp -\fBcephfs\fP is part of the Ceph distributed storage system. Please refer -to the Ceph documentation at \fI\%http://ceph.com/docs\fP for more -information. -.SH SEE ALSO -.sp -\fBceph\fP(8) -.SH COPYRIGHT -2010-2014, Inktank Storage, Inc. and contributors. Licensed under Creative Commons BY-SA -.\" Generated by docutils manpage writer. -. diff --git a/man/conf.py b/man/conf.py new file mode 100644 index 0000000000000..2b24223b12dfb --- /dev/null +++ b/man/conf.py @@ -0,0 +1,59 @@ +import os + +project = u'Ceph' +copyright = u'2010-2014, Inktank Storage, Inc. and contributors. Licensed under Creative Commons BY-SA' +version = 'dev' +release = 'dev' + +exclude_patterns = ['**/.#*', '**/*~'] + + +def _get_description(fname, base): + with file(fname) as f: + one = None + while True: + line = f.readline().rstrip('\n') + if not line: + continue + if line.startswith(':') and line.endswith(':'): + continue + one = line + break + two = f.readline().rstrip('\n') + three = f.readline().rstrip('\n') + assert one == three + assert all(c=='=' for c in one) + name, description = two.split('--', 1) + assert name.strip() == base + return description.strip() + + +def _get_manpages(): + src_dir = os.path.dirname(__file__) + top_srcdir = os.path.dirname(src_dir) + man_dir = os.path.join(top_srcdir, 'doc', 'man') + sections = os.listdir(man_dir) + for section in sections: + section_dir = os.path.join(man_dir, section) + if not os.path.isdir(section_dir): + continue + for filename in os.listdir(section_dir): + base, ext = os.path.splitext(filename) + if ext != '.rst': + continue + if base == 'index': + continue + path = os.path.join(section_dir, filename) + description = _get_description(path, base) + yield ( + os.path.join(section, base), + base, + description, + '', + section, + ) + +man_pages = list(_get_manpages()) +# sphinx warns if no toc is found, so feed it with a random file +# which is also rendered in this run. +master_doc = '8/ceph' diff --git a/man/crushtool.8 b/man/crushtool.8 deleted file mode 100644 index bb518f7714d48..0000000000000 --- a/man/crushtool.8 +++ /dev/null @@ -1,433 +0,0 @@ -.\" Man page generated from reStructuredText. -. -.TH "CRUSHTOOL" "8" "January 12, 2014" "dev" "Ceph" -.SH NAME -crushtool \- CRUSH map manipulation tool -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. 
RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -.SH SYNOPSIS -.nf -\fBcrushtool\fP ( \-d \fImap\fP | \-c \fImap.txt\fP | \-\-build \-\-num_osds \fInumosds\fP -\fIlayer1\fP \fI\&...\fP | \-\-test ) [ \-o \fIoutfile\fP ] -.fi -.sp -.SH DESCRIPTION -.INDENT 0.0 -.TP -.B \fBcrushtool\fP is a utility that lets you create, compile, decompile -and test CRUSH map files. -.UNINDENT -.sp -CRUSH is a pseudo\-random data distribution algorithm that efficiently -maps input values (typically data objects) across a heterogeneous, -hierarchically structured device map. The algorithm was originally -described in detail in the following paper (although it has evolved -some since then): -.INDENT 0.0 -.INDENT 3.5 -\fI\%http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf\fP -.UNINDENT -.UNINDENT -.sp -The tool has four modes of operation. -.INDENT 0.0 -.TP -.B \-\-compile|\-c map.txt -will compile a plaintext map.txt into a binary map file. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-decompile|\-d map -will take the compiled map and decompile it into a plaintext source -file, suitable for editing. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-build \-\-num_osds {num\-osds} layer1 ... -will create map with the given layer structure. See below for a -detailed explanation. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-test -will perform a dry run of a CRUSH mapping for a range of input -object names. See below for a detailed explanation. -.UNINDENT -.sp -Unlike other Ceph tools, \fBcrushtool\fP does not accept generic options -such as \fB\-\-debug\-crush\fP from the command line. They can however be -provided via the CEPH_ARGS environment variable. For instance, to -silence all output from the CRUSH subsystem: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -CEPH_ARGS="\-\-debug\-crush 0" crushtool ... -.ft P -.fi -.UNINDENT -.UNINDENT -.SH RUNNING TESTS WITH --TEST -.sp -The test mode will use the input crush map ( as specified with \fB\-i -map\fP ) and perform a dry run of CRUSH mapping or random placement ( -if \fB\-\-simulate\fP is set ). On completion, two kinds of reports can be -created. The \fB\-\-show\-...\fP options output human readable information -on stderr. The \fB\-\-output\-csv\fP option creates CSV files that are -documented by the \fB\-\-help\-output\fP option. -.INDENT 0.0 -.TP -.B \-\-show\-statistics -for each rule display the mapping of each object. For instance: -.INDENT 7.0 -.INDENT 3.5 -.sp -.nf -.ft C -CRUSH rule 1 x 24 [11,6] -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -shows that object \fB24\fP is mapped to devices \fB[11,6]\fP by rule -\fB1\fP\&. At the end of the mapping details, a summary of the -distribution is displayed. 
For instance: -.INDENT 7.0 -.INDENT 3.5 -.sp -.nf -.ft C -rule 1 (metadata) num_rep 5 result size == 5: 1024/1024 -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -shows that rule \fB1\fP which is named \fBmetadata\fP successfully -mapped \fB1024\fP objects to \fBresult size == 5\fP devices when trying -to map them to \fBnum_rep 5\fP replicas. When it fails to provide the -required mapping, presumably because the number of \fBtries\fP must -be increased, a breakdown of the failures is displays. For instance: -.INDENT 7.0 -.INDENT 3.5 -.sp -.nf -.ft C -rule 1 (metadata) num_rep 10 result size == 8: 4/1024 -rule 1 (metadata) num_rep 10 result size == 9: 93/1024 -rule 1 (metadata) num_rep 10 result size == 10: 927/1024 -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -shows that although \fBnum_rep 10\fP replicas were required, \fB4\fP -out of \fB1024\fP objects ( \fB4/1024\fP ) were mapped to \fBresult size -== 8\fP devices only. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-show\-bad\-mappings -display which object failed to be mapped to the required number of -devices. For instance: -.INDENT 7.0 -.INDENT 3.5 -.sp -.nf -.ft C -bad mapping rule 1 x 781 num_rep 7 result [8,10,2,11,6,9] -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -shows that when rule \fB1\fP was required to map \fB7\fP devices, it -could only map six : \fB[8,10,2,11,6,9]\fP\&. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-show\-utilization -display the expected and actual utilisation for each device, for -each number of replicas. For instance: -.INDENT 7.0 -.INDENT 3.5 -.sp -.nf -.ft C -device 0: stored : 951 expected : 853.333 -device 1: stored : 963 expected : 853.333 -\&... -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -shows that device \fB0\fP stored \fB951\fP objects and was expected to store \fB853\fP\&. -Implies \fB\-\-show\-statistics\fP\&. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-show\-utilization\-all -displays the same as \fB\-\-show\-utilization\fP but does not suppress -output when the weight of a device is zero. -Implies \fB\-\-show\-statistics\fP\&. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-show\-choose\-tries -display how many attempts were needed to find a device mapping. -For instance: -.INDENT 7.0 -.INDENT 3.5 -.sp -.nf -.ft C -0: 95224 -1: 3745 -2: 2225 -\&.. -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -shows that \fB95224\fP mappings succeeded without retries, \fB3745\fP -mappings succeeded with one attempts, etc. There are as many rows -as the value of the \fB\-\-set\-choose\-total\-tries\fP option. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-output\-csv -create CSV files (in the current directory) containing information -documented by \fB\-\-help\-output\fP\&. The files are named after the rule -used when collecting the statistics. For instance, if the rule -metadata is used, the CSV files will be: -.INDENT 7.0 -.INDENT 3.5 -.sp -.nf -.ft C -metadata\-absolute_weights.csv -metadata\-device_utilization.csv -\&... -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -The first line of the file shortly explains the column layout. For -instance: -.INDENT 7.0 -.INDENT 3.5 -.sp -.nf -.ft C -metadata\-absolute_weights.csv -Device ID, Absolute Weight -0,1 -\&... -.ft P -.fi -.UNINDENT -.UNINDENT -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-output\-name NAME -prepend \fBNAME\fP to the file names generated when \fB\-\-output\-csv\fP -is specified. For instance \fB\-\-output\-name FOO\fP will create -files: -.INDENT 7.0 -.INDENT 3.5 -.sp -.nf -.ft C -FOO\-metadata\-absolute_weights.csv -FOO\-metadata\-device_utilization.csv -\&... 
-.ft P -.fi -.UNINDENT -.UNINDENT -.UNINDENT -.sp -The \fB\-\-set\-...\fP options can be used to modify the tunables of the -input crush map. The input crush map is modified in -memory. For example: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -$ crushtool \-i mymap \-\-test \-\-show\-bad\-mappings -bad mapping rule 1 x 781 num_rep 7 result [8,10,2,11,6,9] -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -could be fixed by increasing the \fBchoose\-total\-tries\fP as follows: -.INDENT 0.0 -.INDENT 3.5 -.INDENT 0.0 -.TP -.B $ crushtool \-i mymap \-\-test -\-\-show\-bad\-mappings \-\-set\-choose\-total\-tries 500 -.UNINDENT -.UNINDENT -.UNINDENT -.SH BUILDING A MAP WITH --BUILD -.sp -The build mode will generate hierarchical maps. The first argument -specifies the number of devices (leaves) in the CRUSH hierarchy. Each -layer describes how the layer (or devices) preceding it should be -grouped. -.sp -Each layer consists of: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -bucket ( uniform | list | tree | straw ) size -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -The \fBbucket\fP is the type of the buckets in the layer -(e.g. "rack"). Each bucket name will be built by appending a unique -number to the \fBbucket\fP string (e.g. "rack0", "rack1"...). -.sp -The second component is the type of bucket: \fBstraw\fP should be used -most of the time. -.sp -The third component is the maximum size of the bucket. A size of zero -means a bucket of infinite capacity. -.SH EXAMPLE -.sp -Suppose we have two rows with two racks each and 20 nodes per rack. Suppose -each node contains 4 storage devices for Ceph OSD Daemons. This configuration -allows us to deploy 320 Ceph OSD Daemons. Lets assume a 42U rack with 2U nodes, -leaving an extra 2U for a rack switch. -.sp -To reflect our hierarchy of devices, nodes, racks and rows, we would execute -the following: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -$ crushtool \-o crushmap \-\-build \-\-num_osds 320 \e - node straw 4 \e - rack straw 20 \e - row straw 2 \e - root straw 0 -# id weight type name reweight -\-87 320 root root -\-85 160 row row0 -\-81 80 rack rack0 -\-1 4 node node0 -0 1 osd.0 1 -1 1 osd.1 1 -2 1 osd.2 1 -3 1 osd.3 1 -\-2 4 node node1 -4 1 osd.4 1 -5 1 osd.5 1 -\&... -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -CRUSH rulesets are created so the generated crushmap can be -tested. They are the same rulesets as the one created by default when -creating a new Ceph cluster. They can be further edited with: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -# decompile -crushtool \-d crushmap \-o map.txt - -# edit -emacs map.txt - -# recompile -crushtool \-c map.txt \-o crushmap -.ft P -.fi -.UNINDENT -.UNINDENT -.SH AVAILABILITY -.sp -\fBcrushtool\fP is part of the Ceph distributed storage system. Please -refer to the Ceph documentation at \fI\%http://ceph.com/docs\fP for more -information. -.SH SEE ALSO -.sp -\fBceph\fP(8), -\fBosdmaptool\fP(8), -.SH AUTHORS -.sp -John Wilkins, Sage Weil, Loic Dachary -.SH COPYRIGHT -2010-2014, Inktank Storage, Inc. and contributors. Licensed under Creative Commons BY-SA -.\" Generated by docutils manpage writer. -. diff --git a/man/librados-config.8 b/man/librados-config.8 deleted file mode 100644 index 48238fe1d1723..0000000000000 --- a/man/librados-config.8 +++ /dev/null @@ -1,94 +0,0 @@ -.\" Man page generated from reStructuredText. -. -.TH "LIBRADOS-CONFIG" "8" "January 12, 2014" "dev" "Ceph" -.SH NAME -librados-config \- display information about librados -. -.nr rst2man-indent-level 0 -. 
-.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -.SH SYNOPSIS -.nf -\fBlibrados\-config\fP [ \-\-version ] [ \-\-vernum ] -.fi -.sp -.SH DESCRIPTION -.INDENT 0.0 -.TP -.B \fBlibrados\-config\fP is a utility that displays information about the -installed \fBlibrados\fP\&. -.UNINDENT -.SH OPTIONS -.INDENT 0.0 -.TP -.B \-\-version -Display \fBlibrados\fP version -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-vernum -Display the \fBlibrados\fP version code -.UNINDENT -.SH AVAILABILITY -.sp -\fBlibrados\-config\fP is part of the Ceph distributed storage system. -Please refer to the Ceph documentation at \fI\%http://ceph.com/docs\fP for -more information. -.SH SEE ALSO -.sp -\fBceph\fP(8), -\fBrados\fP(8) -.SH COPYRIGHT -2010-2014, Inktank Storage, Inc. and contributors. Licensed under Creative Commons BY-SA -.\" Generated by docutils manpage writer. -. diff --git a/man/monmaptool.8 b/man/monmaptool.8 deleted file mode 100644 index 1bd9f306ac9a8..0000000000000 --- a/man/monmaptool.8 +++ /dev/null @@ -1,188 +0,0 @@ -.\" Man page generated from reStructuredText. -. -.TH "MONMAPTOOL" "8" "January 12, 2014" "dev" "Ceph" -.SH NAME -monmaptool \- ceph monitor cluster map manipulation tool -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. 
RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -.SH SYNOPSIS -.nf -\fBmonmaptool\fP \fImapfilename\fP [ \-\-clobber ] [ \-\-print ] [ \-\-create ] -[ \-\-add \fIip\fP:\fIport\fP \fI\&...\fP ] [ \-\-rm \fIip\fP:\fIport\fP \fI\&...\fP ] -.fi -.sp -.SH DESCRIPTION -.sp -\fBmonmaptool\fP is a utility to create, view, and modify a monitor -cluster map for the Ceph distributed storage system. The monitor map -specifies the only fixed addresses in the Ceph distributed system. -All other daemons bind to arbitrary addresses and register themselves -with the monitors. -.sp -When creating a map with \-\-create, a new monitor map with a new, -random UUID will be created. It should be followed by one or more -monitor addresses. -.sp -The default Ceph monitor port is 6789. -.SH OPTIONS -.INDENT 0.0 -.TP -.B \-\-print -will print a plaintext dump of the map, after any modifications are -made. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-clobber -will allow monmaptool to overwrite mapfilename if changes are made. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-create -will create a new monitor map with a new UUID (and with it, a new, -empty Ceph file system). -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-generate -generate a new monmap based on the values on the command line or specified -in the ceph configuration. This is, in order of preference, -.INDENT 7.0 -.INDENT 3.5 -.INDENT 0.0 -.IP 1. 3 -\fB\-\-monmap filename\fP to specify a monmap to load -.IP 2. 3 -\fB\-\-mon\-host \(aqhost1,ip2\(aq\fP to specify a list of hosts or ip addresses -.IP 3. 3 -\fB[mon.foo]\fP sections containing \fBmon addr\fP settings in the config -.UNINDENT -.UNINDENT -.UNINDENT -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-filter\-initial\-members -filter the initial monmap by applying the \fBmon initial members\fP -setting. Monitors not present in that list will be removed, and -initial members not present in the map will be added with dummy -addresses. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-add name ip:port -will add a monitor with the specified ip:port to the map. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-rm name -will remove the monitor with the specified ip:port from the map. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-fsid uuid -will set the fsid to the given uuid. If not specified with \-\-create, a random fsid will be generated. -.UNINDENT -.SH EXAMPLE -.sp -To create a new map with three monitors (for a fresh Ceph file system): -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -monmaptool \-\-create \-\-add mon.a 192.168.0.10:6789 \-\-add mon.b 192.168.0.11:6789 \e - \-\-add mon.c 192.168.0.12:6789 \-\-clobber monmap -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -To display the contents of the map: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -monmaptool \-\-print monmap -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -To replace one monitor: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -monmaptool \-\-rm mon.a \-\-add mon.a 192.168.0.9:6789 \-\-clobber monmap -.ft P -.fi -.UNINDENT -.UNINDENT -.SH AVAILABILITY -.sp -\fBmonmaptool\fP is part of the Ceph distributed storage system. Please -refer to the Ceph documentation at \fI\%http://ceph.com/docs\fP for more -information. -.SH SEE ALSO -.sp -\fBceph\fP(8), -\fBcrushtool\fP(8), -.SH COPYRIGHT -2010-2014, Inktank Storage, Inc. and contributors. Licensed under Creative Commons BY-SA -.\" Generated by docutils manpage writer. -. 
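As a sketch of the --generate and --filter-initial-members flow described in the page above (assuming a ceph.conf that defines mon host and mon initial members; the map file name is illustrative, and exact flag combinations may differ between releases):

    # build a monmap from the configuration, trim it to the initial members, inspect it
    monmaptool --create --generate -c /etc/ceph/ceph.conf initial.monmap
    monmaptool --filter-initial-members -c /etc/ceph/ceph.conf --clobber initial.monmap
    monmaptool --print initial.monmap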
diff --git a/man/mount.ceph.8 b/man/mount.ceph.8 deleted file mode 100644 index c3549c084e212..0000000000000 --- a/man/mount.ceph.8 +++ /dev/null @@ -1,257 +0,0 @@ -.\" Man page generated from reStructuredText. -. -.TH "MOUNT.CEPH" "8" "January 12, 2014" "dev" "Ceph" -.SH NAME -mount.ceph \- mount a ceph file system -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -.SH SYNOPSIS -.nf -\fBmount.ceph\fP \fImonaddr1\fP[,\fImonaddr2\fP,...]:/[\fIsubdir\fP] \fIdir\fP [ -\-o \fIoptions\fP ] -.fi -.sp -.SH DESCRIPTION -.sp -\fBmount.ceph\fP is a simple helper for mounting the Ceph file system on -a Linux host. It serves to resolve monitor hostname(s) into IP -addresses and read authentication keys from disk; the Linux kernel -client component does most of the real work. In fact, it is possible -to mount a non\-authenticated Ceph file system without mount.ceph by -specifying monitor address(es) by IP: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -mount \-t ceph 1.2.3.4:/ mountpoint -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -Each monitor address monaddr takes the form host[:port]. If the port -is not specified, the Ceph default of 6789 is assumed. -.sp -Multiple monitor addresses can be separated by commas. Only one -responsible monitor is needed to successfully mount; the client will -learn about all monitors from any responsive monitor. However, it is a -good idea to specify more than one in case one happens to be down at -the time of mount. -.sp -A subdirectory subdir may be specified if a subset of the file system -is to be mounted. -.sp -Mount helper application conventions dictate that the first two -options are device to be mounted and destination path. Options must be -passed only after these fixed arguments. -.SH OPTIONS -.INDENT 0.0 -.TP -.B \fBwsize\fP -int, max write size. 
Default: none (writeback uses smaller of wsize -and stripe unit) -.TP -.B \fBrsize\fP -int (bytes), max readahead, multiple of 1024, Default: 524288 -(512*1024) -.TP -.B \fBosdtimeout\fP -int (seconds), Default: 60 -.TP -.B \fBosdkeepalivetimeout\fP -int, Default: 5 -.TP -.B \fBmount_timeout\fP -int (seconds), Default: 60 -.TP -.B \fBosd_idle_ttl\fP -int (seconds), Default: 60 -.TP -.B \fBcaps_wanted_delay_min\fP -int, cap release delay, Default: 5 -.TP -.B \fBcaps_wanted_delay_max\fP -int, cap release delay, Default: 60 -.TP -.B \fBcap_release_safety\fP -int, Default: calculated -.TP -.B \fBreaddir_max_entries\fP -int, Default: 1024 -.TP -.B \fBreaddir_max_bytes\fP -int, Default: 524288 (512*1024) -.TP -.B \fBwrite_congestion_kb\fP -int (kb), max writeback in flight. scale with available -memory. Default: calculated from available memory -.TP -.B \fBsnapdirname\fP -string, set the name of the hidden snapdir. Default: .snap -.TP -.B \fBname\fP -RADOS user to authenticate as when using cephx. Default: guest -.TP -.B \fBsecret\fP -secret key for use with cephx. This option is insecure because it exposes -the secret on the command line. To avoid this, use the secretfile option. -.TP -.B \fBsecretfile\fP -path to file containing the secret key to use with cephx -.TP -.B \fBip\fP -my ip -.TP -.B \fBnoshare\fP -create a new client instance, instead of sharing an existing -instance of a client mounting the same cluster -.TP -.B \fBdirstat\fP -funky \fIcat dirname\fP for stats, Default: off -.TP -.B \fBnodirstat\fP -no funky \fIcat dirname\fP for stats -.TP -.B \fBrbytes\fP -Report the recursive size of the directory contents for st_size on -directories. Default: on -.TP -.B \fBnorbytes\fP -Do not report the recursive size of the directory contents for -st_size on directories. -.TP -.B \fBnocrc\fP -no data crc on writes -.TP -.B \fBnoasyncreaddir\fP -no dcache readdir -.UNINDENT -.SH EXAMPLES -.sp -Mount the full file system: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -mount.ceph monhost:/ /mnt/foo -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -If there are multiple monitors: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -mount.ceph monhost1,monhost2,monhost3:/ /mnt/foo -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -If \fBceph\-mon\fP(8) is running on a non\-standard -port: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -mount.ceph monhost1:7000,monhost2:7000,monhost3:7000:/ /mnt/foo -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -To mount only part of the namespace: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -mount.ceph monhost1:/some/small/thing /mnt/thing -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -Assuming mount.ceph(8) is installed properly, it should be -automatically invoked by mount(8) like so: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -mount \-t ceph monhost:/ /mnt/foo -.ft P -.fi -.UNINDENT -.UNINDENT -.SH AVAILABILITY -.sp -\fBmount.ceph\fP is part of the Ceph distributed storage system. Please -refer to the Ceph documentation at \fI\%http://ceph.com/docs\fP for more -information. -.SH SEE ALSO -.sp -\fBceph\-fuse\fP(8), -\fBceph\fP(8) -.SH COPYRIGHT -2010-2014, Inktank Storage, Inc. and contributors. Licensed under Creative Commons BY-SA -.\" Generated by docutils manpage writer. -. diff --git a/man/osdmaptool.8 b/man/osdmaptool.8 deleted file mode 100644 index cf3660bfd02be..0000000000000 --- a/man/osdmaptool.8 +++ /dev/null @@ -1,139 +0,0 @@ -.\" Man page generated from reStructuredText. -. -.TH "OSDMAPTOOL" "8" "January 12, 2014" "dev" "Ceph" -.SH NAME -osdmaptool \- ceph osd cluster map manipulation tool -. 
-.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -.SH SYNOPSIS -.nf -\fBosdmaptool\fP \fImapfilename\fP [\-\-print] [\-\-createsimple \fInumosd\fP -[\-\-pgbits \fIbitsperosd\fP ] ] [\-\-clobber] -.fi -.sp -.SH DESCRIPTION -.sp -\fBosdmaptool\fP is a utility that lets you create, view, and manipulate -OSD cluster maps from the Ceph distributed storage system. Notably, it -lets you extract the embedded CRUSH map or import a new CRUSH map. -.SH OPTIONS -.INDENT 0.0 -.TP -.B \-\-print -will simply make the tool print a plaintext dump of the map, after -any modifications are made. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-clobber -will allow osdmaptool to overwrite mapfilename if changes are made. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-import\-crush mapfile -will load the CRUSH map from mapfile and embed it in the OSD map. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-export\-crush mapfile -will extract the CRUSH map from the OSD map and write it to -mapfile. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-createsimple numosd [\-\-pgbits bitsperosd] -will create a relatively generic OSD map with the numosd devices. -If \-\-pgbits is specified, the initial placement group counts will -be set with bitsperosd bits per OSD. That is, the pg_num map -attribute will be set to numosd shifted by bitsperosd. -.UNINDENT -.SH EXAMPLE -.sp -To create a simple map with 16 devices: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -osdmaptool \-\-createsimple 16 osdmap \-\-clobber -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -To view the result: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -osdmaptool \-\-print osdmap -.ft P -.fi -.UNINDENT -.UNINDENT -.SH AVAILABILITY -.sp -\fBosdmaptool\fP is part of the Ceph distributed storage system. Please -refer to the Ceph documentation at \fI\%http://ceph.com/docs\fP for more -information. -.SH SEE ALSO -.sp -\fBceph\fP(8), -\fBcrushtool\fP(8), -.SH COPYRIGHT -2010-2014, Inktank Storage, Inc. and contributors. Licensed under Creative Commons BY-SA -.\" Generated by docutils manpage writer. -. diff --git a/man/rados.8 b/man/rados.8 deleted file mode 100644 index eea43572f8d2f..0000000000000 --- a/man/rados.8 +++ /dev/null @@ -1,267 +0,0 @@ -.\" Man page generated from reStructuredText. -. 
-.TH "RADOS" "8" "May 29, 2014" "dev" "Ceph" -.SH NAME -rados \- rados object storage utility -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -.SH SYNOPSIS -.nf -\fBrados\fP [ \-m \fImonaddr\fP ] [ mkpool | rmpool \fIfoo\fP ] [ \-p | \-\-pool -\fIpool\fP ] [ \-s | \-\-snap \fIsnap\fP ] [ \-i \fIinfile\fP ] [ \-o \fIoutfile\fP ] -\fIcommand\fP ... -.fi -.sp -.SH DESCRIPTION -.sp -\fBrados\fP is a utility for interacting with a Ceph object storage -cluster (RADOS), part of the Ceph distributed storage system. -.SH OPTIONS -.INDENT 0.0 -.TP -.B \-p pool, \-\-pool pool -Interact with the given pool. Required by most commands. -.UNINDENT -.INDENT 0.0 -.TP -.B \-s snap, \-\-snap snap -Read from the given pool snapshot. Valid for all pool\-specific read operations. -.UNINDENT -.INDENT 0.0 -.TP -.B \-i infile -will specify an input file to be passed along as a payload with the -command to the monitor cluster. This is only used for specific -monitor commands. -.UNINDENT -.INDENT 0.0 -.TP -.B \-o outfile -will write any payload returned by the monitor cluster with its -reply to outfile. Only specific monitor commands (e.g. osd getmap) -return a payload. -.UNINDENT -.INDENT 0.0 -.TP -.B \-c ceph.conf, \-\-conf=ceph.conf -Use ceph.conf configuration file instead of the default -/etc/ceph/ceph.conf to determine monitor addresses during startup. -.UNINDENT -.INDENT 0.0 -.TP -.B \-m monaddress[:port] -Connect to specified monitor (instead of looking through ceph.conf). -.UNINDENT -.SH GLOBAL COMMANDS -.INDENT 0.0 -.TP -.B \fBlspools\fP -List object pools -.TP -.B \fBdf\fP -Show utilization statistics, including disk usage (bytes) and object -counts, over the entire system and broken down by pool. -.TP -.B \fBmkpool\fP \fIfoo\fP -Create a pool with name foo. -.TP -.B \fBrmpool\fP \fIfoo\fP [ \fIfoo\fP \-\-yes\-i\-really\-really\-mean\-it ] -Delete the pool foo (and all its data) -.UNINDENT -.SH POOL SPECIFIC COMMANDS -.INDENT 0.0 -.TP -.B \fBget\fP \fIname\fP \fIoutfile\fP -Read object name from the cluster and write it to outfile. -.TP -.B \fBput\fP \fIname\fP \fIinfile\fP -Write object name to the cluster with contents from infile. -.TP -.B \fBrm\fP \fIname\fP -Remove object name. 
-.TP -.B \fBls\fP \fIoutfile\fP -List objects in given pool and write to outfile. -.TP -.B \fBlssnap\fP -List snapshots for given pool. -.TP -.B \fBclonedata\fP \fIsrcname\fP \fIdstname\fP \-\-object\-locator \fIkey\fP -Clone object byte data from \fIsrcname\fP to \fIdstname\fP\&. Both objects must be stored with the locator key \fIkey\fP (usually either \fIsrcname\fP or \fIdstname\fP). Object attributes and omap keys are not copied or cloned. -.TP -.B \fBmksnap\fP \fIfoo\fP -Create pool snapshot named \fIfoo\fP\&. -.TP -.B \fBrmsnap\fP \fIfoo\fP -Remove pool snapshot named \fIfoo\fP\&. -.TP -.B \fBbench\fP \fIseconds\fP \fImode\fP [ \-b \fIobjsize\fP ] [ \-t \fIthreads\fP ] -Benchmark for \fIseconds\fP\&. The mode can be \fIwrite\fP, \fIseq\fP, or -\fIrand\fP\&. \fIseq\fP and \fIrand\fP are read benchmarks, either -sequential or random. Before running one of the reading benchmarks, -run a write benchmark with the \fI\-\-no\-cleanup\fP option. The default -object size is 4 MB, and the default number of simulated threads -(parallel writes) is 16. -.UNINDENT -.sp -\fBcleanup\fP -.INDENT 0.0 -.TP -.B \fBlistomapkeys\fP \fIname\fP -List all the keys stored in the object map of object name. -.TP -.B \fBlistomapvals\fP \fIname\fP -List all key/value pairs stored in the object map of object name. -The values are dumped in hexadecimal. -.TP -.B \fBgetomapval\fP \fIname\fP \fIkey\fP -Dump the hexadecimal value of key in the object map of object name. -.TP -.B \fBsetomapval\fP \fIname\fP \fIkey\fP \fIvalue\fP -Set the value of key in the object map of object name. -.TP -.B \fBrmomapkey\fP \fIname\fP \fIkey\fP -Remove key from the object map of object name. -.TP -.B \fBgetomapheader\fP \fIname\fP -Dump the hexadecimal value of the object map header of object name. -.TP -.B \fBsetomapheader\fP \fIname\fP \fIvalue\fP -Set the value of the object map header of object name. -.UNINDENT -.SH EXAMPLES -.sp -To view cluster utilization: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -rados df -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -To get a list object in pool foo sent to stdout: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -rados \-p foo ls \- -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -To write an object: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -rados \-p foo put myobject blah.txt -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -To create a snapshot: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -rados \-p foo mksnap mysnap -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -To delete the object: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -rados \-p foo rm myobject -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -To read a previously snapshotted version of an object: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -rados \-p foo \-s mysnap get myobject blah.txt.old -.ft P -.fi -.UNINDENT -.UNINDENT -.SH AVAILABILITY -.sp -\fBrados\fP is part of the Ceph distributed storage system. Please refer to -the Ceph documentation at \fI\%http://ceph.com/docs\fP for more information. -.SH SEE ALSO -.sp -\fBceph\fP(8) -.SH COPYRIGHT -2010-2014, Inktank Storage, Inc. and contributors. Licensed under Creative Commons BY-SA -.\" Generated by docutils manpage writer. -. diff --git a/man/radosgw-admin.8 b/man/radosgw-admin.8 deleted file mode 100644 index 64c54ae2a5757..0000000000000 --- a/man/radosgw-admin.8 +++ /dev/null @@ -1,333 +0,0 @@ -.\" Man page generated from reStructuredText. -. -.TH "RADOSGW-ADMIN" "8" "January 12, 2014" "dev" "Ceph" -.SH NAME -radosgw-admin \- rados REST gateway user administration utility -. -.nr rst2man-indent-level 0 -. 
-.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -.SH SYNOPSIS -.nf -\fBradosgw\-admin\fP \fIcommand\fP [ \fIoptions\fP \fI\&...\fP ] -.fi -.sp -.SH DESCRIPTION -.sp -\fBradosgw\-admin\fP is a RADOS gateway user administration utility. It -allows creating and modifying users. -.SH COMMANDS -.sp -\fIcommand\fP can be one of the following options: -.INDENT 0.0 -.TP -.B \fBuser create\fP -Create a new user -.TP -.B \fBuser modify\fP -Modify a user -.TP -.B \fBuser info\fP -Display information of a user, and any potentially available -subusers and keys -.TP -.B \fBuser rm\fP -Remove a user -.TP -.B \fBsubuser create\fP -Create a new subuser (primarily useful for clients using the Swift API) -.TP -.B \fBsubuser modify\fP -Modify a subuser -.TP -.B \fBsubuser rm\fP -Remove a subuser -.TP -.B \fBbucket list\fP -List all buckets -.TP -.B \fBbucket unlink\fP -Remove a bucket -.TP -.B \fBbucket rm\fP -Remove a bucket -.TP -.B \fBobject rm\fP -Remove an object -.TP -.B \fBkey create\fP -Create an access key -.TP -.B \fBkey rm\fP -Remove an access key -.TP -.B \fBpool add\fP -Add an existing pool for data placement -.TP -.B \fBpool rm\fP -Remove an existing pool from data placement set -.TP -.B \fBpools list\fP -List placement active set -.TP -.B \fBpolicy\fP -Display bucket/object policy -.TP -.B \fBlog show\fP -Show the log of a bucket (with a specified date) -.TP -.B \fBusage show\fP -Show the usage information (with optional user and date range) -.TP -.B \fBusage trim\fP -Trim usage information (with optional user and date range) -.UNINDENT -.SH OPTIONS -.INDENT 0.0 -.TP -.B \-c ceph.conf, \-\-conf=ceph.conf -Use \fIceph.conf\fP configuration file instead of the default -\fB/etc/ceph/ceph.conf\fP to determine monitor addresses during -startup. -.UNINDENT -.INDENT 0.0 -.TP -.B \-m monaddress[:port] -Connect to specified monitor (instead of looking through ceph.conf). -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-uid=uid -The radosgw user ID. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-secret=secret -The secret associated with a given key. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-display\-name=name -Configure the display name of the user. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-email=email -The e\-mail address of the user -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-bucket=bucket -Specify the bucket name. 
-.UNINDENT -.INDENT 0.0 -.TP -.B \-\-object=object -Specify the object name. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-date=yyyy\-mm\-dd -The date needed for some commands -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-start\-date=yyyy\-mm\-dd -The start date needed for some commands -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-end\-date=yyyy\-mm\-dd -The end date needed for some commands -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-auth\-uid=auid -The librados auid -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-purge\-data -Remove user data before user removal -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-purge\-objects -Remove all objects before bucket removal -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-lazy\-remove -Defer removal of object tail -.UNINDENT -.SH EXAMPLES -.sp -Generate a new user: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -$ radosgw\-admin user create \-\-display\-name="johnny rotten" \-\-uid=johnny -{ "user_id": "johnny", - "rados_uid": 0, - "display_name": "johnny rotten", - "email": "", - "suspended": 0, - "subusers": [], - "keys": [ - { "user": "johnny", - "access_key": "TCICW53D9BQ2VGC46I44", - "secret_key": "tfm9aHMI8X76L3UdgE+ZQaJag1vJQmE6HDb5Lbrz"}], - "swift_keys": []} -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -Remove a user: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -$ radosgw\-admin user rm \-\-uid=johnny -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -Remove a user and all associated buckets with their contents: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -$ radosgw\-admin user rm \-\-uid=johnny \-\-purge\-data -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -Remove a bucket: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -$ radosgw\-admin bucket unlink \-\-bucket=foo -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -Show the logs of a bucket from April 1st, 2012: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -$ radosgw\-admin log show \-\-bucket=foo \-\-date=2012=04\-01 -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -Show usage information for user from March 1st to (but not including) April 1st, 2012: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -$ radosgw\-admin usage show \-\-uid=johnny \e - \-\-start\-date=2012\-03\-01 \-\-end\-date=2012\-04\-01 -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -Show only summary of usage information for all users: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -$ radosgw\-admin usage show \-\-show\-log\-entries=false -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -Trim usage information for user until March 1st, 2012: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -$ radosgw\-admin usage trim \-\-uid=johnny \-\-end\-date=2012\-04\-01 -.ft P -.fi -.UNINDENT -.UNINDENT -.SH AVAILABILITY -.sp -\fBradosgw\-admin\fP is part of the Ceph distributed storage system. Please -refer to the Ceph documentation at \fI\%http://ceph.com/docs\fP for more -information. -.SH SEE ALSO -.sp -\fBceph\fP(8) -.SH COPYRIGHT -2010-2014, Inktank Storage, Inc. and contributors. Licensed under Creative Commons BY-SA -.\" Generated by docutils manpage writer. -. diff --git a/man/radosgw.8 b/man/radosgw.8 deleted file mode 100644 index 054fb7938950b..0000000000000 --- a/man/radosgw.8 +++ /dev/null @@ -1,242 +0,0 @@ -.\" Man page generated from reStructuredText. -. -.TH "RADOSGW" "8" "January 12, 2014" "dev" "Ceph" -.SH NAME -radosgw \- rados REST gateway -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. 
nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -.SH SYNOPSIS -.nf -\fBradosgw\fP -.fi -.sp -.SH DESCRIPTION -.sp -\fBradosgw\fP is an HTTP REST gateway for the RADOS object store, a part -of the Ceph distributed storage system. It is implemented as a FastCGI -module using libfcgi, and can be used in conjunction with any FastCGI -capable web server. -.SH OPTIONS -.INDENT 0.0 -.TP -.B \-c ceph.conf, \-\-conf=ceph.conf -Use \fIceph.conf\fP configuration file instead of the default -\fB/etc/ceph/ceph.conf\fP to determine monitor addresses during startup. -.UNINDENT -.INDENT 0.0 -.TP -.B \-m monaddress[:port] -Connect to specified monitor (instead of looking through -\fBceph.conf\fP). -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-rgw\-socket\-path=path -Specify a unix domain socket path. -.UNINDENT -.SH CONFIGURATION -.sp -Currently it\(aqs the easiest to use the RADOS Gateway with Apache and mod_fastcgi: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -FastCgiExternalServer /var/www/s3gw.fcgi \-socket /tmp/radosgw.sock - - - ServerName rgw.example1.com - ServerAlias rgw - ServerAdmin webmaster@example1.com - DocumentRoot /var/www - - RewriteEngine On - RewriteRule ^/([a\-zA\-Z0\-9\-_.]*)([/]?.*) /s3gw.fcgi?page=$1¶ms=$2&%{QUERY_STRING} [E=HTTP_AUTHORIZATION:%{HTTP:Authorization},L] - - - - Options +ExecCGI - AllowOverride All - SetHandler fastcgi\-script - Order allow,deny - Allow from all - AuthBasicAuthoritative Off - - - - AllowEncodedSlashes On - ServerSignature Off - -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -And the corresponding radosgw script (/var/www/s3gw.fcgi): -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -#!/bin/sh -exec /usr/bin/radosgw \-c /etc/ceph/ceph.conf \-n client.radosgw.gateway -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -The radosgw daemon is a standalone process which needs a configuration -section in the ceph.conf The section name should start with -\(aqclient.radosgw.\(aq as specified in /etc/init.d/radosgw: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -[client.radosgw.gateway] - host = gateway - keyring = /etc/ceph/keyring.radosgw.gateway - rgw socket path = /tmp/radosgw.sock -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -You will also have to generate a key for the radosgw to use for -authentication with the cluster: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -ceph\-authtool \-C \-n client.radosgw.gateway \-\-gen\-key /etc/ceph/keyring.radosgw.gateway -ceph\-authtool \-n client.radosgw.gateway \-\-cap mon \(aqallow rw\(aq \-\-cap osd \(aqallow rwx\(aq /etc/ceph/keyring.radosgw.gateway -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -And add the key to 
the auth entries: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -ceph auth add client.radosgw.gateway \-\-in\-file=keyring.radosgw.gateway -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -Now you can start Apache and the radosgw daemon: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -/etc/init.d/apache2 start -/etc/init.d/radosgw start -.ft P -.fi -.UNINDENT -.UNINDENT -.SH USAGE LOGGING -.sp -The \fBradosgw\fP maintains an asynchronous usage log. It accumulates -statistics about user operations and flushes it periodically. The -logs can be accessed and managed through \fBradosgw\-admin\fP\&. -.sp -The information that is being logged contains total data transfer, -total operations, and total successful operations. The data is being -accounted in an hourly resolution under the bucket owner, unless the -operation was done on the service (e.g., when listing a bucket) in -which case it is accounted under the operating user. -.sp -Following is an example configuration: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -[client.radosgw.gateway] - rgw enable usage log = true - rgw usage log tick interval = 30 - rgw usage log flush threshold = 1024 - rgw usage max shards = 32 - rgw usage max user shards = 1 -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -The total number of shards determines how many total objects hold the -usage log information. The per\-user number of shards specify how many -objects hold usage information for a single user. The tick interval -configures the number of seconds between log flushes, and the flush -threshold specify how many entries can be kept before resorting to -synchronous flush. -.SH AVAILABILITY -.sp -\fBradosgw\fP is part of the Ceph distributed storage system. Please refer -to the Ceph documentation at \fI\%http://ceph.com/docs\fP for more -information. -.SH SEE ALSO -.sp -\fBceph\fP(8) -\fBradosgw\-admin\fP(8) -.SH COPYRIGHT -2010-2014, Inktank Storage, Inc. and contributors. Licensed under Creative Commons BY-SA -.\" Generated by docutils manpage writer. -. diff --git a/man/rbd-fuse.8 b/man/rbd-fuse.8 deleted file mode 100644 index 3ba0636edf419..0000000000000 --- a/man/rbd-fuse.8 +++ /dev/null @@ -1,110 +0,0 @@ -.\" Man page generated from reStructuredText. -. -.TH "RBD-FUSE" "8" "January 12, 2014" "dev" "Ceph" -.SH NAME -rbd-fuse \- expose rbd images as files -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. 
RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -.SH SYNOPSIS -.nf -\fBrbd\-fuse\fP [ \-p pool ] [\-c conffile] \fImountpoint\fP [ \fIfuse options\fP ] -.fi -.sp -.SH DESCRIPTION -.sp -\fBrbd\-fuse\fP is a FUSE (File system in USErspace) client for RADOS -block device (rbd) images. Given a pool containing rbd images, -it will mount a userspace filesystem allowing access to those images -as regular files at \fBmountpoint\fP\&. -.sp -The file system can be unmounted with: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -fusermount \-u mountpoint -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -or by sending \fBSIGINT\fP to the \fBrbd\-fuse\fP process. -.SH OPTIONS -.sp -Any options not recognized by rbd\-fuse will be passed on to libfuse. -.INDENT 0.0 -.TP -.B \-c ceph.conf -Use \fIceph.conf\fP configuration file instead of the default -\fB/etc/ceph/ceph.conf\fP to determine monitor addresses during startup. -.UNINDENT -.INDENT 0.0 -.TP -.B \-p pool -Use \fIpool\fP as the pool to search for rbd images. Default is \fBrbd\fP\&. -.UNINDENT -.SH AVAILABILITY -.sp -\fBrbd\-fuse\fP is part of the Ceph distributed storage system. Please refer to -the Ceph documentation at \fI\%http://ceph.com/docs\fP for more information. -.SH SEE ALSO -.sp -fusermount(8), -\fBrbd\fP(8) -.SH COPYRIGHT -2010-2014, Inktank Storage, Inc. and contributors. Licensed under Creative Commons BY-SA -.\" Generated by docutils manpage writer. -. diff --git a/man/rbd-replay-many.8 b/man/rbd-replay-many.8 deleted file mode 100644 index e07d79a048119..0000000000000 --- a/man/rbd-replay-many.8 +++ /dev/null @@ -1,134 +0,0 @@ -.\" Man page generated from reStructuredText. -. -.TH "RBD-REPLAY-MANY" "8" "September 04, 2014" "dev" "Ceph" -.SH NAME -rbd-replay-many \- replay a rados block device (RBD) workload on several clients -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -.SH SYNOPSIS -.nf -\fBrbd\-replay\-many\fP [ \fIoptions\fP ] \-\-original\-image \fIname\fP \fIhost1\fP [ \fIhost2\fP [ ... 
] ] \-\- \fIrbd_replay_args\fP -.fi -.sp -.SH DESCRIPTION -.sp -\fBrbd\-replay\-many\fP is a utility for replaying a rados block device (RBD) workload on several clients. -Although all clients use the same workload, they replay against separate images. -This matches normal use of librbd, where each original client is a VM with its own image. -.sp -Configuration and replay files are not automatically copied to clients. -Replay images must already exist. -.SH OPTIONS -.INDENT 0.0 -.TP -.B \-\-original\-image name -Specifies the name (and snap) of the originally traced image. -Necessary for correct name mapping. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-image\-prefix prefix -Prefix of image names to replay against. -Specifying \-\-image\-prefix=foo results in clients replaying against foo\-0, foo\-1, etc. -Defaults to the original image name. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-exec program -Path to the rbd\-replay executable. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-delay seconds -Delay between starting each client. Defaults to 0. -.UNINDENT -.SH EXAMPLES -.sp -Typical usage: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -rbd\-replay\-many host\-0 host\-1 \-\-original\-image=image \-\- \-c ceph.conf replay.bin -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -This results in the following commands being executed: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -ssh host\-0 \(aqrbd\-replay\(aq \-\-map\-image \(aqimage=image\-0\(aq \-c ceph.conf replay.bin -ssh host\-1 \(aqrbd\-replay\(aq \-\-map\-image \(aqimage=image\-1\(aq \-c ceph.conf replay.bin -.ft P -.fi -.UNINDENT -.UNINDENT -.SH AVAILABILITY -.sp -\fBrbd\-replay\-many\fP is part of the Ceph distributed storage system. Please refer to -the Ceph documentation at \fI\%http://ceph.com/docs\fP for more information. -.SH SEE ALSO -.sp -\fBrbd\-replay\fP(8), -\fBrbd\fP(8) -.SH COPYRIGHT -2010-2014, Inktank Storage, Inc. and contributors. Licensed under Creative Commons BY-SA -.\" Generated by docutils manpage writer. -. diff --git a/man/rbd-replay-prep.8 b/man/rbd-replay-prep.8 deleted file mode 100644 index 5b3c90102ccee..0000000000000 --- a/man/rbd-replay-prep.8 +++ /dev/null @@ -1,103 +0,0 @@ -.\" Man page generated from reStructuredText. -. -.TH "RBD-REPLAY-PREP" "8" "August 21, 2014" "dev" "Ceph" -.SH NAME -rbd-replay-prep \- prepare captured rados block device (RBD) workloads for replay -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. 
RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -.SH SYNOPSIS -.nf -\fBrbd\-replay\-prep\fP [ \-\-window \fIseconds\fP ] [ \-\-anonymize ] \fItrace_dir\fP \fIreplay_file\fP -.fi -.sp -.SH DESCRIPTION -.sp -\fBrbd\-replay\-prep\fP processes raw rados block device (RBD) traces to prepare them for \fBrbd\-replay\fP\&. -.SH OPTIONS -.INDENT 0.0 -.TP -.B \-\-window seconds -Requests further apart than \(aqseconds\(aq seconds are assumed to be independent. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-anonymize -Anonymizes image and snap names. -.UNINDENT -.SH EXAMPLES -.sp -To prepare workload1\-trace for replay: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -rbd\-replay\-prep workload1\-trace/ust/uid/1000/64\-bit workload1 -.ft P -.fi -.UNINDENT -.UNINDENT -.SH AVAILABILITY -.sp -\fBrbd\-replay\-prep\fP is part of the Ceph distributed storage system. Please refer to -the Ceph documentation at \fI\%http://ceph.com/docs\fP for more information. -.SH SEE ALSO -.sp -\fBrbd\-replay\fP(8), -\fBrbd\fP(8) -.SH COPYRIGHT -2010-2014, Inktank Storage, Inc. and contributors. Licensed under Creative Commons BY-SA -.\" Generated by docutils manpage writer. -. diff --git a/man/rbd-replay.8 b/man/rbd-replay.8 deleted file mode 100644 index e50c2fdcd8a7e..0000000000000 --- a/man/rbd-replay.8 +++ /dev/null @@ -1,141 +0,0 @@ -.\" Man page generated from reStructuredText. -. -.TH "RBD-REPLAY" "8" "September 10, 2014" "dev" "Ceph" -.SH NAME -rbd-replay \- replay rados block device (RBD) workloads -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -.SH SYNOPSIS -.nf -\fBrbd\-replay\fP [ \fIoptions\fP ] \fIreplay_file\fP -.fi -.sp -.SH DESCRIPTION -.sp -\fBrbd\-replay\fP is a utility for replaying rados block device (RBD) workloads. -.SH OPTIONS -.INDENT 0.0 -.TP -.B \-c ceph.conf, \-\-conf ceph.conf -Use ceph.conf configuration file instead of the default /etc/ceph/ceph.conf to -determine monitor addresses during startup. -.UNINDENT -.INDENT 0.0 -.TP -.B \-p pool, \-\-pool pool -Interact with the given pool. Defaults to \(aqrbd\(aq. 
-.UNINDENT -.INDENT 0.0 -.TP -.B \-\-latency\-multiplier -Multiplies inter\-request latencies. Default: 1. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-read\-only -Only replay non\-destructive requests. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-map\-image rule -Add a rule to map image names in the trace to image names in the replay cluster. -A rule of image1@snap1=image2@snap2 would map snap1 of image1 to snap2 of image2. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-dump\-perf\-counters -\fBExperimental\fP -Dump performance counters to standard out before an image is closed. -Performance counters may be dumped multiple times if multiple images are closed, -or if the same image is opened and closed multiple times. -Performance counters and their meaning may change between versions. -.UNINDENT -.SH EXAMPLES -.sp -To replay workload1 as fast as possible: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -rbd\-replay \-\-latency\-multiplier=0 workload1 -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -To replay workload1 but use test_image instead of prod_image: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -rbd\-replay \-\-map\-image=prod_image=test_image workload1 -.ft P -.fi -.UNINDENT -.UNINDENT -.SH AVAILABILITY -.sp -\fBrbd\-replay\fP is part of the Ceph distributed storage system. Please refer to -the Ceph documentation at \fI\%http://ceph.com/docs\fP for more information. -.SH SEE ALSO -.sp -\fBrbd\-replay\-prep\fP(8), -\fBrbd\fP(8) -.SH COPYRIGHT -2010-2014, Inktank Storage, Inc. and contributors. Licensed under Creative Commons BY-SA -.\" Generated by docutils manpage writer. -. diff --git a/man/rbd.8 b/man/rbd.8 deleted file mode 100644 index ce457bde5aac7..0000000000000 --- a/man/rbd.8 +++ /dev/null @@ -1,616 +0,0 @@ -.\" Man page generated from reStructuredText. -. -.TH "RBD" "8" "January 12, 2014" "dev" "Ceph" -.SH NAME -rbd \- manage rados block device (RBD) images -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -. -.nr rst2man-indent-level 0 -. -.de1 rstReportMargin -\\$1 \\n[an-margin] -level \\n[rst2man-indent-level] -level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] -- -\\n[rst2man-indent0] -\\n[rst2man-indent1] -\\n[rst2man-indent2] -.. -.de1 INDENT -.\" .rstReportMargin pre: -. RS \\$1 -. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] -. nr rst2man-indent-level +1 -.\" .rstReportMargin post: -.. -.de UNINDENT -. RE -.\" indent \\n[an-margin] -.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] -.nr rst2man-indent-level -1 -.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] -.in \\n[rst2man-indent\\n[rst2man-indent-level]]u -.. -.SH SYNOPSIS -.nf -\fBrbd\fP [ \-c \fIceph.conf\fP ] [ \-m \fImonaddr\fP ] [ \-p | \-\-pool \fIpool\fP ] [ -\-\-size \fIsize\fP ] [ \-\-order \fIbits\fP ] [ \fIcommand\fP ... ] -.fi -.sp -.SH DESCRIPTION -.sp -\fBrbd\fP is a utility for manipulating rados block device (RBD) images, -used by the Linux rbd driver and the rbd storage driver for Qemu/KVM. 
-RBD images are simple block devices that are striped over objects and -stored in a RADOS object store. The size of the objects the image is -striped over must be a power of two. -.SH OPTIONS -.INDENT 0.0 -.TP -.B \-c ceph.conf, \-\-conf ceph.conf -Use ceph.conf configuration file instead of the default /etc/ceph/ceph.conf to -determine monitor addresses during startup. -.UNINDENT -.INDENT 0.0 -.TP -.B \-m monaddress[:port] -Connect to specified monitor (instead of looking through ceph.conf). -.UNINDENT -.INDENT 0.0 -.TP -.B \-p pool, \-\-pool pool -Interact with the given pool. Required by most commands. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-no\-progress -Do not output progress information (goes to standard error by -default for some commands). -.UNINDENT -.SH PARAMETERS -.INDENT 0.0 -.TP -.B \-\-image\-format format -Specifies which object layout to use. The default is 1. -.INDENT 7.0 -.IP \(bu 2 -format 1 \- Use the original format for a new rbd image. This format is -understood by all versions of librbd and the kernel rbd module, but -does not support newer features like cloning. -.IP \(bu 2 -format 2 \- Use the second rbd format, which is supported by -librbd and kernel since version 3.11 (except for striping). This adds -support for cloning and is more easily extensible to allow more -features in the future. -.UNINDENT -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-size size\-in\-mb -Specifies the size (in megabytes) of the new rbd image. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-order bits -Specifies the object size expressed as a number of bits, such that -the object size is \fB1 << order\fP\&. The default is 22 (4 MB). -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-stripe\-unit size\-in\-bytes -Specifies the stripe unit size in bytes. See striping section (below) for more details. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-stripe\-count num -Specifies the number of objects to stripe over before looping back -to the first object. See striping section (below) for more details. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-snap snap -Specifies the snapshot name for the specific operation. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-id username -Specifies the username (without the \fBclient.\fP prefix) to use with the map command. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-keyfile filename -Specifies a file containing the secret to use with the map command. -If not specified, \fBclient.admin\fP will be used by default. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-keyring filename -Specifies a keyring file containing a secret for the specified user -to use with the map command. If not specified, the default keyring -locations will be searched. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-shared tag -Option for \fIlock add\fP that allows multiple clients to lock the -same image if they use the same tag. The tag is an arbitrary -string. This is useful for situations where an image must -be open from more than one client at once, like during -live migration of a virtual machine, or for use underneath -a clustered filesystem. -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-format format -Specifies output formatting (default: plain, json, xml) -.UNINDENT -.INDENT 0.0 -.TP -.B \-\-pretty\-format -Make json or xml formatted output more human\-readable. -.UNINDENT -.INDENT 0.0 -.TP -.B \-o map\-options, \-\-options map\-options -Specifies which options to use when mapping an image. map\-options is -a comma\-separated string of options (similar to mount(8) mount options). -See map options section below for more details. 
-.UNINDENT -.INDENT 0.0 -.TP -.B \-\-read\-only -Map the image read\-only. Equivalent to \-o ro. -.UNINDENT -.SH COMMANDS -.INDENT 0.0 -.TP -.B \fBls\fP [\-l | \-\-long] [pool\-name] -Will list all rbd images listed in the rbd_directory object. With -\-l, also show snapshots, and use longer\-format output including -size, parent (if clone), format, etc. -.TP -.B \fBinfo\fP [\fIimage\-name\fP] -Will dump information (such as size and order) about a specific rbd image. -If image is a clone, information about its parent is also displayed. -If a snapshot is specified, whether it is protected is shown as well. -.TP -.B \fBcreate\fP [\fIimage\-name\fP] -Will create a new rbd image. You must also specify the size via \-\-size. The -\-\-stripe\-unit and \-\-stripe\-count arguments are optional, but must be used together. -.TP -.B \fBclone\fP [\fIparent\-snapname\fP] [\fIimage\-name\fP] -Will create a clone (copy\-on\-write child) of the parent snapshot. -Object order will be identical to that of the parent image unless -specified. Size will be the same as the parent snapshot. -.sp -The parent snapshot must be protected (see \fIrbd snap protect\fP). -This requires image format 2. -.TP -.B \fBflatten\fP [\fIimage\-name\fP] -If image is a clone, copy all shared blocks from the parent snapshot and -make the child independent of the parent, severing the link between -parent snap and child. The parent snapshot can be unprotected and -deleted if it has no further dependent clones. -.sp -This requires image format 2. -.TP -.B \fBchildren\fP [\fIimage\-name\fP] -List the clones of the image at the given snapshot. This checks -every pool, and outputs the resulting poolname/imagename. -.sp -This requires image format 2. -.TP -.B \fBresize\fP [\fIimage\-name\fP] [\-\-allow\-shrink] -Resizes rbd image. The size parameter also needs to be specified. -The \-\-allow\-shrink option lets the size be reduced. -.TP -.B \fBrm\fP [\fIimage\-name\fP] -Deletes an rbd image (including all data blocks). If the image has -snapshots, this fails and nothing is deleted. -.TP -.B \fBexport\fP [\fIimage\-name\fP] [\fIdest\-path\fP] -Exports image to dest path (use \- for stdout). -.TP -.B \fBimport\fP [\fIpath\fP] [\fIdest\-image\fP] -Creates a new image and imports its data from path (use \- for -stdin). The import operation will try to create sparse rbd images -if possible. For import from stdin, the sparsification unit is -the data block size of the destination image (1 << order). -.TP -.B \fBexport\-diff\fP [\fIimage\-name\fP] [\fIdest\-path\fP] [\-\-from\-snap \fIsnapname\fP] -Exports an incremental diff for an image to dest path (use \- for stdout). If -an initial snapshot is specified, only changes since that snapshot are included; otherwise, -any regions of the image that contain data are included. The end snapshot is specified -using the standard \-\-snap option or @snap syntax (see below). The image diff format includes -metadata about image size changes, and the start and end snapshots. It efficiently represents -discarded or \(aqzero\(aq regions of the image. -.TP -.B \fBimport\-diff\fP [\fIsrc\-path\fP] [\fIimage\-name\fP] -Imports an incremental diff of an image and applies it to the current image. If the diff -was generated relative to a start snapshot, we verify that snapshot already exists before -continuing. If there was an end snapshot we verify it does not already exist before -applying the changes, and create the snapshot when we are done. 
-.TP -.B \fBdiff\fP [\fIimage\-name\fP] [\-\-from\-snap \fIsnapname\fP] -Dump a list of byte extents in the image that have changed since the specified start -snapshot, or since the image was created. Each output line includes the starting offset -(in bytes), the length of the region (in bytes), and either \(aqzero\(aq or \(aqdata\(aq to indicate -whether the region is known to be zeros or may contain other data. -.TP -.B \fBcp\fP [\fIsrc\-image\fP] [\fIdest\-image\fP] -Copies the content of a src\-image into the newly created dest\-image. -dest\-image will have the same size, order, and image format as src\-image. -.TP -.B \fBmv\fP [\fIsrc\-image\fP] [\fIdest\-image\fP] -Renames an image. Note: rename across pools is not supported. -.TP -.B \fBsnap\fP ls [\fIimage\-name\fP] -Dumps the list of snapshots inside a specific image. -.TP -.B \fBsnap\fP create [\fIimage\-name\fP] -Creates a new snapshot. Requires the snapshot name parameter specified. -.TP -.B \fBsnap\fP rollback [\fIimage\-name\fP] -Rollback image content to snapshot. This will iterate through the entire blocks -array and update the data head content to the snapshotted version. -.TP -.B \fBsnap\fP rm [\fIimage\-name\fP] -Removes the specified snapshot. -.TP -.B \fBsnap\fP purge [\fIimage\-name\fP] -Removes all snapshots from an image. -.TP -.B \fBsnap\fP protect [\fIimage\-name\fP] -Protect a snapshot from deletion, so that clones can be made of it -(see \fIrbd clone\fP). Snapshots must be protected before clones are made; -protection implies that there exist dependent cloned children that -refer to this snapshot. \fIrbd clone\fP will fail on a nonprotected -snapshot. -.sp -This requires image format 2. -.TP -.B \fBsnap\fP unprotect [\fIimage\-name\fP] -Unprotect a snapshot from deletion (undo \fIsnap protect\fP). If cloned -children remain, \fIsnap unprotect\fP fails. (Note that clones may exist -in different pools than the parent snapshot.) -.sp -This requires image format 2. -.TP -.B \fBmap\fP [\fIimage\-name\fP] [\-o | \-\-options \fImap\-options\fP ] [\-\-read\-only] -Maps the specified image to a block device via the rbd kernel module. -.TP -.B \fBunmap\fP [\fIdevice\-path\fP] -Unmaps the block device that was mapped via the rbd kernel module. -.TP -.B \fBshowmapped\fP -Show the rbd images that are mapped via the rbd kernel module. -.TP -.B \fBlock\fP list [\fIimage\-name\fP] -Show locks held on the image. The first column is the locker -to use with the \fIlock remove\fP command. -.TP -.B \fBlock\fP add [\fIimage\-name\fP] [\fIlock\-id\fP] -Lock an image. The lock\-id is an arbitrary name for the user\(aqs -convenience. By default, this is an exclusive lock, meaning it -will fail if the image is already locked. The \-\-shared option -changes this behavior. Note that locking does not affect -any operation other than adding a lock. It does not -protect an image from being deleted. -.TP -.B \fBlock\fP remove [\fIimage\-name\fP] [\fIlock\-id\fP] [\fIlocker\fP] -Release a lock on an image. The lock id and locker are -as output by lock ls. -.TP -.B \fBbench\-write\fP [\fIimage\-name\fP] \-\-io\-size [\fIio\-size\-in\-bytes\fP] \-\-io\-threads [\fInum\-ios\-in\-flight\fP] \-\-io\-total [\fItotal\-bytes\-to\-write\fP] -Generate a series of sequential writes to the image and measure the -write throughput and latency. 
Defaults are: \-\-io\-size 4096, \-\-io\-threads 16, -\-\-io\-total 1GB -.UNINDENT -.SH IMAGE NAME -.sp
-In addition to using the \-\-pool and the \-\-snap options, the image name can include both -the pool name and the snapshot name. The image name format is as follows: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -[pool/]image\-name[@snap] -.ft P -.fi -.UNINDENT -.UNINDENT -.sp
-Thus an image name that contains a slash character (\(aq/\(aq) requires specifying the pool -name explicitly. -.SH STRIPING -.sp
-RBD images are striped over many objects, which are then stored by the -Ceph distributed object store (RADOS). As a result, read and write -requests for the image are distributed across many nodes in the -cluster, generally preventing any single node from becoming a -bottleneck when individual images get large or busy. -.sp
-The striping is controlled by three parameters: -.INDENT 0.0 -.TP -.B order
-The size of objects we stripe over is a power of two, specifically 2^[*order*] bytes. The default -is 22, or 4 MB. -.UNINDENT -.INDENT 0.0 -.TP -.B stripe_unit
-Each [*stripe_unit*] contiguous bytes are stored adjacently in the same object, before we move on -to the next object. -.UNINDENT -.INDENT 0.0 -.TP -.B stripe_count
-After we write [*stripe_unit*] bytes to [*stripe_count*] objects, we loop back to the initial object -and write another stripe, until the object reaches its maximum size (as specified by [*order*]). At that -point, we move on to the next [*stripe_count*] objects. -.UNINDENT -.sp
-By default, [\fIstripe_unit\fP] is the same as the object size and [\fIstripe_count\fP] is 1. Specifying a different -[\fIstripe_unit\fP] requires that the STRIPINGV2 feature be supported (added in Ceph v0.53) and format 2 images be -used. -.SH MAP OPTIONS -.sp
-Most of these options are useful mainly for debugging and benchmarking. The -default values are set in the kernel and may therefore depend on the version of -the running kernel. -.INDENT 0.0 -.IP \(bu 2 -fsid=aaaaaaaa\-bbbb\-cccc\-dddd\-eeeeeeeeeeee \- FSID that should be assumed by -the client. -.IP \(bu 2 -ip=a.b.c.d[:p] \- IP and, optionally, port the client should use. -.IP \(bu 2 -share \- Enable sharing of client instances with other mappings (default). -.IP \(bu 2 -noshare \- Disable sharing of client instances with other mappings. -.IP \(bu 2 -crc \- Enable CRC32C checksumming for data writes (default). -.IP \(bu 2 -nocrc \- Disable CRC32C checksumming for data writes. -.IP \(bu 2 -osdkeepalive=x \- OSD keepalive timeout (default is 5 seconds). -.IP \(bu 2 -osd_idle_ttl=x \- OSD idle TTL (default is 60 seconds). -.IP \(bu 2 -rw \- Map the image read\-write (default). -.IP \(bu 2 -ro \- Map the image read\-only. Equivalent to \-\-read\-only.
-.UNINDENT -.SH EXAMPLES -.sp -To create a new rbd image that is 100 GB: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -rbd \-p mypool create myimage \-\-size 102400 -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -or alternatively: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -rbd create mypool/myimage \-\-size 102400 -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -To use a non\-default object size (8 MB): -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -rbd create mypool/myimage \-\-size 102400 \-\-order 23 -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -To delete an rbd image (be careful!): -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -rbd rm mypool/myimage -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -To create a new snapshot: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -rbd snap create mypool/myimage@mysnap -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -To create a copy\-on\-write clone of a protected snapshot: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -rbd clone mypool/myimage@mysnap otherpool/cloneimage -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -To see which clones of a snapshot exist: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -rbd children mypool/myimage@mysnap -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -To delete a snapshot: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -rbd snap rm mypool/myimage@mysnap -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -To map an image via the kernel with cephx enabled: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -rbd map mypool/myimage \-\-id admin \-\-keyfile secretfile -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -To unmap an image: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -rbd unmap /dev/rbd0 -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -To create an image and a clone from it: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -rbd import \-\-image\-format 2 image mypool/parent -rbd snap create \-\-snap snapname mypool/parent -rbd snap protect mypool/parent@snap -rbd clone mypool/parent@snap otherpool/child -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -To create an image with a smaller stripe_unit (to better distribute small writes in some workloads): -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -rbd \-p mypool create myimage \-\-size 102400 \-\-stripe\-unit 65536 \-\-stripe\-count 16 -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -To change an image from one image format to another, export it and then -import it as the desired image format: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -rbd export mypool/myimage@snap /tmp/img -rbd import \-\-image\-format 2 /tmp/img mypool/myimage2 -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -To lock an image for exclusive use: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -rbd lock add mypool/myimage mylockid -.ft P -.fi -.UNINDENT -.UNINDENT -.sp -To release a lock: -.INDENT 0.0 -.INDENT 3.5 -.sp -.nf -.ft C -rbd lock remove mypool/myimage mylockid client.2485 -.ft P -.fi -.UNINDENT -.UNINDENT -.SH AVAILABILITY -.sp -\fBrbd\fP is part of the Ceph distributed storage system. Please refer to -the Ceph documentation at \fI\%http://ceph.com/docs\fP for more information. -.SH SEE ALSO -.sp -\fBceph\fP(8), -\fBrados\fP(8) -.SH COPYRIGHT -2010-2014, Inktank Storage, Inc. and contributors. Licensed under Creative Commons BY-SA -.\" Generated by docutils manpage writer. -. 
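The three striping parameters in the rbd.8 STRIPING section above combine multiplicatively, which is easier to see with concrete numbers. A minimal shell sketch of the arithmetic; order=22 is the documented default, and the 65536/16 pair comes from the man page's own --stripe-unit example:

    #!/bin/sh
    # Striping math from rbd.8: objects are 2^order bytes; each stripe_unit
    # bytes go to one object before moving on to the next of stripe_count
    # objects, after which the stripe loops back to the first object.
    order=22
    object_size=$((1 << order))                   # 4194304 bytes (4 MB), the default
    stripe_unit=65536                             # bytes stored contiguously per object
    stripe_count=16                               # objects per stripe before looping back
    stripe_width=$((stripe_unit * stripe_count))  # 1048576 bytes span one full stripe
    echo "object size: ${object_size}, stripe width: ${stripe_width}"

With the defaults (stripe_unit equal to the object size, stripe_count of 1) a sequential 1 MB write lands in a single object; with the values above it is spread across 16 objects, which is the point of the "rbd -p mypool create myimage --size 102400 --stripe-unit 65536 --stripe-count 16" example in the EXAMPLES section.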
diff --git a/qa/qa_scripts/cephscrub.sh b/qa/qa_scripts/cephscrub.sh index d2cffdc24f89e..7bd16fd21a20b 100755 --- a/qa/qa_scripts/cephscrub.sh +++ b/qa/qa_scripts/cephscrub.sh @@ -26,7 +26,7 @@ sudo apt-get -y purge libcephfs1-dbg sudo apt-get -y purge libcephfs-dev sudo apt-get -y purge radosgw sudo apt-get -y purge radosgw-dbg -sudo apt-get -y purge rest-bench -sudo apt-get -y purge rest-bench-dbg sudo apt-get -y purge obsync -sudo apt-get -y purge python-ceph +sudo apt-get -y purge python-rados +sudo apt-get -y purge python-rbd +sudo apt-get -y purge python-cephfs diff --git a/qa/rgw/s3.sh b/qa/rgw/s3.sh deleted file mode 100755 index 69040b74f1211..0000000000000 --- a/qa/rgw/s3.sh +++ /dev/null @@ -1,174 +0,0 @@ -#!/bin/sh - - -origdir=`pwd` -# set -x - -load_credentials() { - if [ -e ~/.s3 ]; then - source ~/.s3 - else - echo "ERROR: Credentials not defined!" - exit 1 - fi -} - -if [ "$S3_ACCESS_KEY_ID" == "" ] || - [ "$S3_HOSTNAME" == "" ] || - [ "$S3_SECRET_ACCESS_KEY" == "" ]; then - load_credentials -fi - -bindir=${origdir}/libs3/build/bin -libdir=${origdir}/libs3/build/lib -log=${origdir}/s3.log -export LD_LIBRARY_PATH=${libdir} -s3=${bindir}/s3 - -tmp_bucket="test-`date +%s`" -tmpdir="tmp" - -cleanup() { - rm -fR libs3 tmp -} - -build() { - echo "Checking out source" - log git clone git://github.com/wido/libs3.git - echo "Building" - log make -C libs3 -} - -init() { - cleanup - build - mkdir -p tmp -} - -log() { - "$@" >> $log -} - -check_error() { - should_succeed=$1 - fail=`grep -c ERROR .cmd.log` - [ $fail -eq 0 ] && success=1 || success=0 - if [ $success -ne $should_succeed ]; then - [ $should_succeed -ne 0 ] && echo "Command failed:" - [ $should_succeed -eq 0 ] && echo "Command succeeded unexpectedly:" - echo "$op $params" - cat .cmd.log - exit 1 - fi -} - -do_op() { - should_succeed=$1 - shift - op=$1 - shift - params="$@" - echo "# $op" "$@" | tee -a $log - $op "$@" > .cmd.log 2>&1 - log cat .cmd.log - check_error $should_succeed -} - -run_s3() { - echo $s3 "$@" >> .cmd.log - $s3 "$@" -} - -create_bucket() { - bucket_name=$1 - - run_s3 create $bucket_name -} - -delete_bucket() { - bucket_name=$1 - - run_s3 delete $bucket_name -} - -create_file() { - filename=$1 - dd if=/dev/urandom of=$tmpdir/$filename bs=4096 count=2048 - run_s3 put $tmp_bucket/$filename filename=$tmpdir/$filename -} - -get_file() { - filename=$1 - dest_fname=$2 - run_s3 get $tmp_bucket/$filename filename=$tmpdir/$dest_fname - do_op 1 diff $tmpdir/$filename $tmpdir/$dest_fname - rm -f $tmpdir/foo.tmp -} - -get_acl() { - filename=$1 - dest_fname=$2 - run_s3 getacl $tmp_bucket/$filename filename=$tmpdir/$dest_fname -} - -set_acl() { - filename=$1 - src_fname=$2 - run_s3 setacl $tmp_bucket/$filename filename=$tmpdir/$src_fname -} - -delete_file() { - filename=$1 - run_s3 delete $tmp_bucket/$filename -} - -get_anon() { - should_succeed=$1 - bucket=$2 - fname=$3 - dest=$tmpdir/$4 - - echo "# get_anon $@" - - url="http://$bucket.$S3_HOSTNAME/$fname" - wget $url -O $dest > .cmd.log 2>&1 - res=$? 
- log cat .cmd.log - if [ $res -ne 0 ]; then - echo "ERROR: Could not fetch file anonymously (url=$url)" > .cmd.log - fi - check_error $should_succeed -} - -add_acl() { - filename=$1 - acl=$2 - echo $acl >> $tmpdir/$filename -} - -main() { - log echo "****************************************************************" - log echo "* `date`" >> $log - log echo "****************************************************************" - init - do_op 1 create_bucket $tmp_bucket - do_op 0 create_bucket $tmp_bucket - - do_op 1 create_file foo - do_op 1 get_file foo foo.tmp - - do_op 1 get_acl foo foo.acl - get_anon 0 $tmp_bucket foo foo.anon - add_acl foo.acl "Group All Users READ" - - do_op 1 set_acl foo foo.acl - get_anon 1 $tmp_bucket foo foo.anon - - do_op 1 delete_file foo - do_op 1 delete_bucket $tmp_bucket -} - - -main "$@" - - diff --git a/qa/run_xfstests-obsolete.sh b/qa/run_xfstests-obsolete.sh index 3f5e2eca9f507..9845d08d5d907 100644 --- a/qa/run_xfstests-obsolete.sh +++ b/qa/run_xfstests-obsolete.sh @@ -33,7 +33,7 @@ PROGNAME=$(basename $0) # xfstests is downloaded from this git repository and then built. # XFSTESTS_REPO="git://oss.sgi.com/xfs/cmds/xfstests.git" -XFSTESTS_REPO="git://ceph.com/git/xfstests.git" +XFSTESTS_REPO="git://git.ceph.com/xfstests.git" # Default command line option values COUNT="1" diff --git a/qa/run_xfstests.sh b/qa/run_xfstests.sh index f08c321edd4d9..f1ffc26be114a 100644 --- a/qa/run_xfstests.sh +++ b/qa/run_xfstests.sh @@ -34,6 +34,11 @@ PROGNAME=$(basename $0) # xfstests is downloaded from this git repository and then built. # XFSTESTS_REPO="git://oss.sgi.com/xfs/cmds/xfstests.git" XFSTESTS_REPO="git://ceph.com/git/xfstests.git" +XFSTESTS_VERSION="facff609afd6a2ca557c2b679e088982026aa188" +XFSPROGS_REPO="git://oss.sgi.com/xfs/cmds/xfsprogs" +XFSPROGS_VERSION="v3.2.2" +XFSDUMP_REPO="git://oss.sgi.com/xfs/cmds/xfsdump" +XFSDUMP_VERSION="v3.1.4" # Default command line option values COUNT="1" @@ -44,9 +49,8 @@ SCRATCH_DEV="" # MUST BE SPECIFIED TEST_DEV="" # MUST BE SPECIFIED TESTS="-g auto" # The "auto" group is supposed to be "known good" -# rbd presents geometry information that causes mkfs.xfs to -# issue a warning. This option avoids this class of problems. -XFS_MKFS_OPTIONS="-l su=32k" +# We no longer need to set the stripe unit in XFS_MKFS_OPTIONS because recent +# versions of mkfs.xfs autodetect it. 
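The *_REPO/*_VERSION pairs introduced above follow a single clone-then-checkout pattern so that every run builds identical sources. A hedged sketch of that pattern (clone_pinned is a hypothetical name, not a function in this script; the variables are the ones defined above):

    # Hypothetical helper showing the pin-and-checkout pattern that
    # install_xfstests, install_xfsprogs and install_xfsdump each repeat:
    function clone_pinned() {
        local repo=$1 version=$2 dest=$3
        git clone "${repo}" "${dest}" || return 1
        (cd "${dest}" && git checkout "${version}")
    }
    # e.g.: clone_pinned "${XFSPROGS_REPO}" "${XFSPROGS_VERSION}" xfsprogs

Pinning to an explicit commit or tag keeps test runs reproducible even when the upstream repositories move.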
# print an error message and quit with non-zero status function err() { @@ -248,11 +252,14 @@ export PATH="${TESTDIR}/binary/usr/local/sbin:${PATH}" ################################################################ # Filesystem-specific mkfs options--set if not supplied -export XFS_MKFS_OPTIONS="${XFS_MKFS_OPTIONS:--f -l su=65536}" +#export XFS_MKFS_OPTIONS="${XFS_MKFS_OPTIONS:--f -l su=65536}" export EXT4_MKFS_OPTIONS="${EXT4_MKFS_OPTIONS:--F}" export BTRFS_MKFS_OPTION # No defaults XFSTESTS_DIR="/var/lib/xfstests" # Where the tests live +XFSPROGS_DIR="/tmp/cephtest/xfsprogs-install" +XFSDUMP_DIR="/tmp/cephtest/xfsdump-install" +export PATH="${XFSPROGS_DIR}/sbin:${XFSDUMP_DIR}/sbin:${PATH}" # download, build, and install xfstests function install_xfstests() { @@ -266,6 +273,7 @@ function install_xfstests() { git clone "${XFSTESTS_REPO}" cd xfstests + git checkout "${XFSTESTS_VERSION}" ncpu=$(getconf _NPROCESSORS_ONLN 2>&1) [ -n "${ncpu}" -a "${ncpu}" -gt 1 ] && multiple="-j ${ncpu}" @@ -288,10 +296,12 @@ function remove_xfstests() { # create a host options file that uses the specified devices function setup_host_options() { arg_count 0 $# + export MNTDIR="/tmp/cephtest" # Create mount points for the test and scratch filesystems - local test_dir="$(mktemp -d ${TESTDIR}/test_dir.XXXXXXXXXX)" - local scratch_dir="$(mktemp -d ${TESTDIR}/scratch_mnt.XXXXXXXXXX)" + mkdir -p ${MNTDIR} + local test_dir="$(mktemp -d ${MNTDIR}/test_dir.XXXXXXXXXX)" + local scratch_dir="$(mktemp -d ${MNTDIR}/scratch_mnt.XXXXXXXXXX)" # Write a host options file that uses these devices. # xfstests uses the file defined by HOST_OPTIONS as the @@ -393,13 +403,70 @@ function cleanup_xfstests() { # the corresponding setup function mounted them...) do_umount "${TEST_DEV}" do_umount "${SCRATCH_DEV}" + rmdir "${TEST_DIR}" + rmdir "${SCRATCH_MNT}" + rmdir "${MNTDIR}" } +function install_xfsprogs() { + arg_count 0 $# + + pushd "${TESTDIR}" + git clone ${XFSPROGS_REPO} + cd xfsprogs + git checkout ${XFSPROGS_VERSION} + libtoolize -c `libtoolize -n -i >/dev/null 2>/dev/null && echo -i` -f + cp include/install-sh . + aclocal -I m4 + autoconf + ./configure --prefix=${XFSPROGS_DIR} + make install + popd +} + +function install_xfsdump() { + arg_count 0 $# + + pushd "${TESTDIR}" + git clone ${XFSDUMP_REPO} + cd xfsdump + git checkout ${XFSDUMP_VERSION} + + # somebody took #define min and #define max out, which breaks the build on + # ubuntu. we back out this commit here, though that may cause problems with + # this script down the line. + git revert -n 5a2985233c390d59d2a9757b119cb0e001c87a96 + libtoolize -c `libtoolize -n -i >/dev/null 2>/dev/null && echo -i` -f + cp include/install-sh . 
+ aclocal -I m4 + autoconf + ./configure --prefix=${XFSDUMP_DIR} + (make -k install || true) # that's right, the install process is broken too + popd +} + +function remove_xfsprogs() { + arg_count 0 $# + + rm -rf ${TESTDIR}/xfsprogs + rm -rf ${XFSPROGS_DIR} +} + +function remove_xfsdump() { + arg_count 0 $# + + rm -rf ${TESTDIR}/xfsdump + rm -rf ${XFSDUMP_DIR} +} + + # top-level setup routine function setup() { arg_count 0 $# setup_host_options + install_xfsprogs + install_xfsdump install_xfstests setup_xfstests } @@ -409,6 +476,8 @@ function cleanup() { arg_count 0 $# cd / + remove_xfsprogs + remove_xfsdump cleanup_xfstests remove_xfstests cleanup_host_options diff --git a/qa/run_xfstests_krbd.sh b/qa/run_xfstests_krbd.sh index 4e724d92d6443..72c6df9e9ba80 100644 --- a/qa/run_xfstests_krbd.sh +++ b/qa/run_xfstests_krbd.sh @@ -8,9 +8,12 @@ set -x [ -n "${TESTDIR}" ] || export TESTDIR="/tmp/cephtest" [ -d "${TESTDIR}" ] || mkdir "${TESTDIR}" -URL_BASE="https://ceph.com/git/?p=ceph.git;a=blob_plain;f=qa" SCRIPT="run_xfstests.sh" +if [ -z "${URL_BASE}" ]; then + URL_BASE="https://ceph.com/git/?p=ceph.git;a=blob_plain;f=qa" +fi + cd "${TESTDIR}" wget -O "${SCRIPT}" "${URL_BASE}/${SCRIPT}" @@ -23,37 +26,58 @@ cat > "${EXPUNGE}" <<-! # wasn't run - like 'mv', but wasn't specifically excluded # new test - didn't exist in the xfstests version that was # used by the old version of this script + + generic/038 + generic/042 # zeroes out only the last 4k of test file, but expects + # only zeros in the entire file. bug in test? + generic/046 # _count_extents in common/rc assumes backticks do not + # remove newlines. This breaks parsing on some + # platforms. + generic/050 # blockdev --setro right after mkfs returns EBUSY + generic/078 # RENAME_WHITEOUT was enabled in kernel commit 7dcf5c, but causes + # a BUG for now + generic/081 # ubuntu lvm2 doesn't suport --yes argument + generic/083 # mkfs.xfs -dxize=104857600,agcount=6 fails + # when sunit=swidth=8192 + generic/093 # not for Linux + generic/097 # not for Linux + generic/099 # not for Linux + generic/204 # stripe size throws off test's math for when to + # expect ENOSPC + generic/231 # broken for disk and rbd by xfs kernel commit 4162bb - generic/062 # mv - generic/083 # mv - generic/127 # mv - generic/204 # mv - generic/306 # new test + shared/272 # not for xfs + shared/289 # not for xfs - xfs/007 # new test - xfs/008 # mv, see 2db20d972125 - xfs/030 # mv - xfs/042 # mv - xfs/073 # mv - xfs/096 # mv - xfs/104 # mv - xfs/109 # mv - xfs/170 # mv - xfs/178 # mv - xfs/200 # mv - xfs/206 # mv - xfs/229 # mv - xfs/242 # mv - xfs/250 # mv - xfs/279 # wasn't run - xfs/287 # wasn't run - xfs/291 # wasn't run - xfs/292 # wasn't run - xfs/293 # wasn't run - xfs/295 # wasn't run - xfs/296 # wasn't run - xfs/301 # new test - xfs/302 # new test + xfs/007 # sector size math + xfs/030 # mkfs.xfs -dsize=100m,agcount=6 fails + # when sunit=swidth=8192 + xfs/032 # xfs_copy cleans up with pthread_kill (RHBA-2015-0537) + xfs/042 # stripe size throws off test's math when filling FS + xfs/051 + xfs/057 # test for IRIX + xfs/058 # test for IRIX + xfs/069 # _filter_bmap in common/punch parses incorrectly if + # blocks are not stripe-aligned + xfs/070 # extra output from xfs_repair + xfs/071 # xfs_repair issue on large offsets (RHBA-2015-0537) + xfs/073 + xfs/081 # very small mkfs breaks test with sunit=swidth-8192 + xfs/095 # not for Linux + xfs/096 # checks various mkfs options and chokes on sunit/swidth + xfs/104 # can't suppress sunit/swidth warnings on mkfs + xfs/109 
# can't suppress sunit/swidth warnings on mkfs + xfs/167 + xfs/178 # test explicitly checks for stripe width of 0 + xfs/191 # tests NFSv4 + xfs/197 # tests 32-bit machines + xfs/205 # very small mkfs breaks tests with sunit=swidth=8192 + xfs/242 # _filter_bmap in common/punch parses incorrectly if + # blocks are not stripe-aligned + xfs/261 # bug in mount_xfs involving creation of new quota files + xfs/279 # sector size math (logical v. physical: BZ836433?) + xfs/297 # XXX: temporarily expunged due to length + xfs/300 # SELinux ! ./"${SCRIPT}" -x "$(readlink -f "${EXPUNGE}")" "$@"
diff --git a/qa/workunits/ceph-deploy/ceph-deploy_hello_world.sh b/qa/workunits/ceph-deploy/ceph-deploy_hello_world.sh new file mode 100755 index 0000000000000..30e74cce5b56d --- /dev/null +++ b/qa/workunits/ceph-deploy/ceph-deploy_hello_world.sh @@ -0,0 +1,13 @@ +#!/bin/sh -e + +#check ceph health +ceph -s +#list pools +rados lspools +#list rbd images +rbd ls +#check that the monitors work +ceph osd set nodown +ceph osd unset nodown + +exit 0
diff --git a/qa/workunits/ceph-helpers.sh b/qa/workunits/ceph-helpers.sh new file mode 100755 index 0000000000000..de44b239f1fb1 --- /dev/null +++ b/qa/workunits/ceph-helpers.sh @@ -0,0 +1,1208 @@ +#!/bin/bash +# +# Copyright (C) 2013,2014 Cloudwatt +# Copyright (C) 2014,2015 Red Hat +# Copyright (C) 2014 Federico Gimenez +# +# Author: Loic Dachary +# Author: Federico Gimenez +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library Public License for more details. +# +TIMEOUT=120 +PG_NUM=4 + +if type xmlstarlet > /dev/null 2>&1; then + XMLSTARLET=xmlstarlet +elif type xml > /dev/null 2>&1; then + XMLSTARLET=xml +else + echo "Missing xmlstarlet binary!" + exit 1 +fi + +#! @file ceph-helpers.sh +# @brief Toolbox to manage a Ceph cluster dedicated to testing +# +# Example use case: +# +# ~~~~~~~~~~~~~~~~{.sh} +# source ceph-helpers.sh +# +# function mytest() { +# # cleanup leftovers and reset mydir +# setup mydir +# # create a cluster with one monitor and three osds +# run_mon mydir a +# run_osd mydir 0 +# run_osd mydir 2 +# run_osd mydir 3 +# # put and get an object +# rados --pool rbd put GROUP /etc/group +# rados --pool rbd get GROUP /tmp/GROUP +# # stop the cluster and cleanup the directory +# teardown mydir +# } +# ~~~~~~~~~~~~~~~~ +# +# The focus is on simplicity and efficiency, in the context of +# functional tests. The output is intentionally very verbose +# and functions return as soon as an error is found. The caller +# is also expected to abort on the first error so that debugging +# can be done by looking at the end of the output. +# +# Each function is documented, implemented and tested independently. +# When modifying a helper, the test and the documentation are +# expected to be updated and it is easier if they are collocated. A +# test for a given function can be run with +# +# ~~~~~~~~~~~~~~~~{.sh} +# ceph-helpers.sh TESTS test_get_osds +# ~~~~~~~~~~~~~~~~ +# +# and all the tests (i.e.
all functions matching test_*) are run +# with: +# +# ~~~~~~~~~~~~~~~~{.sh} +# ceph-helpers.sh TESTS +# ~~~~~~~~~~~~~~~~ +# +# A test function takes a single argument: the directory dedicated +# to the tests. It is expected to not create any file outside of this +# directory and remove it entirely when it completes successfully. +# + + +## +# Clean up any leftovers found in **dir** via **teardown** +# and reset **dir** as an empty environment. +# +# @param dir path name of the environment +# @return 0 on success, 1 on error +# +function setup() { + local dir=$1 + teardown $dir || return 1 + mkdir -p $dir +} + +function test_setup() { + local dir=$1 + setup $dir || return 1 + test -d $dir || return 1 + setup $dir || return 1 + test -d $dir || return 1 + teardown $dir +} + +####################################################################### + +## +# Kill all daemons for which a .pid file exists in **dir** and remove +# **dir**. If the file system in which **dir** resides is btrfs, delete all +# subvolumes that relate to it. +# +# @param dir path name of the environment +# @return 0 on success, 1 on error +# +function teardown() { + local dir=$1 + kill_daemons $dir + if [ $(stat -f -c '%T' .) == "btrfs" ]; then + __teardown_btrfs $dir + fi + rm -fr $dir +} + +function __teardown_btrfs() { + local btrfs_base_dir=$1 + + btrfs_dirs=`ls -l $btrfs_base_dir | egrep '^d' | awk '{print $9}'` + for btrfs_dir in $btrfs_dirs + do + btrfs_subdirs=`ls -l $btrfs_base_dir/$btrfs_dir | egrep '^d' | awk '{print $9}'` + for btrfs_subdir in $btrfs_subdirs + do + btrfs subvolume delete $btrfs_base_dir/$btrfs_dir/$btrfs_subdir + done + done +} + +function test_teardown() { + local dir=$1 + setup $dir || return 1 + teardown $dir || return 1 + ! test -d $dir || return 1 +} + +####################################################################### + +## +# Kill all daemons for which a .pid file exists in **dir**. Each +# daemon is sent a **signal** and kill_daemons waits up to a few +# minutes for it to exit. By default all daemons are killed. If a +# **name_prefix** is provided, only the daemons matching it are +# killed. +# +# Send KILL to all daemons : kill_daemons $dir +# Send TERM to all daemons : kill_daemons $dir TERM +# Send TERM to all osds : kill_daemons $dir TERM osd +# +# If a daemon is sent the TERM signal and does not terminate +# within a few minutes, it will still be running even after +# kill_daemons returns. +# +# If all daemons are killed successfully the function returns 0; +# if at least one daemon remains, this is treated as an +# error and the function returns 1. +# +# After the daemon is sent **signal**, its actual termination +# will be verified by sending it signal 0. If the daemon is +# still alive, kill_daemons will pause for a few seconds and +# try again. This will repeat for a fixed number of times +# before kill_daemons returns on failure. The list of +# sleep intervals can be specified as **delays** and defaults +# to: +# +# 0 0 1 1 1 2 3 5 5 5 10 10 20 60 +# +# This sequence is designed to not require a sleep time (0) if the +# machine is fast enough and the daemon terminates in a fraction of a +# second. The increasing sleep numbers should give plenty of time for +# the daemon to die even on the slowest running machine. If a daemon +# takes more than two minutes to stop (the sum of all sleep times), +# there probably is no point in waiting more and a number of things +# are likely to go wrong anyway: better give up and return on error.
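+#
+# (Worked sum: the default delays 0+0+1+1+1+2+3+5+5+5+10+10+20+60 add up
+# to 123 seconds per daemon, which is the roughly-two-minute ceiling
+# described above.)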
+# +# @param dir path name of the environment +# @param signal name of the first signal (defaults to KILL) +# @param name_prefix only kill matching daemons (defaults to all) +# @param delays sequence of sleep times before failure +# @return 0 on success, 1 on error +# +function kill_daemons() { + local trace=$(shopt -q -o xtrace && echo true || echo false) + $trace && shopt -u -o xtrace + local dir=$1 + local signal=${2:-KILL} + local name_prefix=$3 # optional, osd, mon, osd.1 + local delays=${4:-0 0 1 1 1 2 3 5 5 5 10 10 20 60} + + local status=0 + for pidfile in $(find $dir 2>/dev/null | grep $name_prefix'[^/]*\.pid') ; do + pid=$(cat $pidfile) + local send_signal=$signal + local kill_complete=false + for try in $delays ; do + sleep $try + if kill -$send_signal $pid 2> /dev/null ; then + kill_complete=false + else + kill_complete=true + break + fi + send_signal=0 + done + if ! $kill_complete ; then + status=1 + fi + done + $trace && shopt -s -o xtrace + return $status +} + +function test_kill_daemons() { + local dir=$1 + setup $dir || return 1 + run_mon $dir a --osd_pool_default_size=1 || return 1 + run_osd $dir 0 || return 1 + # sending signal 0 won't kill the daemon + # waiting just for one second instead of the default schedule + # allows us to quickly verify what happens when kill fails + # to stop the daemon (i.e. it must return false) + ! kill_daemons $dir 0 osd 1 || return 1 + kill_daemons $dir TERM osd || return 1 + ceph osd dump | grep "osd.0 down" || return 1 + kill_daemons $dir TERM || return 1 + ! ceph --connect-timeout 1 status || return 1 + teardown $dir || return 1 +} + +####################################################################### + +## +# Run a monitor by the name mon.**id** with data in **dir**/**id**. +# The logs can be found in **dir**/mon.**id**.log and the pid file +# is **dir**/mon.**id**.pid and the admin socket is +# **dir**/**id**/ceph-mon.**id**.asok. +# +# The remaining arguments are passed verbatim to ceph-mon --mkfs +# and the ceph-mon daemon. +# +# Two mandatory arguments must be provided: --fsid and --mon-host. +# Instead of adding them to every call to run_mon, they can be +# set in the CEPH_ARGS environment variable to be read implicitly +# by every ceph command. +# +# The CEPH_CONF variable is expected to be set to /dev/null to +# only rely on arguments for configuration. +# +# Examples: +# +# CEPH_ARGS="--fsid=$(uuidgen) " +# CEPH_ARGS+="--mon-host=127.0.0.1:7018 " +# run_mon $dir a # spawn a mon and bind port 7018 +# run_mon $dir a --debug-filestore=20 # spawn with filestore debugging +# +# If mon_initial_members is not set, the default rbd pool is deleted +# and replaced with a replicated pool with fewer placement groups to +# speed up initialization. If mon_initial_members is set, no attempt +# is made to recreate the rbd pool because it would hang forever, +# waiting for other mons to join. +# +# A **dir**/ceph.conf file is created but not meant to be used by any +# function. It is convenient for debugging a failure with: +# +# ceph --conf **dir**/ceph.conf -s +# +# @param dir path name of the environment +# @param id mon identifier +# @param ...
can be any option valid for ceph-mon +# @return 0 on success, 1 on error +# +function run_mon() { + local dir=$1 + shift + local id=$1 + shift + local data=$dir/$id + + ceph-mon \ + --id $id \ + --mkfs \ + --mon-data=$data \ + --run-dir=$dir \ + "$@" || return 1 + + ceph-mon \ + --id $id \ + --mon-osd-full-ratio=.99 \ + --mon-data-avail-crit=1 \ + --paxos-propose-interval=0.1 \ + --osd-crush-chooseleaf-type=0 \ + --osd-pool-default-erasure-code-directory=.libs \ + --debug-mon 20 \ + --debug-ms 20 \ + --debug-paxos 20 \ + --chdir= \ + --mon-data=$data \ + --log-file=$dir/\$name.log \ + --admin-socket=$dir/\$cluster-\$name.asok \ + --mon-cluster-log-file=$dir/log \ + --run-dir=$dir \ + --pid-file=$dir/\$name.pid \ + "$@" || return 1 + + cat > $dir/ceph.conf </dev/null | \ + $XMLSTARLET sel -t -m "//acting/osd" -v . -o ' ') + # get rid of the trailing space + echo $osds +} + +function test_get_osds() { + local dir=$1 + + setup $dir || return 1 + run_mon $dir a --osd_pool_default_size=2 || return 1 + run_osd $dir 0 || return 1 + run_osd $dir 1 || return 1 + wait_for_clean || return 1 + get_osds rbd GROUP | grep --quiet '^[0-1] [0-1]$' || return 1 + teardown $dir || return 1 +} + +####################################################################### + +## +# Return the PG of supporting the **objectname** stored in +# **poolname**, as reported by ceph osd map. +# +# @param poolname an existing pool +# @param objectname an objectname (may or may not exist) +# @param STDOUT a PG +# @return 0 on success, 1 on error +# +function get_pg() { + local poolname=$1 + local objectname=$2 + + ceph --format xml osd map $poolname $objectname 2>/dev/null | \ + $XMLSTARLET sel -t -m "//pgid" -v . -n +} + +function test_get_pg() { + local dir=$1 + + setup $dir || return 1 + run_mon $dir a --osd_pool_default_size=1 || return 1 + run_osd $dir 0 || return 1 + wait_for_clean || return 1 + get_pg rbd GROUP | grep --quiet '^[0-9]\.[0-9a-f][0-9a-f]*$' || return 1 + teardown $dir || return 1 +} + +####################################################################### + +## +# Return the value of the **config**, obtained via the config get command +# of the admin socket of **daemon**.**id**. +# +# @param daemon mon or osd +# @param id mon or osd ID +# @param config the configuration variable name as found in config_opts.h +# @param STDOUT the config value +# @return 0 on success, 1 on error +# +function get_config() { + local daemon=$1 + local id=$2 + local config=$3 + + CEPH_ARGS='' \ + ceph --format xml daemon $dir/ceph-$daemon.$id.asok \ + config get $config 2> /dev/null | \ + $XMLSTARLET sel -t -m "//$config" -v . 
-n +} + +function test_get_config() { + local dir=$1 + + # override the default config using command line arg and check it + setup $dir || return 1 + run_mon $dir a --osd_pool_default_size=1 || return 1 + test $(get_config mon a osd_pool_default_size) = 1 || return 1 + run_osd $dir 0 --osd_max_scrubs=3 || return 1 + test $(get_config osd 0 osd_max_scrubs) = 3 || return 1 + teardown $dir || return 1 +} + +## +# Set the **config** to the specified **value**, via the config set command +# of the admin socket of **daemon**.**id**. +# +# @param daemon mon or osd +# @param id mon or osd ID +# @param config the configuration variable name as found in config_opts.h +# @param value the config value +# @return 0 on success, 1 on error +# +function set_config() { + local daemon=$1 + local id=$2 + local config=$3 + local value=$4 + + CEPH_ARGS='' \ + ceph --format xml daemon $dir/ceph-$daemon.$id.asok \ + config set $config $value 2> /dev/null | \ + $XMLSTARLET sel -Q -t -m "//success" -v . +} + +function test_set_config() { + local dir=$1 + + setup $dir || return 1 + run_mon $dir a --osd_pool_default_size=1 || return 1 + test $(get_config mon a ms_crc_header) = true || return 1 + set_config mon a ms_crc_header false || return 1 + test $(get_config mon a ms_crc_header) = false || return 1 + set_config mon a ms_crc_header true || return 1 + test $(get_config mon a ms_crc_header) = true || return 1 + teardown $dir || return 1 +} + +####################################################################### + +## +# Return the OSD id of the primary OSD supporting the **objectname** +# stored in **poolname**, as reported by ceph osd map. +# +# @param poolname an existing pool +# @param objectname an objectname (may or may not exist) +# @param STDOUT the primary OSD id +# @return 0 on success, 1 on error +# +function get_primary() { + local poolname=$1 + local objectname=$2 + + ceph --format xml osd map $poolname $objectname 2>/dev/null | \ + $XMLSTARLET sel -t -m "//acting_primary" -v . -n +} + +function test_get_primary() { + local dir=$1 + + setup $dir || return 1 + run_mon $dir a --osd_pool_default_size=1 || return 1 + local osd=0 + run_osd $dir $osd || return 1 + wait_for_clean || return 1 + test $(get_primary rbd GROUP) = $osd || return 1 + teardown $dir || return 1 +} + +####################################################################### + +## +# Return the id of any OSD supporting the **objectname** stored in +# **poolname**, as reported by ceph osd map, except the primary. +# +# @param poolname an existing pool +# @param objectname an objectname (may or may not exist) +# @param STDOUT the OSD id +# @return 0 on success, 1 on error +# +function get_not_primary() { + local poolname=$1 + local objectname=$2 + + local primary=$(get_primary $poolname $objectname) + ceph --format xml osd map $poolname $objectname 2>/dev/null | \ + $XMLSTARLET sel -t -m "//acting/osd[not(.='$primary')]" -v . -n | \ + head -1 +} + +function test_get_not_primary() { + local dir=$1 + + setup $dir || return 1 + run_mon $dir a --osd_pool_default_size=2 || return 1 + run_osd $dir 0 || return 1 + run_osd $dir 1 || return 1 + wait_for_clean || return 1 + local primary=$(get_primary rbd GROUP) + local not_primary=$(get_not_primary rbd GROUP) + test $not_primary != $primary || return 1 + test $not_primary = 0 -o $not_primary = 1 || return 1 + teardown $dir || return 1 +} + +####################################################################### + +## +# Run ceph-objectstore-tool against the OSD **id** using the data path +# **dir**.
The OSD is killed with TERM prior to running +# ceph-objectstore-tool because access to the data path is +# exclusive. The OSD is restarted after the command completes. The +# objectstore_tool function returns after all PGs are active+clean again. +# +# @param dir the data path of the OSD +# @param id the OSD id +# @param ... arguments to ceph-objectstore-tool +# @param STDIN the input of ceph-objectstore-tool +# @param STDOUT the output of ceph-objectstore-tool +# @return 0 on success, 1 on error +# +function objectstore_tool() { + local dir=$1 + shift + local id=$1 + shift + local osd_data=$dir/$id + + kill_daemons $dir TERM osd.$id >&2 < /dev/null || return 1 + ceph-objectstore-tool \ + --data-path $osd_data \ + --journal-path $osd_data/journal \ + "$@" || return 1 + activate_osd $dir $id >&2 || return 1 + wait_for_clean >&2 +} + +function test_objectstore_tool() { + local dir=$1 + + setup $dir || return 1 + run_mon $dir a --osd_pool_default_size=1 || return 1 + local osd=0 + run_osd $dir $osd || return 1 + wait_for_clean || return 1 + rados --pool rbd put GROUP /etc/group || return 1 + objectstore_tool $dir $osd GROUP get-bytes | \ + diff - /etc/group + ! objectstore_tool $dir $osd NOTEXISTS get-bytes || return 1 + teardown $dir || return 1 +} + +####################################################################### + +## +# Predicate checking if there is an ongoing recovery in the +# cluster. If any of the recovering_{keys,bytes,objects}_per_sec +# counters are reported by ceph status, it means recovery is in +# progress. +# +# @return 0 if recovery in progress, 1 otherwise +# +function get_is_making_recovery_progress() { + local progress=$(ceph --format xml status 2>/dev/null | \ + $XMLSTARLET sel \ + -t -m "//pgmap/recovering_keys_per_sec" -v . -o ' ' \ + -t -m "//pgmap/recovering_bytes_per_sec" -v . -o ' ' \ + -t -m "//pgmap/recovering_objects_per_sec" -v .) + test -n "$progress" +} + +function test_get_is_making_recovery_progress() { + local dir=$1 + + setup $dir || return 1 + run_mon $dir a || return 1 + ! get_is_making_recovery_progress || return 1 + teardown $dir || return 1 +} + +####################################################################### + +## +# Return the number of active and clean PGs in the cluster. A PG is +# counted if ceph pg dump pgs reports it as both **active** and +# **clean** and not **stale**. +# +# @param STDOUT the number of active and clean PGs +# @return 0 on success, 1 on error +# +function get_num_active_clean() { + local expression="(" + expression+="contains(.,'active') and " + expression+="contains(.,'clean') and " + expression+="not(contains(.,'stale'))" + expression+=")" + # xmlstarlet 1.3.0 (which is on Ubuntu precise) + # add extra new lines that must be ignored with + # grep -v '^$' + ceph --format xml pg dump pgs 2>/dev/null | \ + $XMLSTARLET sel -t -m "//pg_stat/state[$expression]" -v . -n | \ + grep -v '^$' | wc -l +} + +function test_get_num_active_clean() { + local dir=$1 + + setup $dir || return 1 + run_mon $dir a --osd_pool_default_size=1 || return 1 + run_osd $dir 0 || return 1 + wait_for_clean || return 1 + local num_active_clean=$(get_num_active_clean) + test "$num_active_clean" = $PG_NUM || return 1 + teardown $dir || return 1 +} + +####################################################################### + +## +# Return the number of PGs in the cluster, according to +# the pgmap reported by ceph status.
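+# (For reference, the pg dump parsers in this file, get_num_active_clean
+# above and get_last_scrub_stamp below, match XML shaped roughly like the
+# following sketch; this is illustrative only, the real output carries
+# many more fields:
+#
+#   <pg_stat>
+#     <pgid>1.0</pgid>
+#     <state>active+clean</state>
+#     <last_scrub_stamp>2015-04-27 10:00:00.000000</last_scrub_stamp>
+#   </pg_stat>
+# )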
+# +# @param STDOUT the number of PGs +# @return 0 on success, 1 on error +# +function get_num_pgs() { + ceph --format xml status 2>/dev/null | \ + $XMLSTARLET sel -t -m "//pgmap/num_pgs" -v . +} + +function test_get_num_pgs() { + local dir=$1 + + setup $dir || return 1 + run_mon $dir a --osd_pool_default_size=1 || return 1 + run_osd $dir 0 || return 1 + wait_for_clean || return 1 + local num_pgs=$(get_num_pgs) + test "$num_pgs" -gt 0 || return 1 + teardown $dir || return 1 +} + +####################################################################### + +## +# Return the date and time of the last completed scrub for **pgid**, +# as reported by ceph pg dump pgs. Note that a repair also sets this +# date. +# +# @param pgid the id of the PG +# @param STDOUT the date and time of the last scrub +# @return 0 on success, 1 on error +# +function get_last_scrub_stamp() { + local pgid=$1 + ceph --format xml pg dump pgs 2>/dev/null | \ + $XMLSTARLET sel -t -m "//pg_stat[pgid='$pgid']/last_scrub_stamp" -v . +} + +function test_get_last_scrub_stamp() { + local dir=$1 + + setup $dir || return 1 + run_mon $dir a --osd_pool_default_size=1 || return 1 + run_osd $dir 0 || return 1 + wait_for_clean || return 1 + stamp=$(get_last_scrub_stamp 1.0) + test -n "$stamp" || return 1 + teardown $dir || return 1 +} + +####################################################################### + +## +# Predicate checking if the cluster is clean, i.e. all of its PGs are +# in a clean state (see get_num_active_clean for a definition). +# +# @return 0 if the cluster is clean, 1 otherwise +# +function is_clean() { + num_pgs=$(get_num_pgs) + test $num_pgs != 0 || return 1 + test $(get_num_active_clean) = $num_pgs || return 1 +} + +function test_is_clean() { + local dir=$1 + + setup $dir || return 1 + run_mon $dir a --osd_pool_default_size=1 || return 1 + run_osd $dir 0 || return 1 + ! is_clean || return 1 + wait_for_clean || return 1 + is_clean || return 1 + teardown $dir || return 1 +} + +####################################################################### + +## +# Wait until the cluster becomes clean, or fail if it does not make +# progress for $TIMEOUT seconds. The function **is_clean** is used to +# determine if the cluster is clean. Progress is measured either via +# the **get_is_making_recovery_progress** predicate or by a change in +# the number of clean PGs. +# +# @return 0 if the cluster is clean, 1 otherwise +# +function wait_for_clean() { + local status=1 + local num_active_clean=$(get_num_active_clean) + local cur_active_clean + local -i timer=0 + while ! is_clean ; do + if get_is_making_recovery_progress ; then + timer=0 + elif (( timer >= $TIMEOUT )) ; then + ceph report + return 1 + fi + + cur_active_clean=$(get_num_active_clean) + if test $cur_active_clean != $num_active_clean ; then + timer=0 + num_active_clean=$cur_active_clean + fi + sleep 1 + timer=$(expr $timer + 1) + done + return 0 +} + +function test_wait_for_clean() { + local dir=$1 + + setup $dir || return 1 + run_mon $dir a --osd_pool_default_size=1 || return 1 + ! TIMEOUT=1 wait_for_clean || return 1 + run_osd $dir 0 || return 1 + wait_for_clean || return 1 + teardown $dir || return 1 +} + +####################################################################### + +## +# Run repair on **pgid** and wait until it completes. The repair +# function will fail if repair does not complete within $TIMEOUT +# seconds.
The repair is complete whenever the +# **get_last_scrub_stamp** function reports a timestamp different from +# the one stored before starting the repair. +# +# @param pgid the id of the PG +# @return 0 on success, 1 on error +# +function repair() { + local pgid=$1 + local last_scrub=$(get_last_scrub_stamp $pgid) + + ceph pg repair $pgid + for ((i=0; i < $TIMEOUT; i++)); do + if test "$last_scrub" != "$(get_last_scrub_stamp $pgid)" ; then + return 0 + fi + sleep 1 + done + return 1 +} + +function test_repair() { + local dir=$1 + + setup $dir || return 1 + run_mon $dir a --osd_pool_default_size=1 || return 1 + run_osd $dir 0 || return 1 + wait_for_clean || return 1 + repair 1.0 || return 1 + kill_daemons $dir KILL osd || return 1 + ! TIMEOUT=1 repair 1.0 || return 1 + teardown $dir || return 1 +} + +####################################################################### + +## +# Run the *command* and expect it to fail (i.e. return a non-zero status). +# The output (stderr and stdout) is stored in a temporary file in *dir* +# and is expected to contain the string *expected*. +# +# Return 0 if the command failed and the string was found. Otherwise +# return 1 and cat the full output of the command on stderr for debugging. +# +# @param dir temporary directory to store the output +# @param expected string to look for in the output +# @param command ... the command and its arguments +# @return 0 on success, 1 on error +# + +function expect_failure() { + local dir=$1 + shift + local expected="$1" + shift + local success + + if "$@" > $dir/out 2>&1 ; then + success=true + else + success=false + fi + + if $success || ! grep --quiet "$expected" $dir/out ; then + cat $dir/out >&2 + return 1 + else + return 0 + fi +} + +function test_expect_failure() { + local dir=$1 + + setup $dir || return 1 + expect_failure $dir FAIL bash -c 'echo FAIL ; exit 1' || return 1 + # the command did not fail + ! expect_failure $dir FAIL bash -c 'echo FAIL ; exit 0' > $dir/out || return 1 + grep --quiet FAIL $dir/out || return 1 + # the command failed but the output does not contain the expected string + ! expect_failure $dir FAIL bash -c 'echo UNEXPECTED ; exit 1' > $dir/out || return 1 + ! grep --quiet FAIL $dir/out || return 1 + teardown $dir || return 1 +} + +####################################################################### + +## +# Return 0 if the erasure code *plugin* is available, 1 otherwise. +# +# @param plugin erasure code plugin +# @return 0 on success, 1 on error +# + +function erasure_code_plugin_exists() { + local plugin=$1 + + local status + if ceph osd erasure-code-profile set TESTPROFILE plugin=$plugin 2>&1 | + grep "$plugin.*No such file" ; then + status=1 + else + status=0 + ceph osd erasure-code-profile rm TESTPROFILE + fi + return $status +} + +function test_erasure_code_plugin_exists() { + local dir=$1 + + setup $dir || return 1 + run_mon $dir a || return 1 + erasure_code_plugin_exists jerasure || return 1 + ! erasure_code_plugin_exists FAKE || return 1 + teardown $dir || return 1 +} + +####################################################################### + +## +# Call the **run** function (which must be defined by the caller) with +# the **dir** argument followed by the caller argument list. +# +# The **teardown** function is called when the **run** function returns +# (on success or on error), to clean up leftovers. CEPH_CONF is set +# to /dev/null and CEPH_ARGS is unset so that the tests are protected from +# external interference.
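+#
+# A minimal caller might look like the following sketch (a hypothetical
+# test file, not part of this patch; it only uses helpers that the tests
+# above also call):
+#
+#   source $(dirname $0)/ceph-helpers.sh
+#
+#   function run() {
+#       local dir=$1
+#       setup $dir || return 1
+#       run_mon $dir a --osd_pool_default_size=1 || return 1
+#       run_osd $dir 0 || return 1
+#       wait_for_clean || return 1
+#       teardown $dir || return 1
+#   }
+#
+#   main example "$@"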
+# +# It is the responsibility of the **run** function to call the +# **setup** function to prepare the test environment (create a temporary +# directory etc.). +# +# The shell is configured (via PS4) to display the source file, function +# and line number whenever a statement is executed, to help debugging. +# +# @param dir directory in which all data is stored +# @param ... arguments passed transparently to **run** +# @return 0 on success, 1 on error +# +function main() { + local dir=testdir/$1 + shift + + shopt -s -o xtrace + PS4='${BASH_SOURCE[0]}:$LINENO: ${FUNCNAME[0]}: ' + + export PATH=:$PATH # make sure programs from sources are preferred + + export CEPH_CONF=/dev/null + unset CEPH_ARGS + + local code + if run $dir "$@" ; then + code=0 + else + code=1 + fi + teardown $dir || return 1 + return $code +} + +####################################################################### + +function run_tests() { + shopt -s -o xtrace + PS4='${BASH_SOURCE[0]}:$LINENO: ${FUNCNAME[0]}: ' + + export PATH=":$PATH" + export CEPH_MON="127.0.0.1:7109" + export CEPH_ARGS + CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none " + CEPH_ARGS+="--mon-host=$CEPH_MON " + export CEPH_CONF=/dev/null + + local funcs=${@:-$(set | sed -n -e 's/^\(test_[0-9a-z_]*\) .*/\1/p')} + local dir=testdir/ceph-helpers + + for func in $funcs ; do + $func $dir || return 1 + done +} + +if test "$1" = TESTS ; then + shift + run_tests "$@" +fi + +# Local Variables: +# compile-command: "cd ../../src ; make -j4 && ../qa/workunits/ceph-helpers.sh TESTS # test_get_config" +# End: diff --git a/qa/workunits/cephtool/test.sh b/qa/workunits/cephtool/test.sh index b333984bcee81..f84031aa44f07 100755 --- a/qa/workunits/cephtool/test.sh +++ b/qa/workunits/cephtool/test.sh @@ -1,8 +1,10 @@ #!/bin/bash -x +source $(dirname $0)/../ceph-helpers.sh + set -e set -o functrace -PS4=' ${FUNCNAME[0]}: $LINENO: ' +PS4='${BASH_SOURCE[0]}:$LINENO: ${FUNCNAME[0]}: ' SUDO=${SUDO:-sudo} function check_no_osd_down() @@ -23,23 +25,6 @@ function wait_no_osd_down() check_no_osd_down } -function get_pg() -{ - local pool obj map_output pg - pool=$1 - obj=$2 - declare -a map_output - map_output=($(ceph osd map $1 $2)) - for (( i=0; i<${#map_output[*]}; i++ )) ; do - if [ "${map_output[$i]}" == "pg" ] ; then - pg=${map_output[((i+2))]} - break - fi - done - pg=$(echo $pg | sed 's/[()]//g') - echo $pg -} - function expect_false() { set -x @@ -155,6 +140,42 @@ function expect_config_value() fi } +function ceph_watch_start() +{ + local watch_opt=--watch + + if [ -n "$1" ]; then + watch_opt=--watch-$1 + fi + + CEPH_WATCH_FILE=${TMPDIR}/CEPH_WATCH_$$ + ceph $watch_opt > $CEPH_WATCH_FILE & + CEPH_WATCH_PID=$! +} + +function ceph_watch_wait() +{ + local regexp=$1 + local timeout=30 + + if [ -n "$2" ]; then + timeout=$2 + fi + + for i in `seq ${timeout}`; do + sleep 1 + grep -q "$regexp" $CEPH_WATCH_FILE && break + done + + kill $CEPH_WATCH_PID + + if ! grep "$regexp" $CEPH_WATCH_FILE; then + echo "pattern ${regexp} not found in watch file.
Full watch file content:" >&2 + cat $CEPH_WATCH_FILE >&2 + return 1 + fi +} + function test_mon_injectargs() { CEPH_ARGS='--mon_debug_dump_location the.dump' ceph tell osd.0 injectargs --no-osd_debug_op_order >& $TMPFILE || return 1 @@ -168,6 +189,8 @@ function test_mon_injectargs() check_response "osd_debug_op_order = 'true'" ceph tell osd.0 injectargs -- '--osd_debug_op_order --osd_failsafe_full_ratio .98' >& $TMPFILE || return 1 check_response "osd_debug_op_order = 'true' osd_failsafe_full_ratio = '0.98'" + ceph tell osd.0 injectargs -- '--osd_failsafe_full_ratio' >& $TMPFILE || return 1 + check_response "Option --osd_failsafe_full_ratio requires an argument" } function test_mon_injectargs_SI() @@ -194,10 +217,7 @@ function test_mon_injectargs_SI() expect_config_value "mon.a" "mon_pg_warn_min_objects" 10240 ceph tell mon.a injectargs '--mon_pg_warn_min_objects 1G' expect_config_value "mon.a" "mon_pg_warn_min_objects" 1073741824 - # < /dev/null accounts for the fact that ceph will go in interactive mode - # because injectargs is discarded (actually saved for the benefit of - # a tell command that never comes) - expect_false ceph injectargs mon.a '--mon_pg_warn_min_objects 10F' < /dev/null 2> /dev/null + expect_false ceph tell mon.a injectargs '--mon_pg_warn_min_objects 10F' $SUDO ceph daemon mon.a config set mon_pg_warn_min_objects $initial_value } @@ -223,7 +243,7 @@ function test_tiering() # test with dirty objects in the tier pool # tier pool currently set to 'writeback' rados -p cache put /etc/passwd /etc/passwd - ceph tell osd.* flush_pg_stats || true + ceph tell osd.\* flush_pg_stats || true # 1 dirty object in pool 'cache' ceph osd tier cache-mode cache forward expect_false ceph osd tier cache-mode cache none @@ -232,7 +252,7 @@ function test_tiering() # remove object from tier pool rados -p cache rm /etc/passwd rados -p cache cache-flush-evict-all - ceph tell osd.* flush_pg_stats || true + ceph tell osd.\* flush_pg_stats || true # no dirty objects in pool 'cache' ceph osd tier cache-mode cache forward ceph osd tier cache-mode cache none @@ -279,10 +299,28 @@ function test_tiering() ceph osd pool delete cache cache --yes-i-really-really-mean-it ceph osd pool delete cache2 cache2 --yes-i-really-really-mean-it + # make sure we can't clobber snapshot state + ceph osd pool create snap_base 2 + ceph osd pool create snap_cache 2 + ceph osd pool mksnap snap_cache snapname + expect_false ceph osd tier add snap_base snap_cache + ceph osd pool delete snap_base snap_base --yes-i-really-really-mean-it + ceph osd pool delete snap_cache snap_cache --yes-i-really-really-mean-it + + # make sure we can't create an ec pool tier + ceph osd pool create eccache 2 2 erasure + ceph osd pool create repbase 2 + expect_false ceph osd tier add repbase eccache + ceph osd pool delete repbase repbase --yes-i-really-really-mean-it + ceph osd pool delete eccache eccache --yes-i-really-really-mean-it + # convenient add-cache command ceph osd pool create cache3 2 ceph osd tier add-cache slow cache3 1024000 ceph osd dump | grep cache3 | grep bloom | grep 'false_positive_probability: 0.05' | grep 'target_bytes 1024000' | grep '1200s x4' + ceph osd tier remove slow cache3 2> $TMPFILE || true + check_response "EBUSY: tier pool 'cache3' is the overlay for 'slow'; please remove-overlay first" + ceph osd tier remove-overlay slow ceph osd tier remove slow cache3 ceph osd pool ls | grep cache3 ceph osd pool delete cache3 cache3 --yes-i-really-really-mean-it @@ -291,6 +329,20 @@ function test_tiering() ceph osd pool delete 
slow2 slow2 --yes-i-really-really-mean-it ceph osd pool delete slow slow --yes-i-really-really-mean-it + # check whether add-cache works + ceph osd pool create datapool 2 + ceph osd pool create cachepool 2 + ceph osd tier add-cache datapool cachepool 1024000 + ceph osd tier cache-mode cachepool writeback + rados -p datapool put object /etc/passwd + rados -p cachepool stat object + rados -p cachepool cache-flush object + rados -p datapool stat object + ceph osd tier remove-overlay datapool + ceph osd tier remove datapool cachepool + ceph osd pool delete cachepool cachepool --yes-i-really-really-mean-it + ceph osd pool delete datapool datapool --yes-i-really-really-mean-it + # protection against pool removal when used as tiers ceph osd pool create datapool 2 ceph osd pool create cachepool 2 @@ -299,29 +351,39 @@ check_response "EBUSY: pool 'cachepool' is a tier of 'datapool'" ceph osd pool delete datapool datapool --yes-i-really-really-mean-it 2> $TMPFILE || true check_response "EBUSY: pool 'datapool' has tiers cachepool" + ceph osd tier remove-overlay datapool ceph osd tier remove datapool cachepool ceph osd pool delete cachepool cachepool --yes-i-really-really-mean-it ceph osd pool delete datapool datapool --yes-i-really-really-mean-it - # check health check + ## check health check + ceph osd set notieragent ceph osd pool create datapool 2 ceph osd pool create cache4 2 - ceph osd tier add datapool cache4 - ceph osd pool set cache4 target_max_objects 5 - ceph osd pool set cache4 target_max_bytes 1000 - for f in `seq 1 5` ; do - rados -p cache4 put foo$f /etc/passwd - done - while ! ceph df | grep cache4 | grep ' 5 ' ; do - echo waiting for pg stats to flush - sleep 2 - done + ceph osd tier add-cache datapool cache4 1024000 + ceph osd tier cache-mode cache4 writeback + tmpfile=$(mktemp|grep tmp) + dd if=/dev/zero of=$tmpfile bs=4K count=1 + ceph osd pool set cache4 target_max_objects 200 + ceph osd pool set cache4 target_max_bytes 1000000 + rados -p cache4 put foo1 $tmpfile + rados -p cache4 put foo2 $tmpfile + rm -f $tmpfile + ceph tell osd.\* flush_pg_stats || true + ceph df | grep cache4 | grep ' 2 ' + local max_objects=1 + ceph osd pool set cache4 target_max_objects $max_objects + local max_bytes=1024 + ceph osd pool set cache4 target_max_bytes $max_bytes ceph health | grep WARN | grep cache4 - ceph health detail | grep cache4 | grep 'target max' | grep objects - ceph health detail | grep cache4 | grep 'target max' | grep 'B' + ceph health detail | grep cache4 | grep 'target max' | grep "${max_objects} objects" + ceph health detail | grep cache4 | grep 'target max' | grep "${max_bytes}B" + ceph osd tier remove-overlay datapool ceph osd tier remove datapool cache4 ceph osd pool delete cache4 cache4 --yes-i-really-really-mean-it ceph osd pool delete datapool datapool --yes-i-really-really-mean-it + ceph osd unset notieragent + # make sure 'tier remove' behaves as we expect # i.e., removing a tier from a pool that's not its base pool only @@ -377,6 +439,15 @@ function test_auth() diff authfile authfile2 rm authfile authfile2 ceph auth del client.xx + expect_false ceph auth get client.xx + + # (almost) interactive mode + echo -e 'auth add client.xx mon allow osd "allow *"\n' | ceph + ceph auth get client.xx + # script mode + echo 'auth del client.xx' | ceph + expect_false ceph auth get client.xx + # # get / set auid # @@ -495,19 +566,19 @@ function test_mon_misc() ceph health --format json-pretty ceph health detail --format xml-pretty - ceph -w > $TMPDIR/$$ & -
wpid="$!" + ceph node ls + for t in mon osd mds ; do + ceph node ls $t + done + + ceph_watch_start mymsg="this is a test log message $$.$(date)" ceph log "$mymsg" - sleep 3 - if ! grep "$mymsg" $TMPDIR/$$; then - # in case it is very slow (mon thrashing or something) - sleep 30 - grep "$mymsg" $TMPDIR/$$ - fi - kill $wpid -} + ceph_watch_wait "$mymsg" + ceph mon metadata a + ceph node ls +} function check_mds_active() { @@ -634,6 +705,9 @@ function test_mon_mds() ceph mds compat show expect_false ceph mds deactivate 2 ceph mds dump + for mds_gid in $(get_mds_gids) ; do + ceph mds metadata $mds_gid + done # XXX mds fail, but how do you undo it? mdsmapfile=$TMPDIR/mdsmap.$$ current_epoch=$(ceph mds getmap -o $mdsmapfile --no-log-to-stderr 2>&1 | grep epoch | sed 's/.*epoch //') @@ -658,7 +732,11 @@ function test_mon_mds() ceph osd pool delete data3 data3 --yes-i-really-really-mean-it ceph mds set_max_mds 4 ceph mds set_max_mds 3 + ceph mds set_max_mds 256 + expect_false ceph mds set_max_mds 257 ceph mds set max_mds 4 + ceph mds set max_mds 256 + expect_false ceph mds set max_mds 257 expect_false ceph mds set max_mds asdf expect_false ceph mds set inline_data true ceph mds set inline_data true --yes-i-really-mean-it @@ -692,6 +770,13 @@ function test_mon_mds() metadata_poolnum=$(ceph osd dump | grep "pool.* 'fs_metadata" | awk '{print $2;}') fail_all_mds + + # Check that 'fs reset' runs + ceph fs reset cephfs --yes-i-really-mean-it + + fail_all_mds + + # Clean up to enable subsequent newfs tests ceph fs rm cephfs --yes-i-really-mean-it set +e @@ -709,15 +794,21 @@ function test_mon_mds() check_response 'erasure-code' $? 22 set -e - # ... however if we create a cache tier in front of the EC pool, we should - # be permitted to use it... + # ... now create a cache tier in front of the EC pool... ceph osd pool create mds-tier 2 ceph osd tier add mds-ec-pool mds-tier ceph osd tier set-overlay mds-ec-pool mds-tier - ceph osd tier cache-mode mds-tier writeback tier_poolnum=$(ceph osd dump | grep "pool.* 'mds-tier" | awk '{print $2;}') + # Use of a readonly tier should be forbidden + ceph osd tier cache-mode mds-tier readonly + set +e + ceph fs new cephfs fs_metadata mds-ec-pool 2>$TMPFILE + check_response 'has a write tier (mds-tier) that is configured to forward' $?
22 set -e + + # Use of a writeback tier should enable FS creation + ceph osd tier cache-mode mds-tier writeback ceph fs new cephfs fs_metadata mds-ec-pool # While a FS exists using the tiered pools, I should not be allowed @@ -764,6 +855,7 @@ function test_mon_mds() fail_all_mds ceph fs rm cephfs --yes-i-really-mean-it + ceph osd pool delete mds-ec-pool mds-ec-pool --yes-i-really-really-mean-it # Create a FS and check that we can subsequently add a cache tier to it ceph fs new cephfs fs_metadata fs_data @@ -773,16 +865,16 @@ function test_mon_mds() ceph osd tier cache-mode mds-tier writeback ceph osd tier set-overlay fs_metadata mds-tier - # Clean up FS - fail_all_mds - ceph fs rm cephfs --yes-i-really-mean-it - - # Clean up overlay/tier relationship + # Removing tier should be permitted because the underlying pool is + # replicated (#11504 case) + ceph osd tier cache-mode mds-tier forward ceph osd tier remove-overlay fs_metadata ceph osd tier remove fs_metadata mds-tier - ceph osd pool delete mds-tier mds-tier --yes-i-really-really-mean-it - ceph osd pool delete mds-ec-pool mds-ec-pool --yes-i-really-really-mean-it + + # Clean up FS + fail_all_mds + ceph fs rm cephfs --yes-i-really-mean-it ceph mds stat # ceph mds tell mds.a getmap @@ -795,8 +887,29 @@ function test_mon_mds() ceph osd pool delete fs_metadata fs_metadata --yes-i-really-really-mean-it } +function test_mon_mds_metadata() +{ + local nmons=$(ceph tell 'mon.*' version | grep -c 'version') + test "$nmons" -gt 0 + + ceph mds dump | + sed -nEe "s/^([0-9]+):.*'([a-z])' mds\\.([0-9]+)\\..*/\\1 \\2 \\3/p" | + while read gid id rank; do + ceph mds metadata ${gid} | grep '"hostname":' + ceph mds metadata ${id} | grep '"hostname":' + ceph mds metadata ${rank} | grep '"hostname":' + + local n=$(ceph tell 'mon.*' mds metadata ${id} | grep -c '"hostname":') + test "$n" -eq "$nmons" + done + + expect_false ceph mds metadata UNKNOWN +} + function test_mon_mon() { + # print help message + ceph mon # no mon add/remove ceph mon dump ceph mon getmap -o $TMPDIR/monmap.$$ @@ -811,8 +924,13 @@ function test_mon_osd() # osd blacklist # bl=192.168.0.1:0/1000 + # Escaped form which may appear in JSON output + bl_json=192.168.0.1:0\\\\/1000 ceph osd blacklist add $bl ceph osd blacklist ls | grep $bl + ceph osd blacklist ls --format=json-pretty | grep $bl_json + ceph osd dump --format=json-pretty | grep $bl + ceph osd dump | grep "^blacklist $bl" ceph osd blacklist rm $bl expect_false "ceph osd blacklist ls | grep $bl" @@ -828,6 +946,7 @@ function test_mon_osd() # # osd crush # + ceph osd crush reweight-all ceph osd crush tunables legacy ceph osd crush show-tunables | grep argonaut ceph osd crush tunables bobtail @@ -835,6 +954,11 @@ function test_mon_osd() ceph osd crush tunables firefly ceph osd crush show-tunables | grep firefly + ceph osd crush set-tunable straw_calc_version 0 + ceph osd crush get-tunable straw_calc_version | grep 0 + ceph osd crush set-tunable straw_calc_version 1 + ceph osd crush get-tunable straw_calc_version | grep 1 + # # osd scrub # @@ -843,7 +967,7 @@ function test_mon_osd() ceph osd deep-scrub 0 ceph osd repair 0 - for f in noup nodown noin noout noscrub nodeep-scrub nobackfill norecover notieragent + for f in noup nodown noin noout noscrub nodeep-scrub nobackfill norebalance norecover notieragent full do ceph osd set $f ceph osd unset $f @@ -881,11 +1005,14 @@ function test_mon_osd() f=$TMPDIR/map.$$ ceph osd getcrushmap -o $f [ -s $f ] + ceph osd setcrushmap -i $f rm $f ceph osd getmap -o $f [ -s $f ] rm $f save=$(ceph osd 
getmaxosd | sed -e 's/max_osd = //' -e 's/ in epoch.*//') + [ "$save" -gt 0 ] + ceph osd setmaxosd $((save - 1)) 2>&1 | grep 'EBUSY' ceph osd setmaxosd 10 ceph osd getmaxosd | grep 'max_osd = 10' ceph osd setmaxosd $save @@ -899,6 +1026,7 @@ function test_mon_osd() local old_osds=$(echo $(ceph osd ls)) id=`ceph osd create` + ceph osd find $id ceph osd lost $id --yes-i-really-mean-it expect_false ceph osd setmaxosd $id local new_osds=$(echo $(ceph osd ls)) @@ -912,10 +1040,71 @@ function test_mon_osd() [ "$id" = "$id2" ] ceph osd rm $id + ceph osd + + # reset max_osd. + ceph osd setmaxosd $id + ceph osd getmaxosd | grep "max_osd = $save" + local max_osd=$save + + ceph osd create $uuid 0 2>&1 | grep 'EINVAL' + ceph osd create $uuid $((max_osd - 1)) 2>&1 | grep 'EINVAL' + + id=`ceph osd create $uuid $max_osd` + [ "$id" = "$max_osd" ] + ceph osd find $id + max_osd=$((max_osd + 1)) + ceph osd getmaxosd | grep "max_osd = $max_osd" + + ceph osd create $uuid $((id - 1)) 2>&1 | grep 'EINVAL' + ceph osd create $uuid $((id + 1)) 2>&1 | grep 'EINVAL' + id2=`ceph osd create $uuid` + [ "$id" = "$id2" ] + id2=`ceph osd create $uuid $id` + [ "$id" = "$id2" ] + + uuid=`uuidgen` + local gap_start=$max_osd + id=`ceph osd create $uuid $((gap_start + 100))` + [ "$id" = "$((gap_start + 100))" ] + max_osd=$((id + 1)) + ceph osd getmaxosd | grep "max_osd = $max_osd" + + ceph osd create $uuid $gap_start 2>&1 | grep 'EINVAL' + + # + # When CEPH_CLI_TEST_DUP_COMMAND is set, osd create + # is repeated and consumes two osd ids, not just one. + # + local next_osd + if test "$CEPH_CLI_TEST_DUP_COMMAND" ; then + next_osd=$((gap_start + 1)) + else + next_osd=$gap_start + fi + id=`ceph osd create` + [ "$id" = "$next_osd" ] + + next_osd=$((id + 1)) + id=`ceph osd create $(uuidgen)` + [ "$id" = "$next_osd" ] + + next_osd=$((id + 1)) + id=`ceph osd create $(uuidgen) $next_osd` + [ "$id" = "$next_osd" ] + + local new_osds=$(echo $(ceph osd ls)) + for id in $(echo $new_osds | sed -e "s/$old_osds//") ; do + [ $id -ge $save ] + ceph osd rm $id + done + ceph osd setmaxosd $save + ceph osd ls ceph osd pool create data 10 ceph osd lspools | grep data ceph osd map data foo | grep 'pool.*data.*object.*foo.*pg.*up.*acting' + ceph osd map data foo namespace | grep 'pool.*data.*object.*namespace/foo.*pg.*up.*acting' ceph osd pool delete data data --yes-i-really-really-mean-it ceph osd pause @@ -1020,6 +1209,26 @@ function test_mon_pg() ceph pg dump_stuck inactive ceph pg dump_stuck unclean ceph pg dump_stuck stale + ceph pg dump_stuck undersized + ceph pg dump_stuck degraded + ceph pg ls + ceph pg ls 0 + ceph pg ls stale + ceph pg ls active stale repair recovering + ceph pg ls 0 active + ceph pg ls 0 active stale + ceph pg ls-by-primary osd.0 + ceph pg ls-by-primary osd.0 0 + ceph pg ls-by-primary osd.0 active + ceph pg ls-by-primary osd.0 active stale + ceph pg ls-by-primary osd.0 0 active stale + ceph pg ls-by-osd osd.0 + ceph pg ls-by-osd osd.0 0 + ceph pg ls-by-osd osd.0 active + ceph pg ls-by-osd osd.0 active stale + ceph pg ls-by-osd osd.0 0 active stale + ceph pg ls-by-pool rbd + ceph pg ls-by-pool rbd active stale # can't test this...
# ceph pg force_create_pg ceph pg getmap -o $TMPDIR/map.$$ @@ -1075,7 +1284,9 @@ function test_mon_pg() function test_mon_osd_pool_set() { TEST_POOL_GETSET=pool_getset - ceph osd pool create $TEST_POOL_GETSET 10 + ceph osd pool create $TEST_POOL_GETSET 1 + wait_for_clean + ceph osd pool get $TEST_POOL_GETSET all for s in pg_num pgp_num size min_size crash_replay_interval crush_ruleset; do ceph osd pool get $TEST_POOL_GETSET $s @@ -1087,7 +1298,8 @@ function test_mon_osd_pool_set() ceph osd pool get $TEST_POOL_GETSET size | grep "size: $new_size" ceph osd pool set $TEST_POOL_GETSET size $old_size - ceph osd pool create pool_erasure 12 12 erasure + ceph osd pool create pool_erasure 1 1 erasure + wait_for_clean set +e ceph osd pool set pool_erasure size 4444 2>$TMPFILE check_response 'not change the size' @@ -1100,13 +1312,34 @@ function test_mon_osd_pool_set() ceph --format=xml osd pool get $TEST_POOL_GETSET auid | grep $auid ceph osd pool set $TEST_POOL_GETSET auid 0 - ceph osd pool set $TEST_POOL_GETSET hashpspool true - ceph osd pool set $TEST_POOL_GETSET hashpspool false - ceph osd pool set $TEST_POOL_GETSET hashpspool 0 - ceph osd pool set $TEST_POOL_GETSET hashpspool 1 - expect_false ceph osd pool set $TEST_POOL_GETSET hashpspool asdf - expect_false ceph osd pool set $TEST_POOL_GETSET hashpspool 2 + for flag in hashpspool nodelete nopgchange nosizechange; do + ceph osd pool set $TEST_POOL_GETSET $flag false + ceph osd pool set $TEST_POOL_GETSET $flag true + ceph osd pool set $TEST_POOL_GETSET $flag 1 + ceph osd pool set $TEST_POOL_GETSET $flag 0 + expect_false ceph osd pool set $TEST_POOL_GETSET $flag asdf + expect_false ceph osd pool set $TEST_POOL_GETSET $flag 2 + done + ceph osd pool set $TEST_POOL_GETSET nopgchange 1 + expect_false ceph osd pool set $TEST_POOL_GETSET pg_num 10 + expect_false ceph osd pool set $TEST_POOL_GETSET pgp_num 10 + ceph osd pool set $TEST_POOL_GETSET nopgchange 0 + ceph osd pool set $TEST_POOL_GETSET pg_num 10 + wait_for_clean + ceph osd pool set $TEST_POOL_GETSET pgp_num 10 + + ceph osd pool set $TEST_POOL_GETSET nosizechange 1 + expect_false ceph osd pool set $TEST_POOL_GETSET size 2 + expect_false ceph osd pool set $TEST_POOL_GETSET min_size 2 + ceph osd pool set $TEST_POOL_GETSET nosizechange 0 + ceph osd pool set $TEST_POOL_GETSET size 2 + wait_for_clean + ceph osd pool set $TEST_POOL_GETSET min_size 2 + + ceph osd pool set $TEST_POOL_GETSET nodelete 1 + expect_false ceph osd pool delete $TEST_POOL_GETSET $TEST_POOL_GETSET --yes-i-really-really-mean-it + ceph osd pool set $TEST_POOL_GETSET nodelete 0 ceph osd pool delete $TEST_POOL_GETSET $TEST_POOL_GETSET --yes-i-really-really-mean-it ceph osd pool get rbd crush_ruleset | grep 'crush_ruleset: 0' @@ -1143,6 +1376,11 @@ function test_mon_osd_tiered_pool_set() grep 'cache_target_dirty_ratio:[ \t]\+0.123' expect_false ceph osd pool set real-tier cache_target_dirty_ratio -.2 expect_false ceph osd pool set real-tier cache_target_dirty_ratio 1.1 + ceph osd pool set real-tier cache_target_dirty_high_ratio .123 + ceph osd pool get real-tier cache_target_dirty_high_ratio | \ + grep 'cache_target_dirty_high_ratio:[ \t]\+0.123' + expect_false ceph osd pool set real-tier cache_target_dirty_high_ratio -.2 + expect_false ceph osd pool set real-tier cache_target_dirty_high_ratio 1.1 ceph osd pool set real-tier cache_target_full_ratio .123 ceph osd pool get real-tier cache_target_full_ratio | \ grep 'cache_target_full_ratio:[ \t]\+0.123' @@ -1159,6 +1397,7 @@ function test_mon_osd_tiered_pool_set() # this is not a 
tier pool ceph osd pool create fake-tier 2 + wait_for_clean expect_false ceph osd pool set fake-tier hit_set_type explicit_hash expect_false ceph osd pool get fake-tier hit_set_type @@ -1182,6 +1421,10 @@ function test_mon_osd_tiered_pool_set() expect_false ceph osd pool get fake-tier cache_target_dirty_ratio expect_false ceph osd pool set fake-tier cache_target_dirty_ratio -.2 expect_false ceph osd pool set fake-tier cache_target_dirty_ratio 1.1 + expect_false ceph osd pool set fake-tier cache_target_dirty_high_ratio .123 + expect_false ceph osd pool get fake-tier cache_target_dirty_high_ratio + expect_false ceph osd pool set fake-tier cache_target_dirty_high_ratio -.2 + expect_false ceph osd pool set fake-tier cache_target_dirty_high_ratio 1.1 expect_false ceph osd pool set fake-tier cache_target_full_ratio .123 expect_false ceph osd pool get fake-tier cache_target_full_ratio expect_false ceph osd pool set fake-tier cache_target_full_ratio 1.0 @@ -1296,6 +1539,113 @@ function test_osd_bench() ceph tell osd.0 bench 104857600 2097152 } +function test_mon_tell() +{ + ceph tell mon.a version + ceph tell mon.b version + expect_false ceph tell mon.foo version + + sleep 1 + + ceph_watch_start debug + ceph tell mon.a version + ceph_watch_wait 'mon.0 \[DBG\] from.*cmd=\[{"prefix": "version"}\]: dispatch' + + ceph_watch_start debug + ceph tell mon.b version + ceph_watch_wait 'mon.1 \[DBG\] from.*cmd=\[{"prefix": "version"}\]: dispatch' +} + +function test_mon_crushmap_validation() +{ + local map=$TMPDIR/map + ceph osd getcrushmap -o $map + + local crushtool_path="${TMPDIR}/crushtool" + touch "${crushtool_path}" + chmod +x "${crushtool_path}" + local crushtool_path_old=`ceph-conf --show-config-value crushtool` + ceph tell mon.\* injectargs --crushtool "${crushtool_path}" + + printf "%s\n" \ + "#!/bin/sh + cat > /dev/null + exit 0" > "${crushtool_path}" + + ceph osd setcrushmap -i $map + + printf "%s\n" \ + "#!/bin/sh + cat > /dev/null + exit 1" > "${crushtool_path}" + + expect_false ceph osd setcrushmap -i $map + + printf "%s\n" \ + "#!/bin/sh + cat > /dev/null + echo 'TEST FAIL' >&2 + exit 1" > "${crushtool_path}" + + expect_false ceph osd setcrushmap -i $map 2> $TMPFILE + check_response "Error EINVAL: Failed to parse crushmap: TEST FAIL" + + local mon_lease=`ceph-conf --show-config-value mon_lease` + + test "${mon_lease}" -gt 0 + + printf "%s\n" \ + "#!/bin/sh + cat > /dev/null + sleep $((mon_lease - 1))" > "${crushtool_path}" + + ceph osd setcrushmap -i $map + + printf "%s\n" \ + "#!/bin/sh + cat > /dev/null + sleep $((mon_lease + 1))" > "${crushtool_path}" + + expect_false ceph osd setcrushmap -i $map 2> $TMPFILE + check_response "Error EINVAL: Failed to parse crushmap: ${crushtool_path}: timed out (${mon_lease} sec)" + + ceph tell mon.\* injectargs --crushtool "${crushtool_path_old}" + + rm -f "${crushtool_path}" +} + +function test_mon_ping() +{ + ceph ping mon.a + ceph ping mon.b + expect_false ceph ping mon.foo + + ceph ping mon.\* +} + +function test_mon_deprecated_commands() +{ + # current DEPRECATED commands are: + # ceph compact + # ceph scrub + # ceph sync force + # + # Testing should be accomplished by setting + # 'mon_debug_deprecated_as_obsolete = true' and expecting ENOTSUP for + # each one of these commands. 
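+ # For instance, once the flag is injected, an obsolete command should
+ # be refused with output shaped like this (illustrative sketch only;
+ # the errno name may be EOPNOTSUPP or ENOTSUP depending on the
+ # platform):
+ #
+ #   $ ceph tell mon.a compact
+ #   Error EOPNOTSUPP: command is obsolete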
+ + ceph tell mon.a injectargs '--mon-debug-deprecated-as-obsolete' + expect_false ceph tell mon.a compact 2> $TMPFILE + check_response "\(EOPNOTSUPP\|ENOTSUP\): command is obsolete" + + expect_false ceph tell mon.a scrub 2> $TMPFILE + check_response "\(EOPNOTSUPP\|ENOTSUP\): command is obsolete" + + expect_false ceph tell mon.a sync force 2> $TMPFILE + check_response "\(EOPNOTSUPP\|ENOTSUP\): command is obsolete" + + ceph tell mon.a injectargs '--no-mon-debug-deprecated-as-obsolete' +} # # New tests should be added to the TESTS array below @@ -1330,11 +1680,16 @@ MON_TESTS+=" mon_osd_tiered_pool_set" MON_TESTS+=" mon_osd_erasure_code" MON_TESTS+=" mon_osd_misc" MON_TESTS+=" mon_heap_profiler" +MON_TESTS+=" mon_tell" +MON_TESTS+=" mon_crushmap_validation" +MON_TESTS+=" mon_ping" +MON_TESTS+=" mon_deprecated_commands" OSD_TESTS+=" osd_bench" MDS_TESTS+=" mds_tell" MDS_TESTS+=" mon_mds" +MDS_TESTS+=" mon_mds_metadata" TESTS+=$MON_TESTS TESTS+=$OSD_TESTS diff --git a/qa/workunits/erasure-code/.gitignore b/qa/workunits/erasure-code/.gitignore new file mode 100644 index 0000000000000..7e563b8b3023b --- /dev/null +++ b/qa/workunits/erasure-code/.gitignore @@ -0,0 +1,2 @@ +*.log +*.trs diff --git a/qa/workunits/erasure-code/bench.html b/qa/workunits/erasure-code/bench.html index d5dd1095ce02f..3b4b6c74c0027 100644 --- a/qa/workunits/erasure-code/bench.html +++ b/qa/workunits/erasure-code/bench.html @@ -28,27 +28,6 @@

Erasure Code Plugins Benchmarks

decode: Y = GB/s, X = K/M/erasures

[bench.html hunk, page markup lost in this extract: the hunk keeps the "Erasure Code Plugins Benchmarks" title and the generic decode chart above, and removes the four per-size chart blocks, "encode 4KB" and "decode 4KB" (Y = GB/s, X = K/M and K/M/erasures) plus "encode 1MB" and "decode 1MB", matching bench.sh below, which now benchmarks a single SIZE instead of the old SIZES list.]

- - diff --git a/qa/workunits/erasure-code/bench.sh b/qa/workunits/erasure-code/bench.sh index 27689665c4fcc..acd70354edcdc 100755 --- a/qa/workunits/erasure-code/bench.sh +++ b/qa/workunits/erasure-code/bench.sh @@ -16,7 +16,6 @@ # # Test that it works from sources with: # -# TOTAL_SIZE=$((1024 * 1024)) \ # CEPH_ERASURE_CODE_BENCHMARK=src/ceph_erasure_code_benchmark \ # PLUGIN_DIRECTORY=src/.libs \ # qa/workunits/erasure-code/bench.sh fplot jerasure | @@ -24,7 +23,7 @@ # # This should start immediately and display: # -# var encode_reed_sol_van_4096 = [ +# ... # [ '2/1', .48035538612887358583 ], # [ '3/2', .21648470405675016626 ], # etc. @@ -36,6 +35,7 @@ # Once it is confirmed to work, it can be run with a more significant # volume of data so that the measures are more reliable: # +# TOTAL_SIZE=$((4 * 1024 * 1024 * 1024)) \ # CEPH_ERASURE_CODE_BENCHMARK=src/ceph_erasure_code_benchmark \ # PLUGIN_DIRECTORY=src/.libs \ # qa/workunits/erasure-code/bench.sh fplot jerasure | @@ -49,8 +49,8 @@ export PATH=/sbin:$PATH : ${CEPH_ERASURE_CODE_BENCHMARK:=ceph_erasure_code_benchmark} : ${PLUGIN_DIRECTORY:=/usr/lib/ceph/erasure-code} : ${PLUGINS:=example jerasure isa} -: ${TOTAL_SIZE:=$((10 * 1024 * 1024))} -: ${SIZES:=4096 $((1024 * 1024))} +: ${TOTAL_SIZE:=$((1024 * 1024))} +: ${SIZE:=4096} : ${PARAMETERS:=--parameter jerasure-per-chunk-alignment=true} function bench_header() { @@ -85,15 +85,6 @@ function bench() { echo -e "$result\t$plugin\t$k\t$m\t$workload\t$iterations\t$size\t$erasures\t$command ""$@" } -function example_test() { - local plugin=example - local size - for size in $SIZES ; do - bench $plugin 2 1 encode $ITERATIONS $size 0 - bench $plugin 2 1 decode $ITERATIONS $size 1 - done -} - function packetsize() { local k=$1 local w=$2 @@ -107,7 +98,7 @@ function packetsize() { echo $p } -function jerasure_test() { +function bench_run() { local plugin=jerasure local w=8 local VECTOR_WORDSIZE=16 @@ -118,31 +109,39 @@ function jerasure_test() { k2ms[4]="2 3" k2ms[6]="2 3 4" k2ms[10]="3 4" - for technique in reed_sol_van cauchy_good ; do - for size in $SIZES ; do - echo "serie encode_${technique}_${size}" + local isa2technique_vandermonde='reed_sol_van' + local isa2technique_cauchy='cauchy' + local jerasure_generic2technique_vandermonde='reed_sol_van' + local jerasure_generic2technique_cauchy='cauchy_good' + local jerasure_sse42technique_vandermonde='reed_sol_van' + local jerasure_sse42technique_cauchy='cauchy_good' + for technique in vandermonde cauchy ; do + for plugin in isa jerasure_generic jerasure_sse4 ; do + eval technique_parameter=\$${plugin}2technique_${technique} + echo "serie encode_${technique}_${plugin}" for k in $ks ; do for m in ${k2ms[$k]} ; do - bench $plugin $k $m encode $(($TOTAL_SIZE / $size)) $size 0 \ - --parameter packetsize=$(packetsize $k $w $VECTOR_WORDSIZE $size) \ + bench $plugin $k $m encode $(($TOTAL_SIZE / $SIZE)) $SIZE 0 \ + --parameter packetsize=$(packetsize $k $w $VECTOR_WORDSIZE $SIZE) \ ${PARAMETERS} \ - --parameter technique=$technique + --parameter technique=$technique_parameter done done done done - for technique in reed_sol_van cauchy_good ; do - for size in $SIZES ; do - echo "serie decode_${technique}_${size}" + for technique in vandermonde cauchy ; do + for plugin in isa jerasure_generic jerasure_sse4 ; do + eval technique_parameter=\$${plugin}2technique_${technique} + echo "serie decode_${technique}_${plugin}" for k in $ks ; do for m in ${k2ms[$k]} ; do echo for erasures in $(seq 1 $m) ; do - bench $plugin $k $m decode $(($TOTAL_SIZE / $size)) $size 
$erasures \ - --parameter packetsize=$(packetsize $k $w $VECTOR_WORDSIZE $size) \ + bench $plugin $k $m decode $(($TOTAL_SIZE / $SIZE)) $SIZE $erasures \ + --parameter packetsize=$(packetsize $k $w $VECTOR_WORDSIZE $SIZE) \ ${PARAMETERS} \ - --parameter technique=$technique + --parameter technique=$technique_parameter done done done @@ -150,46 +149,9 @@ function jerasure_test() { done } -function isa_test() { - local plugin=isa - local ks="2 3 4 6 10" - declare -A k2ms - k2ms[2]="1" - k2ms[3]="2" - k2ms[4]="2 3" - k2ms[6]="2 3 4" - k2ms[10]="3 4" - for technique in reed_sol_van cauchy ; do - for size in $SIZES ; do - echo "serie encode_${technique}_${size}" - for k in $ks ; do - for m in ${k2ms[$k]} ; do - bench $plugin $k $m encode $(($TOTAL_SIZE / $size)) $size 0 \ - --parameter technique=$technique - - done - done - done - done - for technique in reed_sol_van cauchy ; do - for size in $SIZES ; do - echo "serie decode_${technique}_${size}" - for k in $ks ; do - for m in ${k2ms[$k]} ; do - echo - for erasures in $(seq 1 $m) ; do - bench $plugin $k $m decode $(($TOTAL_SIZE / $size)) $size $erasures \ - --parameter technique=$technique - done - done - done - done - done -} function fplot() { - local plugin=$1 local serie - ${plugin}_test | while read seconds total plugin k m workload iteration size erasures rest ; do + bench_run | while read seconds total plugin k m workload iteration size erasures rest ; do if [ -z $seconds ] ; then echo null, elif [ $seconds = serie ] ; then @@ -213,209 +175,10 @@ function fplot() { function main() { bench_header - for plugin in ${PLUGINS} ; do - ${plugin}_test || return 1 - done + bench_run } -if [ "$1" = TEST ] ; then - set -x - set -o functrace - PS4=' ${FUNCNAME[0]}: $LINENO: ' - - TOTAL_SIZE=1024 - SIZE=1024 - - function run_test() { - dir=/tmp/erasure-code - rm -fr $dir - mkdir $dir - expected=$(cat </dev/null >/dev/null + if [ $? != 0 ]; then + echo Try to write $(($i * 1048576)) + set -x + return 1 + fi + sleep 0.05 + done + set -x + return 0 +} + +mkdir quota-test +cd quota-test + +# bytes +setfattr . -n ceph.quota.max_bytes -v 100000000 # 100m +expect_false write_file big 1000 # 1g +expect_false write_file second 10 +setfattr . -n ceph.quota.max_bytes -v 0 +dd if=/dev/zero of=third bs=1M count=10 +dd if=/dev/zero of=big2 bs=1M count=100 + + +rm -rf * + +# files +setfattr . -n ceph.quota.max_files -v 5 +mkdir ok +touch ok/1 +touch ok/2 +touch 3 +expect_false touch shouldbefail # 5 files will include the "." +expect_false touch ok/shouldbefail # 5 files will include the "." +setfattr . 
-n ceph.quota.max_files -v 0 +touch shouldbecreated +touch shouldbecreated2 + + +rm -rf * + +# mix +mkdir bytes bytes/files + +setfattr bytes -n ceph.quota.max_bytes -v 10000000 #10m +setfattr bytes/files -n ceph.quota.max_files -v 5 +dd if=/dev/zero of=bytes/files/1 bs=1M count=4 +dd if=/dev/zero of=bytes/files/2 bs=1M count=4 +expect_false write_file bytes/files/3 1000 +expect_false write_file bytes/files/4 1000 +expect_false write_file bytes/files/5 1000 +stat --printf="%n %s\n" bytes/files/1 #4M +stat --printf="%n %s\n" bytes/files/2 #4M +stat --printf="%n %s\n" bytes/files/3 #bigger than 2M +stat --printf="%n %s\n" bytes/files/4 #should be zero +expect_false stat bytes/files/5 #shouldn't exist + + + + +rm -rf * + +#mv +mkdir files limit +truncate files/file -s 10G +setfattr limit -n ceph.quota.max_bytes -v 1000000 #1m +expect_false mv files limit/ + + + +rm -rf * + +#limit by ancestor + +mkdir -p ancestor/p1/p2/parent/p3 +setfattr ancestor -n ceph.quota.max_bytes -v 1000000 +setfattr ancestor/p1/p2/parent -n ceph.quota.max_bytes -v 1000000000 #1g +expect_false write_file ancestor/p1/p2/parent/p3/file1 900 #900m +stat --printf="%n %s\n" ancestor/p1/p2/parent/p3/file1 + + +#get/set attribute + +setfattr -n ceph.quota.max_bytes -v 0 . +setfattr -n ceph.quota.max_bytes -v 1 . +setfattr -n ceph.quota.max_bytes -v 9223372036854775807 . +expect_false setfattr -n ceph.quota.max_bytes -v 9223372036854775808 . +expect_false setfattr -n ceph.quota.max_bytes -v -1 . +expect_false setfattr -n ceph.quota.max_bytes -v -9223372036854775808 . +expect_false setfattr -n ceph.quota.max_bytes -v -9223372036854775809 . + +setfattr -n ceph.quota.max_files -v 0 . +setfattr -n ceph.quota.max_files -v 1 . +setfattr -n ceph.quota.max_files -v 9223372036854775807 . +expect_false setfattr -n ceph.quota.max_files -v 9223372036854775808 . +expect_false setfattr -n ceph.quota.max_files -v -1 . +expect_false setfattr -n ceph.quota.max_files -v -9223372036854775808 . +expect_false setfattr -n ceph.quota.max_files -v -9223372036854775809 . + +setfattr -n ceph.quota -v "max_bytes=0 max_files=0" . +setfattr -n ceph.quota -v "max_bytes=1 max_files=0" . +setfattr -n ceph.quota -v "max_bytes=0 max_files=1" . +setfattr -n ceph.quota -v "max_bytes=1 max_files=1" . +expect_false setfattr -n ceph.quota -v "max_bytes=-1 max_files=0" . +expect_false setfattr -n ceph.quota -v "max_bytes=0 max_files=-1" . +expect_false setfattr -n ceph.quota -v "max_bytes=-1 max_files=-1" . + +#addme + +cd .. +rm -rf quota-test + +echo OK diff --git a/qa/workunits/hadoop/internal-tests.sh b/qa/workunits/hadoop/internal-tests.sh deleted file mode 100755 index 48a9224dac17d..0000000000000 --- a/qa/workunits/hadoop/internal-tests.sh +++ /dev/null @@ -1,74 +0,0 @@ -#!/bin/bash -e - -# bail if $TESTDIR is not set as this test will fail in that scenario -[ -z $TESTDIR ] && { echo "\$TESTDIR needs to be set, but is not.
Exiting."; exit 1; } - -# configure CEPH_CONF and LD_LIBRARY_PATH if they're not already set -conf="$CEPH_CONF" -if [ -z "$conf" ] ; then - echo "Setting conf to /etc/ceph/ceph.conf" - conf="/etc/ceph/ceph.conf" -else - echo "conf is set to $conf" -fi - -ld_lib_path="$LD_LIBRARY_PATH" -if [ -z "$ld_lib_path" ] ; then - echo "Setting ld_lib_path to /usr/lib/jni" - ld_lib_path="/usr/lib/jni" -else - echo "ld_lib_path was set to $ld_lib_path" -fi - -POOL_SIZES=`seq 1 8` -POOL_BASE=hadoop -POOL_NAMES=`echo -n $POOL_SIZES | sed "s/\([0-9]*\)/$POOL_BASE\1/g" | sed "s/ /,/g"` - -function gen_hadoop_conf() { -local outfile=$1 -local poolnames=$2 -local conf=$3 -cat << EOF > $outfile - - - - - ceph.conf.file - $conf - - - ceph.data.pools - $poolnames - - -EOF -} - -echo creating hadoop test pools -for size in $POOL_SIZES; do - name=${POOL_BASE}$size - echo creating pool $name - ceph osd pool create $name 100 100 - ceph osd pool set $name size $size - - echo making pool $name a data pool - poolid=`ceph osd dump | sed -n "s/^pool \([0-9]*\) '$name'.*/\1/p"` - ceph mds add_data_pool $poolid -done - -def_repl_conf=`mktemp` -echo generating default replication hadoop config $def_repl_conf -gen_hadoop_conf $def_repl_conf "" $conf - -cust_repl_conf=`mktemp` -echo generating custom replication hadoop config $cust_repl_conf -gen_hadoop_conf $cust_repl_conf $POOL_NAMES $conf - -echo running default replication hadoop tests -java -Dhadoop.conf.file=$def_repl_conf -Djava.library.path=$ld_lib_path -cp /usr/share/java/junit4.jar:$TESTDIR/apache_hadoop/build/hadoop-core-1.0.4-SNAPSHOT.jar:$TESTDIR/inktank_hadoop/build/hadoop-cephfs.jar:$TESTDIR/inktank_hadoop/build/hadoop-cephfs-test.jar:$TESTDIR/apache_hadoop/build/hadoop-test-1.0.4-SNAPSHOT.jar:$TESTDIR/apache_hadoop/build/ivy/lib/Hadoop/common/commons-logging-1.1.1.jar:/usr/share/java/libcephfs.jar org.junit.runner.JUnitCore org.apache.hadoop.fs.ceph.TestCephDefaultReplication - -echo running custom replication hadoop tests -java -Dhadoop.conf.file=$cust_repl_conf -Djava.library.path=$ld_lib_path -cp /usr/share/java/junit4.jar:$TESTDIR/apache_hadoop/build/hadoop-core-1.0.4-SNAPSHOT.jar:$TESTDIR/inktank_hadoop/build/hadoop-cephfs.jar:$TESTDIR/inktank_hadoop/build/hadoop-cephfs-test.jar:$TESTDIR/apache_hadoop/build/hadoop-test-1.0.4-SNAPSHOT.jar:$TESTDIR/apache_hadoop/build/ivy/lib/Hadoop/common/commons-logging-1.1.1.jar:/usr/share/java/libcephfs.jar org.junit.runner.JUnitCore org.apache.hadoop.fs.ceph.TestCephCustomReplication - -echo "completed hadoop-internal-tests tests" -exit 0 diff --git a/qa/workunits/hadoop/repl.sh b/qa/workunits/hadoop/repl.sh new file mode 100755 index 0000000000000..f2e9fccbd3089 --- /dev/null +++ b/qa/workunits/hadoop/repl.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +set -e +set -x + +# bail if $TESTDIR is not set as this test will fail in that scenario +[ -z $TESTDIR ] && { echo "\$TESTDIR needs to be set, but is not. 
Exiting."; exit 1; } + +# if HADOOP_PREFIX is not set, use default +[ -z $HADOOP_PREFIX ] && { HADOOP_PREFIX=$TESTDIR/hadoop; } + +# create pools with different replication factors +for repl in 2 3 7 8 9; do + name=hadoop.$repl + ceph osd pool create $name 8 8 + ceph osd pool set $name size $repl + + id=`ceph osd dump | sed -n "s/^pool \([0-9]*\) '$name'.*/\1/p"` + ceph mds add_data_pool $id +done + +# create a file in each of the pools +for repl in 2 3 7 8 9; do + name=hadoop.$repl + $HADOOP_PREFIX/bin/hadoop fs -rm -f /$name.dat + dd if=/dev/zero bs=1048576 count=1 | \ + $HADOOP_PREFIX/bin/hadoop fs -Dceph.data.pools="$name" \ + -put - /$name.dat +done + +# check that hadoop reports replication matching +# that of the pool the file was written into +for repl in 2 3 7 8 9; do + name=hadoop.$repl + repl2=$($HADOOP_PREFIX/bin/hadoop fs -ls /$name.dat | awk '{print $2}') + if [ $repl -ne $repl2 ]; then + echo "replication factors didn't match!" + exit 1 + fi +done + +exit 0 diff --git a/qa/workunits/hadoop/terasort.sh b/qa/workunits/hadoop/terasort.sh new file mode 100755 index 0000000000000..7996aece7ea4c --- /dev/null +++ b/qa/workunits/hadoop/terasort.sh @@ -0,0 +1,76 @@ +#!/bin/bash + +set -e +set -x + +INPUT=/terasort-input +OUTPUT=/terasort-output +REPORT=/terasort-report + +num_records=100000 +[ ! -z $NUM_RECORDS ] && num_records=$NUM_RECORDS + +# bail if $TESTDIR is not set as this test will fail in that scenario +[ -z $TESTDIR ] && { echo "\$TESTDIR needs to be set, but is not. Exiting."; exit 1; } + +# if HADOOP_PREFIX is not set, use default +[ -z $HADOOP_PREFIX ] && { HADOOP_PREFIX=$TESTDIR/hadoop; } + +# Nuke hadoop directories +$HADOOP_PREFIX/bin/hadoop fs -rm -r $INPUT $OUTPUT $REPORT || true + +# Generate terasort data +# +#-Ddfs.blocksize=512M \ +#-Dio.file.buffer.size=131072 \ +#-Dmapreduce.map.java.opts=-Xmx1536m \ +#-Dmapreduce.map.memory.mb=2048 \ +#-Dmapreduce.task.io.sort.mb=256 \ +#-Dyarn.app.mapreduce.am.resource.mb=1024 \ +#-Dmapred.map.tasks=64 \ +$HADOOP_PREFIX/bin/hadoop jar \ + $HADOOP_PREFIX/share/hadoop/mapreduce/hadoop-mapreduce-examples-*.jar \ + teragen \ + -Dmapred.map.tasks=9 \ + $num_records \ + $INPUT + +# Run the sort job +# +#-Ddfs.blocksize=512M \ +#-Dio.file.buffer.size=131072 \ +#-Dmapreduce.map.java.opts=-Xmx1536m \ +#-Dmapreduce.map.memory.mb=2048 \ +#-Dmapreduce.map.output.compress=true \ +#-Dmapreduce.map.output.compress.codec=org.apache.hadoop.io.compress.Lz4Codec \ +#-Dmapreduce.reduce.java.opts=-Xmx1536m \ +#-Dmapreduce.reduce.memory.mb=2048 \ +#-Dmapreduce.task.io.sort.factor=100 \ +#-Dmapreduce.task.io.sort.mb=768 \ +#-Dyarn.app.mapreduce.am.resource.mb=1024 \ +#-Dmapred.reduce.tasks=100 \ +#-Dmapreduce.terasort.output.replication=1 \ +$HADOOP_PREFIX/bin/hadoop jar \ + $HADOOP_PREFIX/share/hadoop/mapreduce/hadoop-mapreduce-examples-*.jar \ + terasort \ + -Dmapred.reduce.tasks=10 \ + $INPUT $OUTPUT + +# Validate the sorted data +# +#-Ddfs.blocksize=512M \ +#-Dio.file.buffer.size=131072 \ +#-Dmapreduce.map.java.opts=-Xmx1536m \ +#-Dmapreduce.map.memory.mb=2048 \ +#-Dmapreduce.reduce.java.opts=-Xmx1536m \ +#-Dmapreduce.reduce.memory.mb=2048 \ +#-Dmapreduce.task.io.sort.mb=256 \ +#-Dyarn.app.mapreduce.am.resource.mb=1024 \ +#-Dmapred.reduce.tasks=1 \ +$HADOOP_PREFIX/bin/hadoop jar \ + $HADOOP_PREFIX/share/hadoop/mapreduce/hadoop-mapreduce-examples-*.jar \ + teravalidate \ + -Dmapred.reduce.tasks=1 \ + $OUTPUT $REPORT + +exit 0 diff --git a/qa/workunits/hadoop/wordcount.sh b/qa/workunits/hadoop/wordcount.sh index 1b1c20702431b..99e1e8ff89748
100755 --- a/qa/workunits/hadoop/wordcount.sh +++ b/qa/workunits/hadoop/wordcount.sh @@ -1,4 +1,11 @@ -#!/bin/sh -ex +#!/bin/bash + +set -e +set -x + +WC_INPUT=/wc_input +WC_OUTPUT=/wc_output +DATA_INPUT=$(mktemp -d) echo "starting hadoop-wordcount test" @@ -6,43 +13,23 @@ echo "starting hadoop-wordcount test" [ -z $TESTDIR ] && { echo "\$TESTDIR needs to be set, but is not. Exiting."; exit 1; } # if HADOOP_PREFIX is not set, use default -[ -z $HADOOP_PREFIX ] && { HADOOP_PREFIX=$TESTDIR/apache_hadoop; } +[ -z $HADOOP_PREFIX ] && { HADOOP_PREFIX=$TESTDIR/hadoop; } -# if HADOOP_MR_HOME is not set, use default -[ -z $HADOOP_MR_HOME ] && { HADOOP_MR_HOME=$TESTDIR/apache_hadoop/build; } +# Nuke hadoop directories +$HADOOP_PREFIX/bin/hadoop fs -rm -r $WC_INPUT $WC_OUTPUT || true -export JAVA_HOME=/usr/lib/jvm/default-java +# Fetch and import testing data set +curl http://ceph.com/qa/hadoop_input_files.tar | tar xf - -C $DATA_INPUT +$HADOOP_PREFIX/bin/hadoop fs -copyFromLocal $DATA_INPUT $WC_INPUT +rm -rf $DATA_INPUT -set -e -set -x +# Run the job +$HADOOP_PREFIX/bin/hadoop jar \ + $HADOOP_PREFIX/share/hadoop/mapreduce/hadoop-mapreduce-examples-*.jar \ + wordcount $WC_INPUT $WC_OUTPUT -# Clear out in case there was a previous run (idempotency) -if $HADOOP_PREFIX/bin/hadoop fs -ls /wordcount_output 2>/dev/null ; then - $HADOOP_PREFIX/bin/hadoop fs -rmr /wordcount_output -fi -if $HADOOP_PREFIX/bin/hadoop fs -ls /wordcount_input 2>/dev/null ; then - $HADOOP_PREFIX/bin/hadoop fs -rmr /wordcount_input -fi -rm -rf $TESTDIR/hadoop_input - -# Load input files into local filesystem -mkdir -p $TESTDIR/hadoop_input -wget http://ceph.com/qa/hadoop_input_files.tar -O $TESTDIR/hadoop_input/files.tar -cd $TESTDIR/hadoop_input -tar -xf $TESTDIR/hadoop_input/files.tar - -# Load input files into hadoop filesystem -$HADOOP_PREFIX/bin/hadoop fs -mkdir /wordcount_input -$HADOOP_PREFIX/bin/hadoop fs -put $TESTDIR/hadoop_input/*txt /wordcount_input/ - -# Execute job -$HADOOP_PREFIX/bin/hadoop jar $HADOOP_MR_HOME/hadoop*examples*jar wordcount /wordcount_input /wordcount_output - -# Clean up -$HADOOP_PREFIX/bin/hadoop fs -rmr /wordcount_output -$HADOOP_PREFIX/bin/hadoop fs -rmr /wordcount_input -cd $TESTDIR -rm -rf $TESTDIR/hadoop_input +# Cleanup +$HADOOP_PREFIX/bin/hadoop fs -rm -r $WC_INPUT $WC_OUTPUT || true echo "completed hadoop-wordcount test" exit 0 diff --git a/qa/workunits/kernel_untar_build.sh b/qa/workunits/kernel_untar_build.sh index 279335200e3a0..cdab906432e95 100755 --- a/qa/workunits/kernel_untar_build.sh +++ b/qa/workunits/kernel_untar_build.sh @@ -2,12 +2,11 @@ set -e -#wget -q http://ceph.newdream.net/qa/linux-2.6.33.tar.bz2 -wget -q http://ceph.com/qa/linux-3.2.9.tar.bz2 +wget -q http://ceph.com/qa/linux-4.0.5.tar.xz mkdir t cd t -tar jxvf ../linux*.bz2 +tar Jxvf ../linux*.xz cd linux* make defconfig make -j`grep -c processor /proc/cpuinfo` diff --git a/qa/workunits/libcephfs-java/test.sh b/qa/workunits/libcephfs-java/test.sh index cf5fbf0e481eb..f299e9597279f 100755 --- a/qa/workunits/libcephfs-java/test.sh +++ b/qa/workunits/libcephfs-java/test.sh @@ -12,8 +12,8 @@ fi ld_lib_path="$LD_LIBRARY_PATH" if [ -z "$ld_lib_path" ] ; then - echo "Setting ld_lib_path to /usr/lib/jni" - ld_lib_path="/usr/lib/jni" + echo "Setting ld_lib_path to /usr/lib/jni:/usr/lib64" + ld_lib_path="/usr/lib/jni:/usr/lib64" else echo "ld_lib_path was set to $ld_lib_path" fi diff --git a/qa/workunits/mon/crush_ops.sh b/qa/workunits/mon/crush_ops.sh index 80950032fb2f9..adb3162038cb6 100755 --- a/qa/workunits/mon/crush_ops.sh 
+++ b/qa/workunits/mon/crush_ops.sh @@ -63,6 +63,12 @@ ceph osd tree | grep -c host1 | grep -q 0 expect_false ceph osd crush rm bar # not empty ceph osd crush unlink host2 + +# reference foo and bar with a rule +ceph osd crush rule create-simple foo-rule foo host firstn +expect_false ceph osd crush rm foo +ceph osd crush rule rm foo-rule + ceph osd crush rm bar ceph osd crush rm foo ceph osd crush rm osd.$o2 host2 diff --git a/qa/workunits/post-file.sh b/qa/workunits/post-file.sh index a6fb765e3bdcf..02a4ca292beaa 100755 --- a/qa/workunits/post-file.sh +++ b/qa/workunits/post-file.sh @@ -1,7 +1,7 @@ #!/bin/bash -ex what="$1" -[ -z "$what" ] && what=/usr/share/base-files +[ -z "$what" ] && what=/etc/udev/rules.d ceph-post-file -d ceph-test-workunit $what echo OK diff --git a/qa/workunits/rados/test_cache_pool.sh b/qa/workunits/rados/test_cache_pool.sh index 73f37da4f43a5..4db965dad94bb 100755 --- a/qa/workunits/rados/test_cache_pool.sh +++ b/qa/workunits/rados/test_cache_pool.sh @@ -79,6 +79,8 @@ expect_false diff -q tmp.txt empty.txt # cleanup ceph osd tier remove-overlay base_pool +ceph osd tier remove base_pool wrong_cache +ceph osd tier remove base_pool partial_wrong ceph osd tier remove base_pool empty_cache ceph osd pool delete base_pool base_pool --yes-i-really-really-mean-it ceph osd pool delete empty_cache empty_cache --yes-i-really-really-mean-it @@ -127,7 +129,6 @@ rados -p cache ls - | wc -l | grep 0 # cleanup ceph osd tier remove-overlay base -ceph osd tier cache-mode cache none ceph osd tier remove base cache ceph osd pool delete cache cache --yes-i-really-really-mean-it diff --git a/qa/workunits/rados/test_rados_tool.sh b/qa/workunits/rados/test_rados_tool.sh index 3416cc93f04f4..47ea72b2d71f2 100755 --- a/qa/workunits/rados/test_rados_tool.sh +++ b/qa/workunits/rados/test_rados_tool.sh @@ -17,7 +17,11 @@ test_omap() { cleanup for i in $(seq 1 1 600) do - rados -p $POOL setomapval $OBJ $i $i + if [ $(($i % 2)) -eq 0 ]; then + rados -p $POOL setomapval $OBJ $i $i + else + echo -n "$i" | rados -p $POOL setomapval $OBJ $i + fi rados -p $POOL getomapval $OBJ $i | grep -q "\\: $i\$" done rados -p $POOL listomapvals $OBJ | grep -c value | grep 600 diff --git a/qa/workunits/rbd/concurrent.sh b/qa/workunits/rbd/concurrent.sh index ceb4563567fea..2f3ce9c3664dd 100755 --- a/qa/workunits/rbd/concurrent.sh +++ b/qa/workunits/rbd/concurrent.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/bash -e # Copyright (C) 2013 Inktank Storage, Inc. # @@ -76,10 +76,9 @@ function setup() { [ -d /sys/bus/rbd ] || sudo modprobe rbd - # This assumes it's easier to read a file than generate - # random data. Use busybox because it is a big executable. - dd if="/bin/busybox" of="{SOURCE_DATA}" bs=2048 count=66 \ - >/dev/null 2>&1 + # Use urandom to generate SOURCE_DATA + dd if=/dev/urandom of=${SOURCE_DATA} bs=2048 count=66 \ + >/dev/null 2>&1 # List of rbd id's *not* created by this script export INITIAL_RBD_IDS=$(ls /sys/bus/rbd/devices) @@ -88,14 +87,8 @@ function setup() { # Set up some environment for normal teuthology test setup. # This really should not be necessary but I found it was. 
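A note on the concurrent.sh hunk below: the old teuthology-era environment block is collapsed into a single CEPH_ARGS export, which works because the ceph/rados/rbd CLI tools prepend $CEPH_ARGS to their command line. A minimal sketch of the equivalence (assuming a client.0 key is already present in the default keyring):

    export CEPH_ARGS="--name client.0"
    rbd ls                    # runs as client.0
    unset CEPH_ARGS
    rbd --name client.0 ls    # same effect, spelled out per command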
- TOP="/tmp/cephtest" - export CEPH_ARGS="--conf ${TOP}/ceph.conf" - export CEPH_ARGS="${CEPH_ARGS} --keyring ${TOP}/data/client.0.keyring" - export CEPH_ARGS="${CEPH_ARGS} --name client.0" - - export LD_LIBRARY_PATH="${TOP}/binary/usr/local/lib:${LD_LIBRARY_PATH}" - export PATH="${TOP}/binary/usr/local/bin:${PATH}" - export PATH="${TOP}/binary/usr/local/sbin:${PATH}" + + export CEPH_ARGS=" --name client.0" } function cleanup() { @@ -116,7 +109,7 @@ function cleanup() { wait sync rm -f "${SOURCE_DATA}" - [ -d "${NAMES_DIR}" ] && rmdir -f "${NAMES_DIR}" + [ -d "${NAMES_DIR}" ] && rmdir "${NAMES_DIR}" sudo chown root /sys/bus/rbd/add /sys/bus/rbd/remove echo "Max concurrent rbd image count was $(get_max "${ID_COUNT_DIR}")" rm -rf "${ID_COUNT_DIR}" @@ -283,7 +276,7 @@ function rbd_write_image() { # Offset and size here are meant to ensure beginning and end # cross both (4K or 64K) page and (4MB) rbd object boundaries. # It assumes the SOURCE_DATA file has size 66 * 2048 bytes - dd "${SOURCE_DATA}" of="/dev/rbd${id}" bs=2048 seek=2015 \ + dd if="${SOURCE_DATA}" of="/dev/rbd${id}" bs=2048 seek=2015 \ > /dev/null 2>&1 } @@ -323,7 +316,7 @@ function rbd_read_image() { # zero-fills unwritten data when the target object doesn't # exist. dd if="/dev/rbd${id}" of=/dev/null bs=2048 count=34 skip=4098 \ - /dev/null 2>&1 + > /dev/null 2>&1 } function rbd_unmap_image() { diff --git a/qa/workunits/rbd/copy.sh b/qa/workunits/rbd/copy.sh index 27a1789c07a07..746dd389b2437 100755 --- a/qa/workunits/rbd/copy.sh +++ b/qa/workunits/rbd/copy.sh @@ -84,8 +84,8 @@ test_rename() { echo "testing rename..." remove_images - rbd create -s 1 foo - rbd create --new-format -s 1 bar + rbd create --image-format 1 -s 1 foo + rbd create --image-format 2 -s 1 bar rbd rename foo foo2 rbd rename foo2 bar 2>&1 | grep exists rbd rename bar bar2 @@ -108,8 +108,8 @@ test_ls() { echo "testing ls..." remove_images - rbd create -s 1 test1 - rbd create -s 1 test2 + rbd create --image-format 1 -s 1 test1 + rbd create --image-format 1 -s 1 test2 rbd ls | grep test1 rbd ls | grep test2 rbd ls | wc -l | grep 2 @@ -120,8 +120,8 @@ test_ls() { rbd rm test1 rbd rm test2 - rbd create --new-format -s 1 test1 - rbd create --new-format -s 1 test2 + rbd create --image-format 2 -s 1 test1 + rbd create --image-format 2 -s 1 test2 rbd ls | grep test1 rbd ls | grep test2 rbd ls | wc -l | grep 2 @@ -131,8 +131,8 @@ test_ls() { rbd rm test1 rbd rm test2 - rbd create --new-format -s 1 test1 - rbd create -s 1 test2 + rbd create --image-format 2 -s 1 test1 + rbd create --image-format 1 -s 1 test2 rbd ls | grep test1 rbd ls | grep test2 rbd ls | wc -l | grep 2 @@ -164,11 +164,11 @@ test_remove() { echo "testing remove..." remove_images - rbd create -s 1 test1 + rbd create --image-format 1 -s 1 test1 rbd rm test1 rbd ls | wc -l | grep "^0$" - rbd create --new-format -s 1 test2 + rbd create --image-format 2 -s 1 test2 rbd rm test2 rbd ls | wc -l | grep "^0$" @@ -177,21 +177,21 @@ test_remove() { # by removing some objects manually. 
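The test_remove hunks below simulate a damaged image by deleting its header object directly, and the header's name depends on the image format: format 1 keeps a <name>.rbd object, format 2 keeps rbd_header.<id>. A sketch of the object layout being relied on:

    rbd create --image-format 1 -s 1 img1
    rbd create --image-format 2 -s 1 img2
    rados -p rbd ls | grep -E '\.rbd$|^rbd_header\.'   # one header object per image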
# remove with header missing (old format) - rbd create -s 1 test1 + rbd create --image-format 1 -s 1 test1 rados rm -p rbd test1.rbd rbd rm test1 rbd ls | wc -l | grep "^0$" if [ $tiered -eq 0 ]; then # remove with header missing - rbd create --new-format -s 1 test2 + rbd create --image-format 2 -s 1 test2 HEADER=$(rados -p rbd ls | grep '^rbd_header') rados -p rbd rm $HEADER rbd rm test2 rbd ls | wc -l | grep "^0$" # remove with header and id missing - rbd create --new-format -s 1 test2 + rbd create --image-format 2 -s 1 test2 HEADER=$(rados -p rbd ls | grep '^rbd_header') rados -p rbd rm $HEADER rados -p rbd rm rbd_id.test2 @@ -201,7 +201,7 @@ test_remove() { # remove with rbd_children object missing (and, by extension, # with child not mentioned in rbd_children) - rbd create --new-format -s 1 test2 + rbd create --image-format 2 -s 1 test2 rbd snap create test2@snap rbd snap protect test2@snap rbd clone test2@snap clone @@ -233,11 +233,15 @@ test_locking() { rbd lock list test1 | grep ' 2 ' rbd lock add test1 id2 --shared tag rbd lock list test1 | grep ' 3 ' - LOCKER=$(rbd lock list test1 | tail -n 1 | awk '{print $1;}') - ID=$(rbd lock list test1 | tail -n 1 | awk '{print $2;}') - rbd lock remove test1 $ID $LOCKER - # locks don't prevent you from removing an image, - # just from taking a lock + rbd lock list test1 | tail -n 1 | awk '{print $2, $1;}' | xargs rbd lock remove test1 + if rbd info test1 | grep -qE "features:.*exclusive" + then + # new locking functionality requires all locks to be released + while [ -n "$(rbd lock list test1)" ] + do + rbd lock list test1 | tail -n 1 | awk '{print $2, $1;}' | xargs rbd lock remove test1 + done + fi rbd rm test1 } diff --git a/qa/workunits/rbd/huge-tickets.sh b/qa/workunits/rbd/huge-tickets.sh new file mode 100755 index 0000000000000..63a63846bd053 --- /dev/null +++ b/qa/workunits/rbd/huge-tickets.sh @@ -0,0 +1,41 @@ +#!/bin/bash + +# This is a test for http://tracker.ceph.com/issues/8979 and the fallout +# from triaging it. #8979 itself was random crashes on corrupted memory +# due to a buffer overflow (for tickets larger than 256 bytes), further +# inspection showed that vmalloced tickets weren't handled correctly as +# well. +# +# What we are doing here is generating three huge keyrings and feeding +# them to libceph (through 'rbd map' on a scratch image). Bad kernels +# will crash reliably either on corrupted memory somewhere or a bad page +# fault in scatterwalk_pagedone(). 
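The kmalloc/vmalloc size claims in the comments above can be spot-checked once the script below has generated the keyrings; a small sketch:

    for user in foo bar baz; do
        stat -c '%n: %s bytes' /tmp/keyring-$user
    done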
+ +set -ex + +function generate_keyring() { + local user=$1 + local n=$2 + + ceph-authtool -C -n client.$user --cap mon 'allow *' --gen-key /tmp/keyring-$user + + set +x # don't pollute trace with echos + echo -en "\tcaps osd = \"allow rwx pool=rbd" >>/tmp/keyring-$user + for i in $(seq 1 $n); do + echo -n ", allow rwx pool=pool$i" >>/tmp/keyring-$user + done + echo "\"" >>/tmp/keyring-$user + set -x +} + +generate_keyring foo 1000 # ~25K, kmalloc +generate_keyring bar 20000 # ~500K, vmalloc +generate_keyring baz 300000 # ~8M, vmalloc + sg chaining + +rbd create --size 1 test + +for user in {foo,bar,baz}; do + ceph auth import -i /tmp/keyring-$user + DEV=$(sudo rbd map -n client.$user --keyring /tmp/keyring-$user test) + sudo rbd unmap $DEV +done diff --git a/qa/workunits/rbd/image_read.sh b/qa/workunits/rbd/image_read.sh index d96a3c7ae8eb0..24fcea2a327f3 100755 --- a/qa/workunits/rbd/image_read.sh +++ b/qa/workunits/rbd/image_read.sh @@ -336,7 +336,8 @@ function create_image() { fi rbd create "${image_name}" --image-format "${FORMAT}" \ - --size "${IMAGE_SIZE}" --order "${OBJECT_ORDER}" + --size "${IMAGE_SIZE}" --order "${OBJECT_ORDER}" \ + --image-shared } function destroy_image() { @@ -476,7 +477,8 @@ function create_snap_clone() { fi rbd snap protect "${image_snap}" - rbd clone --order "${clone_order}" "${image_snap}" "${clone_name}" + rbd clone --order "${clone_order}" --image-shared \ + "${image_snap}" "${clone_name}" } function destroy_snap_clone() { diff --git a/qa/workunits/rbd/import_export.sh b/qa/workunits/rbd/import_export.sh index d1ba29a4f1ece..d3d164ddfbd05 100755 --- a/qa/workunits/rbd/import_export.sh +++ b/qa/workunits/rbd/import_export.sh @@ -9,7 +9,7 @@ objects () { # it doesn't necessarily make sense as they're hex, at least it makes # the list repeatable and comparable objects=$(rados ls -p rbd | grep $prefix | \ - sed -e 's/'$prefix'\.//' -e 's/^0*\([0-9a-f]\)/\1/' | sort) + sed -e 's/'$prefix'\.//' -e 's/^0*\([0-9a-f]\)/\1/' | sort -u) echo $objects } diff --git a/qa/workunits/rbd/merge_diff.sh b/qa/workunits/rbd/merge_diff.sh new file mode 100755 index 0000000000000..7fbcd2f9c86ec --- /dev/null +++ b/qa/workunits/rbd/merge_diff.sh @@ -0,0 +1,469 @@ +#!/bin/bash -ex + +pool=rbd +gen=$pool/gen +out=$pool/out +testno=1 + +mkdir -p merge_diff_test +pushd merge_diff_test + +function expect_false() +{ + if "$@"; then return 1; else return 0; fi +} + +function clear_all() +{ + fusermount -u mnt || true + + rbd snap purge --no-progress $gen || true + rbd rm --no-progress $gen || true + rbd snap purge --no-progress $out || true + rbd rm --no-progress $out || true + + rm -rf diffs || true +} + +function rebuild() +{ + clear_all + echo Starting test $testno + ((testno++)) + rbd create $gen --size 100 --order $1 --stripe_unit $2 --stripe_count $3 --image-format $4 + rbd create $out --size 1 --order 19 + mkdir -p mnt diffs + # lttng has atexit handlers that need to be fork/clone aware + LD_PRELOAD=liblttng-ust-fork.so.0 rbd-fuse -p $pool mnt +} + +function write() +{ + dd if=/dev/urandom of=mnt/gen bs=1M conv=notrunc seek=$1 count=$2 +} + +function snap() +{ + rbd snap create $gen@$1 +} + +function resize() +{ + rbd resize --no-progress $gen --size $1 --allow-shrink +} + +function export_diff() +{ + if [ $2 == "head" ]; then + target="$gen" + else + target="$gen@$2" + fi + if [ $1 == "null" ]; then + rbd export-diff --no-progress $target diffs/$1.$2 + else + rbd export-diff --no-progress $target --from-snap $1 diffs/$1.$2 + fi +} + +function merge_diff() +{ + rbd merge-diff 
diffs/$1.$2 diffs/$2.$3 diffs/$1.$3 +} + +function check() +{ + rbd import-diff --no-progress diffs/$1.$2 $out || return -1 + if [ "$2" == "head" ]; then + sum1=`rbd export $gen - | md5sum` + else + sum1=`rbd export $gen@$2 - | md5sum` + fi + sum2=`rbd export $out - | md5sum` + if [ "$sum1" != "$sum2" ]; then + exit -1 + fi + if [ "$2" != "head" ]; then + rbd snap ls $out | awk '{print $2}' | grep "^$2\$" || return -1 + fi +} + +#test f/t header +rebuild 22 4194304 1 2 +write 0 1 +snap a +write 1 1 +export_diff null a +export_diff a head +merge_diff null a head +check null head + +rebuild 22 4194304 1 2 +write 0 1 +snap a +write 1 1 +snap b +write 2 1 +export_diff null a +export_diff a b +export_diff b head +merge_diff null a b +check null b + +rebuild 22 4194304 1 2 +write 0 1 +snap a +write 1 1 +snap b +write 2 1 +export_diff null a +export_diff a b +export_diff b head +merge_diff a b head +check null a +check a head + +rebuild 22 4194304 1 2 +write 0 1 +snap a +write 1 1 +snap b +write 2 1 +export_diff null a +export_diff a b +export_diff b head +rbd merge-diff diffs/null.a diffs/a.b - | rbd merge-diff - diffs/b.head - > diffs/null.head +check null head + +#data test +rebuild 22 4194304 1 2 +write 4 2 +snap s101 +write 0 3 +write 8 2 +snap s102 +export_diff null s101 +export_diff s101 s102 +merge_diff null s101 s102 +check null s102 + +rebuild 22 4194304 1 2 +write 0 3 +write 2 5 +write 8 2 +snap s201 +write 0 2 +write 6 3 +snap s202 +export_diff null s201 +export_diff s201 s202 +merge_diff null s201 s202 +check null s202 + +rebuild 22 4194304 1 2 +write 0 4 +write 12 6 +snap s301 +write 0 6 +write 10 5 +write 16 4 +snap s302 +export_diff null s301 +export_diff s301 s302 +merge_diff null s301 s302 +check null s302 + +rebuild 22 4194304 1 2 +write 0 12 +write 14 2 +write 18 2 +snap s401 +write 1 2 +write 5 6 +write 13 3 +write 18 2 +snap s402 +export_diff null s401 +export_diff s401 s402 +merge_diff null s401 s402 +check null s402 + +rebuild 22 4194304 1 2 +write 2 4 +write 10 12 +write 27 6 +write 36 4 +snap s501 +write 0 24 +write 28 4 +write 36 4 +snap s502 +export_diff null s501 +export_diff s501 s502 +merge_diff null s501 s502 +check null s502 + +rebuild 22 4194304 1 2 +write 0 8 +resize 5 +snap r1 +resize 20 +write 12 8 +snap r2 +resize 8 +write 4 4 +snap r3 +export_diff null r1 +export_diff r1 r2 +export_diff r2 r3 +merge_diff null r1 r2 +merge_diff null r2 r3 +check null r3 + +rebuild 22 4194304 1 2 +write 0 8 +resize 5 +snap r1 +resize 20 +write 12 8 +snap r2 +resize 8 +write 4 4 +snap r3 +resize 10 +snap r4 +export_diff null r1 +export_diff r1 r2 +export_diff r2 r3 +export_diff r3 r4 +merge_diff null r1 r2 +merge_diff null r2 r3 +merge_diff null r3 r4 +check null r4 + +rebuild 22 65536 8 2 +write 0 32 +snap r1 +write 16 32 +snap r2 +export_diff null r1 +export_diff r1 r2 +expect_false merge_diff null r1 r2 + +rebuild 22 4194304 1 2 +write 0 1 +write 2 1 +write 4 1 +write 6 1 +snap s1 +write 1 1 +write 3 1 +write 5 1 +snap s2 +export_diff null s1 +export_diff s1 s2 +merge_diff null s1 s2 +check null s2 + +rebuild 22 4194304 1 2 +write 1 1 +write 3 1 +write 5 1 +snap s1 +write 0 1 +write 2 1 +write 4 1 +write 6 1 +snap s2 +export_diff null s1 +export_diff s1 s2 +merge_diff null s1 s2 +check null s2 + +rebuild 22 4194304 1 2 +write 0 3 +write 6 3 +write 12 3 +snap s1 +write 1 1 +write 7 1 +write 13 1 +snap s2 +export_diff null s1 +export_diff s1 s2 +merge_diff null s1 s2 +check null s2 + +rebuild 22 4194304 1 2 +write 0 3 +write 6 3 +write 12 3 +snap s1 +write 0 1 +write 6 1 
+write 12 1 +snap s2 +export_diff null s1 +export_diff s1 s2 +merge_diff null s1 s2 +check null s2 + +rebuild 22 4194304 1 2 +write 0 3 +write 6 3 +write 12 3 +snap s1 +write 2 1 +write 8 1 +write 14 1 +snap s2 +export_diff null s1 +export_diff s1 s2 +merge_diff null s1 s2 +check null s2 + +rebuild 22 4194304 1 2 +write 1 1 +write 7 1 +write 13 1 +snap s1 +write 0 3 +write 6 3 +write 12 3 +snap s2 +export_diff null s1 +export_diff s1 s2 +merge_diff null s1 s2 +check null s2 + +rebuild 22 4194304 1 2 +write 0 1 +write 6 1 +write 12 1 +snap s1 +write 0 3 +write 6 3 +write 12 3 +snap s2 +export_diff null s1 +export_diff s1 s2 +merge_diff null s1 s2 +check null s2 + +rebuild 22 4194304 1 2 +write 2 1 +write 8 1 +write 14 1 +snap s1 +write 0 3 +write 6 3 +write 12 3 +snap s2 +export_diff null s1 +export_diff s1 s2 +merge_diff null s1 s2 +check null s2 + +rebuild 22 4194304 1 2 +write 0 3 +write 6 3 +write 12 3 +snap s1 +write 0 3 +write 6 3 +write 12 3 +snap s2 +export_diff null s1 +export_diff s1 s2 +merge_diff null s1 s2 +check null s2 + +rebuild 22 4194304 1 2 +write 2 4 +write 8 4 +write 14 4 +snap s1 +write 0 3 +write 6 3 +write 12 3 +snap s2 +export_diff null s1 +export_diff s1 s2 +merge_diff null s1 s2 +check null s2 + +rebuild 22 4194304 1 2 +write 0 4 +write 6 4 +write 12 4 +snap s1 +write 0 3 +write 6 3 +write 12 3 +snap s2 +export_diff null s1 +export_diff s1 s2 +merge_diff null s1 s2 +check null s2 + +rebuild 22 4194304 1 2 +write 0 6 +write 6 6 +write 12 6 +snap s1 +write 0 3 +write 6 3 +write 12 3 +snap s2 +export_diff null s1 +export_diff s1 s2 +merge_diff null s1 s2 +check null s2 + +rebuild 22 4194304 1 2 +write 3 6 +write 9 6 +write 15 6 +snap s1 +write 0 3 +write 6 3 +write 12 3 +snap s2 +export_diff null s1 +export_diff s1 s2 +merge_diff null s1 s2 +check null s2 + +rebuild 22 4194304 1 2 +write 0 8 +snap s1 +resize 2 +resize 100 +snap s2 +export_diff null s1 +export_diff s1 s2 +merge_diff null s1 s2 +check null s2 + +rebuild 22 4194304 1 2 +write 0 8 +snap s1 +resize 2 +resize 100 +snap s2 +write 20 2 +snap s3 +export_diff null s1 +export_diff s1 s2 +export_diff s2 s3 +merge_diff s1 s2 s3 +check null s1 +check s1 s3 + +#addme + +clear_all +popd +rm -rf merge_diff_test + +echo OK diff --git a/qa/workunits/rbd/notify_master.sh b/qa/workunits/rbd/notify_master.sh new file mode 100755 index 0000000000000..3d1b2243c56c0 --- /dev/null +++ b/qa/workunits/rbd/notify_master.sh @@ -0,0 +1,7 @@ +#!/bin/sh -ex + +CEPH_REF=${CEPH_REF:-master} +wget -O test_notify.py "https://git.ceph.com/?p=ceph.git;a=blob_plain;hb=$CEPH_REF;f=src/test/librbd/test_notify.py" + +python test_notify.py master +exit 0 diff --git a/qa/workunits/rbd/notify_slave.sh b/qa/workunits/rbd/notify_slave.sh new file mode 100755 index 0000000000000..e94894ac5b87a --- /dev/null +++ b/qa/workunits/rbd/notify_slave.sh @@ -0,0 +1,7 @@ +#!/bin/sh -ex + +CEPH_REF=${CEPH_REF:-master} +wget -O test_notify.py "https://git.ceph.com/?p=ceph.git;a=blob_plain;hb=$CEPH_REF;f=src/test/librbd/test_notify.py" + +python test_notify.py slave +exit 0 diff --git a/qa/workunits/rbd/permissions.sh b/qa/workunits/rbd/permissions.sh index a4b0c32fdee6e..b38ad067bf457 100755 --- a/qa/workunits/rbd/permissions.sh +++ b/qa/workunits/rbd/permissions.sh @@ -12,11 +12,11 @@ delete_pools() { } recreate_pools() { - create_pools delete_pools + create_pools } -delete_uers() { +delete_users() { (ceph auth del client.volumes || true) >/dev/null 2>&1 (ceph auth del client.images || true) >/dev/null 2>&1 } @@ -26,31 +26,54 @@ create_users() { ceph auth 
get-or-create client.images mon 'allow r' osd 'allow class-read object_prefix rbd_children, allow rwx pool images' >> $KEYRING } +expect() { + + set +e + + local expected_ret=$1 + local ret + + shift + cmd=$@ + + eval $cmd + ret=$? + + set -e + + if [[ $ret -ne $expected_ret ]]; then + echo "ERROR: running \'$cmd\': expected $expected_ret got $ret" + return 1 + fi + + return 0 +} + test_images_access() { - rbd -k $KEYRING --id images create --format 2 -s 1 images/foo + rbd -k $KEYRING --id images create --image-format 2 -s 1 images/foo rbd -k $KEYRING --id images snap create images/foo@snap rbd -k $KEYRING --id images snap protect images/foo@snap rbd -k $KEYRING --id images snap unprotect images/foo@snap rbd -k $KEYRING --id images snap protect images/foo@snap rbd -k $KEYRING --id images export images/foo@snap - >/dev/null - ! rbd -k $KEYRING --id images snap rm images/foo@snap + expect 16 rbd -k $KEYRING --id images snap rm images/foo@snap rbd -k $KEYRING --id volumes clone images/foo@snap volumes/child - ! rbd -k $KEYRING --id images snap unprotect images/foo@snap - ! rbd -k $KEYRING --id volumes snap unprotect images/foo@snap - ! rbd -k $KEYRING --id images flatten volumes/child + expect 16 rbd -k $KEYRING --id images snap unprotect images/foo@snap + expect 1 rbd -k $KEYRING --id volumes snap unprotect images/foo@snap + expect 1 rbd -k $KEYRING --id images flatten volumes/child rbd -k $KEYRING --id volumes flatten volumes/child - ! rbd -k $KEYRING --id volumes snap unprotect images/foo@snap + expect 1 rbd -k $KEYRING --id volumes snap unprotect images/foo@snap rbd -k $KEYRING --id images snap unprotect images/foo@snap - ! rbd -k $KEYRING --id images rm images/foo + expect 39 rbd -k $KEYRING --id images rm images/foo rbd -k $KEYRING --id images snap rm images/foo@snap rbd -k $KEYRING --id images rm images/foo rbd -k $KEYRING --id volumes rm volumes/child } test_volumes_access() { - rbd -k $KEYRING --id images create --format 2 -s 1 images/foo + rbd -k $KEYRING --id images create --image-format 2 -s 1 images/foo rbd -k $KEYRING --id images snap create images/foo@snap rbd -k $KEYRING --id images snap protect images/foo@snap @@ -64,16 +87,16 @@ test_volumes_access() { rbd -k $KEYRING --id volumes lock list images/foo # commands that fail with read-only access - ! rbd -k $KEYRING --id volumes resize -s 2 images/foo --allow-shrink - ! rbd -k $KEYRING --id volumes snap create images/foo@2 - ! rbd -k $KEYRING --id volumes snap rollback images/foo@snap - ! rbd -k $KEYRING --id volumes snap remove images/foo@snap - ! rbd -k $KEYRING --id volumes snap purge images/foo - ! rbd -k $KEYRING --id volumes snap unprotect images/foo@snap - ! rbd -k $KEYRING --id volumes flatten images/foo - ! rbd -k $KEYRING --id volumes lock add images/foo test - ! rbd -k $KEYRING --id volumes lock remove images/foo test locker - ! 
rbd -k $KEYRING --id volumes ls rbd + expect 1 rbd -k $KEYRING --id volumes resize -s 2 images/foo --allow-shrink + expect 1 rbd -k $KEYRING --id volumes snap create images/foo@2 + expect 1 rbd -k $KEYRING --id volumes snap rollback images/foo@snap + expect 1 rbd -k $KEYRING --id volumes snap remove images/foo@snap + expect 1 rbd -k $KEYRING --id volumes snap purge images/foo + expect 1 rbd -k $KEYRING --id volumes snap unprotect images/foo@snap + expect 1 rbd -k $KEYRING --id volumes flatten images/foo + expect 1 rbd -k $KEYRING --id volumes lock add images/foo test + expect 1 rbd -k $KEYRING --id volumes lock remove images/foo test locker + expect 1 rbd -k $KEYRING --id volumes ls rbd # create clone and snapshot rbd -k $KEYRING --id volumes clone images/foo@snap volumes/child @@ -82,14 +105,14 @@ test_volumes_access() { rbd -k $KEYRING --id volumes snap create volumes/child@snap2 # make sure original snapshot stays protected - ! rbd -k $KEYRING --id images snap unprotect images/foo@snap + expect 16 rbd -k $KEYRING --id images snap unprotect images/foo@snap rbd -k $KEYRING --id volumes flatten volumes/child - ! rbd -k $KEYRING --id images snap unprotect images/foo@snap + expect 16 rbd -k $KEYRING --id images snap unprotect images/foo@snap rbd -k $KEYRING --id volumes snap rm volumes/child@snap2 - ! rbd -k $KEYRING --id images snap unprotect images/foo@snap - ! rbd -k $KEYRING --id volumes snap rm volumes/child@snap2 + expect 16 rbd -k $KEYRING --id images snap unprotect images/foo@snap + expect 2 rbd -k $KEYRING --id volumes snap rm volumes/child@snap2 rbd -k $KEYRING --id volumes snap unprotect volumes/child@snap1 - ! rbd -k $KEYRING --id images snap unprotect images/foo@snap + expect 16 rbd -k $KEYRING --id images snap unprotect images/foo@snap # clean up rbd -k $KEYRING --id volumes snap rm volumes/child@snap1 diff --git a/qa/workunits/rbd/qemu-iotests.sh b/qa/workunits/rbd/qemu-iotests.sh index 9dfca3e6ed15f..9883e373b4e1d 100755 --- a/qa/workunits/rbd/qemu-iotests.sh +++ b/qa/workunits/rbd/qemu-iotests.sh @@ -8,33 +8,41 @@ # This will only work with particular qemu versions, like 1.0. Later # versions of qemu include qemu-iotests directly in the qemu # repository. -codevers=`lsb_release -sc` -iotests=qemu-iotests testlist='001 002 003 004 005 008 009 010 011 021 025' # See if we need to use the iotests suites in qemu (newer version). -# Right now, trusty is the only version that uses this. -for chkcode in "trusty" -do - if [ "$chkcode" = "$codevers" ] - then +case `lsb_release -sc` in + trusty|Maipo) iotests=qemu/tests/qemu-iotests - fi -done + ;; + *) + iotests=qemu-iotests + ;; +esac if [ "$iotests" = "qemu/tests/qemu-iotests" ] then - git clone git://apt-mirror.front.sepia.ceph.com/qemu.git + git clone https://github.com/qemu/qemu.git + # use v2.2.0-rc3 (last released version that handles all the tests) + cd qemu + git checkout 2528043f1f299e0e88cb026f1ca7c40bbb4e1f80 + cd .. testlist=$testlist' 032 033 055 077' else - git clone git://ceph.com/git/qemu-iotests.git + git clone git://git.ceph.com/qemu-iotests.git fi cd "$iotests" mkdir bin # qemu-iotests expects a binary called just 'qemu' to be available -ln -s `which qemu-system-x86_64` bin/qemu +if [ -x '/usr/bin/qemu-system-x86_64' ] +then + QEMU='/usr/bin/qemu-system-x86_64' +else + QEMU='/usr/libexec/qemu-kvm' +fi +ln -s $QEMU bin/qemu # this is normally generated by configure, but has nothing but a python # binary definition, which we don't care about.
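The qemu lookup in the hunk above hard-codes the two common install locations (Debian-style /usr/bin and RHEL-style /usr/libexec). An equivalent lookup with command -v would be (a sketch, not what the script does):

    QEMU=$(command -v qemu-system-x86_64 || echo /usr/libexec/qemu-kvm)
    ln -s "$QEMU" bin/qemu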
for some reason it is diff --git a/qa/workunits/rbd/test_librbd.sh b/qa/workunits/rbd/test_librbd.sh index d35cfafb15942..f3999f4bba4a6 100755 --- a/qa/workunits/rbd/test_librbd.sh +++ b/qa/workunits/rbd/test_librbd.sh @@ -1,5 +1,9 @@ #!/bin/sh -e -ceph_test_librbd - +if [ -n "${VALGRIND}" ]; then + valgrind --tool=${VALGRIND} --suppressions=${TESTDIR}/valgrind.supp \ + ceph_test_librbd +else + ceph_test_librbd +fi exit 0 diff --git a/qa/workunits/rbd/test_librbd_api.sh b/qa/workunits/rbd/test_librbd_api.sh new file mode 100755 index 0000000000000..975144b88992c --- /dev/null +++ b/qa/workunits/rbd/test_librbd_api.sh @@ -0,0 +1,4 @@ +#!/bin/sh -e + +ceph_test_librbd_api +exit 0 diff --git a/qa/workunits/rbd/test_librbd_python.sh b/qa/workunits/rbd/test_librbd_python.sh index 0d89b369d0247..eba66bab6035a 100755 --- a/qa/workunits/rbd/test_librbd_python.sh +++ b/qa/workunits/rbd/test_librbd_python.sh @@ -4,5 +4,11 @@ CEPH_REF=${CEPH_REF:-master} #wget -q https://raw.github.com/ceph/ceph/$CEPH_REF/src/test/pybind/test_rbd.py wget -O test_rbd.py "https://ceph.com/git/?p=ceph.git;a=blob_plain;hb=$CEPH_REF;f=src/test/pybind/test_rbd.py" || \ wget -O test_rbd.py "https://ceph.com/git/?p=ceph.git;a=blob_plain;hb=ref/heads/$CEPH_REF;f=src/test/pybind/test_rbd.py" -nosetests -v test_rbd + +if [ -n "${VALGRIND}" ]; then + valgrind --tool=${VALGRIND} --suppressions=${TESTDIR}/valgrind.supp \ + nosetests -v test_rbd +else + nosetests -v test_rbd +fi exit 0 diff --git a/qa/workunits/rbd/test_lock_fence.sh b/qa/workunits/rbd/test_lock_fence.sh index ee015043fbdee..dc34cb1deb1e2 100755 --- a/qa/workunits/rbd/test_lock_fence.sh +++ b/qa/workunits/rbd/test_lock_fence.sh @@ -8,7 +8,7 @@ CEPH_REF=${CEPH_REF:-master} wget -O $RBDRW "https://ceph.com/git/?p=ceph.git;a=blob_plain;hb=$CEPH_REF;f=src/test/librbd/rbdrw.py" -rbd create $IMAGE --size 10 --image-format 2 || exit 1 +rbd create $IMAGE --size 10 --image-format 2 --image-shared || exit 1 # rbdrw loops doing I/O to $IMAGE after locking with lockid $LOCKID python $RBDRW $IMAGE $LOCKID & diff --git a/qa/workunits/rest/test.py b/qa/workunits/rest/test.py index 7dcaffe5e06b8..19808badb7c15 100755 --- a/qa/workunits/rest/test.py +++ b/qa/workunits/rest/test.py @@ -381,9 +381,9 @@ def expect_nofail(url, method, respcode, contenttype, extra_hdrs=None, expect('pg/set_full_ratio?ratio=0.85', 'PUT', 200, '') r = expect('pg/stat', 'GET', 200, 'json', JSONHDR) - assert('pg_stats_sum' in r.myjson['output']) + assert('num_pgs' in r.myjson['output']) r = expect('pg/stat', 'GET', 200, 'xml', XMLHDR) - assert(r.tree.find('output/pg_map/pg_stats_sum') is not None) + assert(r.tree.find('output/pg_summary/num_pgs') is not None) expect('tell/0.0/query', 'GET', 200, 'json', JSONHDR) expect('quorum?quorumcmd=enter', 'PUT', 200, 'json', JSONHDR) diff --git a/qa/workunits/rgw/s3_bucket_quota.pl b/qa/workunits/rgw/s3_bucket_quota.pl index ae6e273a17afc..6a4a1a4513c6d 100755 --- a/qa/workunits/rgw/s3_bucket_quota.pl +++ b/qa/workunits/rgw/s3_bucket_quota.pl @@ -37,6 +37,7 @@ =head1 ARGUMENTS use FindBin; use lib $FindBin::Bin; use s3_utilities; +use Net::Domain qw(hostfqdn); my $help; @@ -51,17 +52,16 @@ =head1 ARGUMENTS my $logmsg; my $kruft; my $s3; -my $domain = "front.sepia.ceph.com"; -my $host = get_hostname(); +my $hostdom = $ENV{RGW_FQDN}||hostfqdn(); my $port = $ENV{RGW_PORT}||7280; -our $hostname = "$host.$domain:$port"; +our $hostname = "$hostdom:$port"; our $testfileloc; my $rgw_user = "qa_user"; # Function that deletes the user $rgw_user and write to logfile. 
sub delete_user { - my $cmd = "sudo radosgw-admin user rm --uid=$rgw_user"; + my $cmd = "$radosgw_admin user rm --uid=$rgw_user"; my $cmd_op = get_command_output($cmd); if ($cmd_op !~ /aborting/){ print "user $rgw_user deleted\n"; @@ -73,7 +73,7 @@ sub delete_user } sub quota_set_max_size { - my $set_quota = `sudo radosgw-admin quota set --bucket=$bucketname --max-size=1048576000`; + my $set_quota = `$radosgw_admin quota set --bucket=$bucketname --max-size=1048576000`; if ($set_quota !~ /./){ print "quota set for the bucket: $bucketname \n"; } else { @@ -85,7 +85,7 @@ sub quota_set_max_size { sub quota_set_max_size_zero { run_s3($rgw_user); - my $set_quota = `sudo radosgw-admin quota set --bucket=$bucketname --max-size=0`; + my $set_quota = `$radosgw_admin quota set --bucket=$bucketname --max-size=0`; if ($set_quota !~ /./){ pass ("quota set for the bucket: $bucketname with max size as zero\n"); } else { @@ -96,7 +96,7 @@ sub quota_set_max_size_zero { sub quota_set_max_objs_zero { run_s3($rgw_user); - my $set_quota = `sudo radosgw-admin quota set --bucket=$bucketname --max-objects=0`; + my $set_quota = `$radosgw_admin quota set --bucket=$bucketname --max-objects=0`; if ($set_quota !~ /./){ pass ("quota set for the bucket: $bucketname with max objects as zero\n"); } else { @@ -107,7 +107,7 @@ sub quota_set_max_objs_zero { sub quota_set_neg_size { run_s3($rgw_user); - my $set_quota = `sudo radosgw-admin quota set --bucket=$bucketname --max-size=-1`; + my $set_quota = `$radosgw_admin quota set --bucket=$bucketname --max-size=-1`; if ($set_quota !~ /./){ pass ("quota set for the bucket: $bucketname with max size -1\n"); } else { @@ -118,7 +118,7 @@ sub quota_set_neg_size { sub quota_set_neg_objs { run_s3($rgw_user); - my $set_quota = `sudo radosgw-admin quota set --bucket=$bucketname --max-objects=-1`; + my $set_quota = `$radosgw_admin quota set --bucket=$bucketname --max-objects=-1`; if ($set_quota !~ /./){ pass ("quota set for the bucket: $bucketname max objects -1 \n"); } else { @@ -128,8 +128,8 @@ sub quota_set_neg_objs { } sub quota_set_user_objs { - my $set_quota = `sudo radosgw-admin quota set --uid=$rgw_user --quota-scope=bucket`; - my $set_quota1 = `sudo radosgw-admin quota set --bucket=$bucketname --max-objects=1`; + my $set_quota = `$radosgw_admin quota set --uid=$rgw_user --quota-scope=bucket`; + my $set_quota1 = `$radosgw_admin quota set --bucket=$bucketname --max-objects=1`; if ($set_quota1 !~ /./){ print "bucket quota max_objs set for the given user: $bucketname \n"; } else { @@ -140,8 +140,8 @@ sub quota_set_user_objs { } sub quota_set_user_size { - my $set_quota = `sudo radosgw-admin quota set --uid=$rgw_user --quota-scope=bucket`; - my $set_quota1 = `sudo radosgw-admin quota set --bucket=$bucketname --max-size=1048576000`; + my $set_quota = `$radosgw_admin quota set --uid=$rgw_user --quota-scope=bucket`; + my $set_quota1 = `$radosgw_admin quota set --bucket=$bucketname --max-size=1048576000`; if ($set_quota1 !~ /./){ print "bucket quota max size set for the given user: $bucketname \n"; } else { @@ -153,7 +153,7 @@ sub quota_set_user_size { sub quota_set_max_obj { # set max objects - my $set_quota = `sudo radosgw-admin quota set --bucket=$bucketname --max-objects=1`; + my $set_quota = `$radosgw_admin quota set --bucket=$bucketname --max-objects=1`; if ($set_quota !~ /./){ print "quota set for the bucket: $bucketname \n"; } else { @@ -164,7 +164,7 @@ sub quota_set_max_obj { } sub quota_enable { - my $en_quota = `sudo radosgw-admin quota enable --bucket=$bucketname`; + my 
$en_quota = `$radosgw_admin quota enable --bucket=$bucketname`; if ($en_quota !~ /./){ print "quota enabled for the bucket: $bucketname \n"; } else { @@ -175,7 +175,7 @@ sub quota_enable { } sub quota_disable { - my $dis_quota = `sudo radosgw-admin quota disable --bucket=$bucketname`; + my $dis_quota = `$radosgw_admin quota disable --bucket=$bucketname`; if ($dis_quota !~ /./){ print "quota disabled for the bucket: $bucketname \n"; } else { diff --git a/qa/workunits/rgw/s3_multipart_upload.pl b/qa/workunits/rgw/s3_multipart_upload.pl index 58be2aeae01d9..5bf7af23b14a6 100755 --- a/qa/workunits/rgw/s3_multipart_upload.pl +++ b/qa/workunits/rgw/s3_multipart_upload.pl @@ -36,6 +36,7 @@ =head1 ARGUMENTS use FindBin; use lib $FindBin::Bin; use s3_utilities; +use Net::Domain qw(hostfqdn); my $help; @@ -46,10 +47,9 @@ =head1 ARGUMENTS #== local variables === my $s3; -my $domain = "front.sepia.ceph.com"; -my $host = get_hostname(); +my $hostdom = $ENV{RGW_FQDN}||hostfqdn(); my $port = $ENV{RGW_PORT}||7280; -our $hostname = "$host.$domain:$port"; +our $hostname = "$hostdom:$port"; our $testfileloc; our $mytestfilename; diff --git a/qa/workunits/rgw/s3_user_quota.pl b/qa/workunits/rgw/s3_user_quota.pl index 045d297a701b9..fbda89a66d9e3 100755 --- a/qa/workunits/rgw/s3_user_quota.pl +++ b/qa/workunits/rgw/s3_user_quota.pl @@ -36,6 +36,7 @@ =head1 ARGUMENTS use FindBin; use lib $FindBin::Bin; use s3_utilities; +use Net::Domain qw(hostfqdn); my $help; @@ -50,17 +51,16 @@ =head1 ARGUMENTS my $logmsg; my $kruft; my $s3; -my $domain = "front.sepia.ceph.com"; -my $host = get_hostname(); +my $hostdom = $ENV{RGW_FQDN}||hostfqdn(); my $port = $ENV{RGW_PORT}||7280; -our $hostname = "$host.$domain:$port"; +our $hostname = "$hostdom:$port"; our $testfileloc; our $cnt; sub quota_set_max_size_per_user { my ($maxsize, $size1,$rgw_user) = @_; run_s3($rgw_user); - my $set_quota = `sudo radosgw-admin quota set --uid=$rgw_user --quota-scope=user --max-size=$maxsize`; + my $set_quota = `$radosgw_admin quota set --uid=$rgw_user --quota-scope=user --max-size=$maxsize`; if (($set_quota !~ /./)&&($maxsize == 0)){ my $ret = test_max_objs($size1, $rgw_user); if ($ret == 1){ @@ -84,7 +84,7 @@ sub quota_set_max_size_per_user { sub max_size_per_user { my ($maxsize, $size1,$rgw_user) = @_; run_s3($rgw_user); - my $set_quota = `sudo radosgw-admin quota set --uid=$rgw_user --quota-scope=user --max-size=$maxsize`; + my $set_quota = `$radosgw_admin quota set --uid=$rgw_user --quota-scope=user --max-size=$maxsize`; if (($set_quota !~ /./) && ($maxsize != 0)) { my $ret = test_max_objs($size1, $rgw_user); if ($ret == 0){ @@ -98,7 +98,7 @@ sub quota_set_max_obj_per_user { # set max objects my ($maxobjs, $size1, $rgw_user) = @_; run_s3($rgw_user); - my $set_quota = `sudo radosgw-admin quota set --uid=$rgw_user --quota-scope=user --max-objects=$maxobjs`; + my $set_quota = `$radosgw_admin quota set --uid=$rgw_user --quota-scope=user --max-objects=$maxobjs`; if (($set_quota !~ /./) && ($maxobjs == 0)){ my $ret = test_max_objs($size1, $rgw_user); if ($ret == 1){ @@ -120,7 +120,7 @@ sub quota_set_max_obj_per_user { sub quota_enable_user { my ($rgw_user) = @_; - my $en_quota = `sudo radosgw-admin quota enable --uid=$rgw_user --quota-scope=user`; + my $en_quota = `$radosgw_admin quota enable --uid=$rgw_user --quota-scope=user`; if ($en_quota !~ /./){ print "quota enabled for the user $rgw_user \n"; } else { @@ -131,7 +131,7 @@ sub quota_enable_user { } sub quota_disable_user { - my $dis_quota = `sudo radosgw-admin quota disable --uid=$rgw_user 
`--quota-scope=user`; + my $dis_quota = `$radosgw_admin quota disable --uid=$rgw_user --quota-scope=user`; if ($dis_quota !~ /./){ print "quota disabled for the user $rgw_user \n"; } else { diff --git a/qa/workunits/rgw/s3_utilities.pm b/qa/workunits/rgw/s3_utilities.pm index e0c8b7664ba4c..8492dd328dcf5 100644 --- a/qa/workunits/rgw/s3_utilities.pm +++ b/qa/workunits/rgw/s3_utilities.pm @@ -11,6 +11,8 @@ my $isdst; my $PASS_CNT = 0; my $FAIL_CNT = 0; +our $radosgw_admin = $ENV{RGW_ADMIN}||"sudo radosgw-admin"; + # function to get the current time stamp from the test set up sub get_timestamp { ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time); @@ -89,12 +91,12 @@ sub print_border2 { sub get_user_info { my ($rgw_user) = @_; - my $cmd = "sudo radosgw-admin user create --uid=$rgw_user --display-name=$rgw_user"; + my $cmd = "$radosgw_admin user create --uid=$rgw_user --display-name=$rgw_user"; my $cmd_op = get_command_output($cmd); if ($cmd_op !~ /keys/){ return (0,0); } - my @get_user = (split/,/,$cmd_op); + my @get_user = (split/\n/,$cmd_op); foreach (@get_user) { if ($_ =~ /access_key/ ){ $get_acc_key = $_; @@ -107,13 +109,13 @@ sub get_user_info $acc_key =~ s/\\//g; $acc_key =~ s/ //g; $acc_key =~ s/"//g; + $acc_key =~ s/,//g; my $secret_key = $get_sec_key; my $sec_key = (split /:/, $secret_key)[1]; - chop($sec_key); - chop($sec_key); $sec_key =~ s/\\//g; $sec_key =~ s/ //g; $sec_key =~ s/"//g; + $sec_key =~ s/,//g; return ($acc_key, $sec_key); } @@ -121,7 +123,7 @@ sub get_user_info sub purge_data { my ($rgw_user) = @_; - my $cmd = "sudo radosgw-admin user rm --uid=$rgw_user --purge-data"; + my $cmd = "$radosgw_admin user rm --uid=$rgw_user --purge-data"; my $cmd_op = get_command_output($cmd); if ($cmd_op !~ /./){ print "user $rgw_user deleted\n"; diff --git a/qa/workunits/snaps/snaptest-0.sh b/qa/workunits/snaps/snaptest-0.sh index 6ce26044bac91..38707144ac362 100755 --- a/qa/workunits/snaps/snaptest-0.sh +++ b/qa/workunits/snaps/snaptest-0.sh @@ -8,6 +8,7 @@ expect_failure() { } set -e +ceph mds set allow_new_snaps false expect_failure mkdir .snap/foo ceph mds set allow_new_snaps true --yes-i-really-mean-it diff --git a/qa/workunits/snaps/snaptest-git-ceph.sh b/qa/workunits/snaps/snaptest-git-ceph.sh index f06280aefab35..1769fe82efb15 100755 --- a/qa/workunits/snaps/snaptest-git-ceph.sh +++ b/qa/workunits/snaps/snaptest-git-ceph.sh @@ -4,7 +4,7 @@ set -e ceph mds set allow_new_snaps true --yes-i-really-mean-it -git clone git://ceph.com/git/ceph.git +git clone git://git.ceph.com/ceph.git cd ceph versions=`seq 1 21` diff --git a/qa/workunits/snaps/snaptest-parents.sh b/qa/workunits/snaps/snaptest-parents.sh index 1310b45fe3755..6b76fdb2a971a 100644 --- a/qa/workunits/snaps/snaptest-parents.sh +++ b/qa/workunits/snaps/snaptest-parents.sh @@ -36,4 +36,6 @@ dir1=`find 1/ | wc -w` dir2=`find 2/.snap/barsnap2/a/b/c | wc -w` #diff $dir1 $dir2 && echo "Success!" test $dir1==$dir2 && echo "Success!"
-echo "OK" \ No newline at end of file +rmdir 1/.snap/* +rmdir 2/.snap/* +echo "OK" diff --git a/qa/workunits/snaps/snaptest-snap-rename.sh b/qa/workunits/snaps/snaptest-snap-rename.sh new file mode 100755 index 0000000000000..744bdb43b819c --- /dev/null +++ b/qa/workunits/snaps/snaptest-snap-rename.sh @@ -0,0 +1,38 @@ +#!/bin/sh -x + +expect_failure() { + if [ `"$@"` -e 0 ]; then + return 1 + fi + return 0 +} +set -e + +ceph mds set allow_new_snaps true --yes-i-really-mean-it + +mkdir -p d1/d2 +mkdir -p d1/d3 +mkdir d1/.snap/foo +mkdir d1/d2/.snap/foo +mkdir d1/d3/.snap/foo +mkdir d1/d3/.snap/bar +mv d1/d2/.snap/foo d1/d2/.snap/bar +# snapshot name can't start with _ +expect_failure mv d1/d2/.snap/bar d1/d2/.snap/_bar +# can't rename parent snapshot +expect_failure mv d1/d2/.snap/_foo_* d1/d2/.snap/foo +expect_failure mv d1/d2/.snap/_foo_* d1/d2/.snap/_foo_1 +# can't rename snapshot to different directory +expect_failure mv d1/d2/.snap/bar d1/.snap/ +# can't overwrite existing snapshot +expect_failure python -c "import os; os.rename('d1/d3/.snap/foo', 'd1/d3/.snap/bar')" +# can't move snapshot out of snapdir +expect_failure python -c "import os; os.rename('d1/.snap/foo', 'd1/foo')" + +rmdir d1/.snap/foo +rmdir d1/d2/.snap/bar +rmdir d1/d3/.snap/foo +rmdir d1/d3/.snap/bar +rm -rf d1 + +echo OK diff --git a/qa/workunits/suites/fsx.sh b/qa/workunits/suites/fsx.sh index 962d338657d85..238f77028d46d 100755 --- a/qa/workunits/suites/fsx.sh +++ b/qa/workunits/suites/fsx.sh @@ -2,7 +2,7 @@ set -e -git clone git://ceph.newdream.net/git/xfstests.git +git clone git://git.ceph.com/xfstests.git make -C xfstests cp xfstests/ltp/fsx . diff --git a/qa/workunits/suites/fsync-tester.sh b/qa/workunits/suites/fsync-tester.sh index b056e3be0d687..4c578d21e0ad3 100755 --- a/qa/workunits/suites/fsync-tester.sh +++ b/qa/workunits/suites/fsync-tester.sh @@ -7,5 +7,6 @@ gcc fsync-tester.c -o fsync-tester ./fsync-tester +echo $PATH +whereis lsof lsof - diff --git a/qa/workunits/suites/pjd.sh b/qa/workunits/suites/pjd.sh index 0f69de6b396e4..c8089e8396fff 100755 --- a/qa/workunits/suites/pjd.sh +++ b/qa/workunits/suites/pjd.sh @@ -6,6 +6,7 @@ set -e wget http://ceph.com/qa/pjd-fstest-20090130-RC-aclfixes.tgz tar zxvf pjd*.tgz cd pjd* +make clean make cd .. mkdir tmp diff --git a/qa/workunits/suites/tiobench.sh b/qa/workunits/suites/tiobench.sh deleted file mode 100755 index 9bc71c6304767..0000000000000 --- a/qa/workunits/suites/tiobench.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash -x - -die() { - echo $@ - exit 1 -} - -which tiotest || die "you must install the tiobench package" - -#timer_fail() { -# echo "timer expired: tiobench has timed out." -# exit 1 -#} - -#trap timer_fail ALRM - -#pid=$$ -#( sleep 420 ; kill -14 $pid ) & - -for i in `seq 1 10`; do - tiotest -f 20 -t 10 -d . -T -c -D 20 -r 1000 || die "tiotest failed" -done - -echo OK. diff --git a/run-make-check.sh b/run-make-check.sh new file mode 100755 index 0000000000000..0d9a2943af22b --- /dev/null +++ b/run-make-check.sh @@ -0,0 +1,77 @@ +#!/bin/bash +# +# Ceph distributed storage system +# +# Copyright (C) 2014 Red Hat +# +# Author: Loic Dachary +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version.
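The RGW_ADMIN, RGW_FQDN and RGW_PORT overrides introduced in the rgw workunits above make the perl tests runnable outside teuthology; a sketch against a local development cluster (the paths and port here are assumptions):

    RGW_ADMIN="./radosgw-admin -c ./ceph.conf" RGW_FQDN=localhost RGW_PORT=7280 \
        perl qa/workunits/rgw/s3_bucket_quota.pl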
+# + +# +# Return true if the working tree is after the release that made +# make -j8 check possible +# +function can_parallel_make_check() { + local commit=$(git rev-parse tags/v0.88^{}) + git rev-list HEAD | grep --quiet $commit +} + +function maybe_parallel_make_check() { + if can_parallel_make_check ; then + echo -j$(get_processors) + fi +} +# +# Return MAX(1, (number of processors / 2)) by default or NPROC +# +function get_processors() { + if test -n "$NPROC" ; then + echo $NPROC + else + if test $(nproc) -ge 2 ; then + expr $(nproc) / 2 + else + echo 1 + fi + fi +} + +function run() { + # Same logic as install-deps.sh for finding package installer + local install_cmd + test -f /etc/redhat-release && install_cmd="yum install -y" + type apt-get > /dev/null 2>&1 && install_cmd="apt-get install -y" + type zypper > /dev/null 2>&1 && install_cmd="zypper --gpg-auto-import-keys --non-interactive install" + if [ -n "$install_cmd" ]; then + sudo $install_cmd ccache jq + else + echo "WARNING: Don't know how to install packages" >&2 + fi + sudo /sbin/modprobe rbd + + if test -f ./install-deps.sh ; then + $DRY_RUN ./install-deps.sh || return 1 + fi + $DRY_RUN ./autogen.sh || return 1 + $DRY_RUN ./configure "$@" --disable-static --with-radosgw --with-debug --without-lttng \ + CC="ccache gcc" CXX="ccache g++" CFLAGS="-Wall -g" CXXFLAGS="-Wall -g" || return 1 + $DRY_RUN make -j$(get_processors) || return 1 + $DRY_RUN make $(maybe_parallel_make_check) check || return 1 + $DRY_RUN make dist || return 1 +} + +function main() { + if run "$@" ; then + echo "make check: successful run on $(git rev-parse HEAD)" + return 0 + else + return 1 + fi +} + +main "$@" diff --git a/selinux/.gitignore b/selinux/.gitignore new file mode 100644 index 0000000000000..a26234fdd8713 --- /dev/null +++ b/selinux/.gitignore @@ -0,0 +1,3 @@ +/Makefile +/ceph.pp +/tmp diff --git a/selinux/Makefile.am b/selinux/Makefile.am new file mode 100644 index 0000000000000..280e7ecd4be6d --- /dev/null +++ b/selinux/Makefile.am @@ -0,0 +1,22 @@ +EXTRA_DIST = \ + ceph.te \ + ceph.fc \ + ceph.if + +SELINUXROOT = $(DESTDIR)$(datadir)/selinux + +ceph.pp: ceph.te ceph.fc ceph.if + $(MAKE) -j1 -f $(datadir)/selinux/devel/Makefile ceph.pp + +if ENABLE_SERVER +if WITH_SELINUX +all-local: ceph.pp + +install-exec-local: + $(INSTALL) -d $(SELINUXROOT)/packages + $(INSTALL) -m 644 ceph.pp $(SELINUXROOT)/packages/ + $(INSTALL) -d $(SELINUXROOT)/devel/include/contrib + $(INSTALL) -m 644 ceph.if $(SELINUXROOT)/devel/include/contrib/ + +endif +endif diff --git a/selinux/ceph.fc b/selinux/ceph.fc new file mode 100644 index 0000000000000..2eeee223056ba --- /dev/null +++ b/selinux/ceph.fc @@ -0,0 +1,11 @@ +/etc/rc\.d/init\.d/ceph -- gen_context(system_u:object_r:ceph_initrc_exec_t,s0) + +/usr/bin/ceph-mon -- gen_context(system_u:object_r:ceph_exec_t,s0) +/usr/bin/ceph-mds -- gen_context(system_u:object_r:ceph_exec_t,s0) +/usr/bin/ceph-osd -- gen_context(system_u:object_r:ceph_exec_t,s0) + +/var/lib/ceph(/.*)? gen_context(system_u:object_r:ceph_var_lib_t,s0) + +/var/log/ceph(/.*)? gen_context(system_u:object_r:ceph_log_t,s0) + +/var/run/ceph(/.*)? gen_context(system_u:object_r:ceph_var_run_t,s0) diff --git a/selinux/ceph.if b/selinux/ceph.if new file mode 100644 index 0000000000000..ed747a82ae8b3 --- /dev/null +++ b/selinux/ceph.if @@ -0,0 +1,265 @@ + +## policy for ceph + +######################################## +## +## Execute ceph_exec_t in the ceph domain. +## +## +## +## Domain allowed to transition. 
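run-make-check.sh above sizes its build and test parallelism via get_processors(), which honors $NPROC, and forwards its arguments to ./configure; a constrained build box can therefore throttle the whole build-and-check cycle like this (sketch):

    NPROC=2 ./run-make-check.sh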
+## +## +# +interface(`ceph_domtrans',` + gen_require(` + type ceph_t, ceph_exec_t; + ') + + corecmd_search_bin($1) + domtrans_pattern($1, ceph_exec_t, ceph_t) +') + +###################################### +## +## Execute ceph in the caller domain. +## +## +## +## Domain allowed access. +## +## +# +interface(`ceph_exec',` + gen_require(` + type ceph_exec_t; + ') + + corecmd_search_bin($1) + can_exec($1, ceph_exec_t) +') + +######################################## +## +## Execute ceph server in the ceph domain. +## +## +## +## Domain allowed access. +## +## +# +interface(`ceph_initrc_domtrans',` + gen_require(` + type ceph_initrc_exec_t; + ') + + init_labeled_script_domtrans($1, ceph_initrc_exec_t) +') +######################################## +## +## Read ceph's log files. +## +## +## +## Domain allowed access. +## +## +## +# +interface(`ceph_read_log',` + gen_require(` + type ceph_log_t; + ') + + logging_search_logs($1) + read_files_pattern($1, ceph_log_t, ceph_log_t) +') + +######################################## +## +## Append to ceph log files. +## +## +## +## Domain allowed access. +## +## +# +interface(`ceph_append_log',` + gen_require(` + type ceph_log_t; + ') + + logging_search_logs($1) + append_files_pattern($1, ceph_log_t, ceph_log_t) +') + +######################################## +## +## Manage ceph log files +## +## +## +## Domain allowed access. +## +## +# +interface(`ceph_manage_log',` + gen_require(` + type ceph_log_t; + ') + + logging_search_logs($1) + manage_dirs_pattern($1, ceph_log_t, ceph_log_t) + manage_files_pattern($1, ceph_log_t, ceph_log_t) + manage_lnk_files_pattern($1, ceph_log_t, ceph_log_t) +') + +######################################## +## +## Search ceph lib directories. +## +## +## +## Domain allowed access. +## +## +# +interface(`ceph_search_lib',` + gen_require(` + type ceph_var_lib_t; + ') + + allow $1 ceph_var_lib_t:dir search_dir_perms; + files_search_var_lib($1) +') + +######################################## +## +## Read ceph lib files. +## +## +## +## Domain allowed access. +## +## +# +interface(`ceph_read_lib_files',` + gen_require(` + type ceph_var_lib_t; + ') + + files_search_var_lib($1) + read_files_pattern($1, ceph_var_lib_t, ceph_var_lib_t) +') + +######################################## +## +## Manage ceph lib files. +## +## +## +## Domain allowed access. +## +## +# +interface(`ceph_manage_lib_files',` + gen_require(` + type ceph_var_lib_t; + ') + + files_search_var_lib($1) + manage_files_pattern($1, ceph_var_lib_t, ceph_var_lib_t) +') + +######################################## +## +## Manage ceph lib directories. +## +## +## +## Domain allowed access. +## +## +# +interface(`ceph_manage_lib_dirs',` + gen_require(` + type ceph_var_lib_t; + ') + + files_search_var_lib($1) + manage_dirs_pattern($1, ceph_var_lib_t, ceph_var_lib_t) +') + +######################################## +## +## Read ceph PID files. +## +## +## +## Domain allowed access. +## +## +# +interface(`ceph_read_pid_files',` + gen_require(` + type ceph_var_run_t; + ') + + files_search_pids($1) + read_files_pattern($1, ceph_var_run_t, ceph_var_run_t) +') + + +######################################## +## +## All of the rules required to administrate +## an ceph environment +## +## +## +## Domain allowed access. +## +## +## +## +## Role allowed access. 
+## +## +## +# +interface(`ceph_admin',` + gen_require(` + type ceph_t; + type ceph_initrc_exec_t; + type ceph_log_t; + type ceph_var_lib_t; + type ceph_var_run_t; + ') + + allow $1 ceph_t:process { signal_perms }; + ps_process_pattern($1, ceph_t) + + tunable_policy(`deny_ptrace',`',` + allow $1 ceph_t:process ptrace; + ') + + ceph_initrc_domtrans($1) + domain_system_change_exemption($1) + role_transition $2 ceph_initrc_exec_t system_r; + allow $2 system_r; + + logging_search_logs($1) + admin_pattern($1, ceph_log_t) + + files_search_var_lib($1) + admin_pattern($1, ceph_var_lib_t) + + files_search_pids($1) + admin_pattern($1, ceph_var_run_t) + optional_policy(` + systemd_passwd_agent_exec($1) + systemd_read_fifo_file_passwd_run($1) + ') +') diff --git a/selinux/ceph.te b/selinux/ceph.te new file mode 100644 index 0000000000000..e25ec846ee323 --- /dev/null +++ b/selinux/ceph.te @@ -0,0 +1,114 @@ +policy_module(ceph, 1.0.0) + +require { + type sysfs_t; + type var_run_t; + type random_device_t; + type setfiles_t; + class sock_file unlink; + class lnk_file read; + class dir read; + class file { getattr read open }; +} + +######################################## +# +# Declarations +# + +type ceph_t; +type ceph_exec_t; +init_daemon_domain(ceph_t, ceph_exec_t) + +permissive ceph_t; + +type ceph_initrc_exec_t; +init_script_file(ceph_initrc_exec_t) + +type ceph_log_t; +logging_log_file(ceph_log_t) + +type ceph_var_lib_t; +files_type(ceph_var_lib_t) + +type ceph_var_run_t; +files_pid_file(ceph_var_run_t) + +######################################## +# +# ceph local policy +# + +allow ceph_t self:process { signal_perms }; +allow ceph_t self:fifo_file rw_fifo_file_perms; +allow ceph_t self:unix_stream_socket create_stream_socket_perms; +# not needed at the moment, for future releases, not needed at all if we switch to systemd init scripts +allow ceph_t self:capability { setuid setgid }; + +manage_dirs_pattern(ceph_t, ceph_log_t, ceph_log_t) +manage_files_pattern(ceph_t, ceph_log_t, ceph_log_t) +manage_lnk_files_pattern(ceph_t, ceph_log_t, ceph_log_t) + +manage_dirs_pattern(ceph_t, ceph_var_lib_t, ceph_var_lib_t) +manage_files_pattern(ceph_t, ceph_var_lib_t, ceph_var_lib_t) +manage_lnk_files_pattern(ceph_t, ceph_var_lib_t, ceph_var_lib_t) + +manage_dirs_pattern(ceph_t, ceph_var_run_t, ceph_var_run_t) +manage_files_pattern(ceph_t, ceph_var_run_t, ceph_var_run_t) +manage_lnk_files_pattern(ceph_t, ceph_var_run_t, ceph_var_run_t) + +kernel_read_system_state(ceph_t) + +corenet_all_recvfrom_unlabeled(ceph_t) +corenet_all_recvfrom_netlabel(ceph_t) +corenet_udp_sendrecv_generic_if(ceph_t) +corenet_udp_sendrecv_generic_node(ceph_t) +corenet_udp_bind_generic_node(ceph_t) +corenet_tcp_bind_generic_node(ceph_t) + +corenet_sendrecv_cyphesis_server_packets(ceph_t) +corenet_tcp_bind_cyphesis_port(ceph_t) +corenet_tcp_sendrecv_cyphesis_port(ceph_t) + +corecmd_exec_bin(ceph_t) +corecmd_exec_shell(ceph_t) + +dev_read_urand(ceph_t) + +fs_getattr_all_fs(ceph_t) + +auth_use_nsswitch(ceph_t) + +logging_send_syslog_msg(ceph_t) + +sysnet_dns_name_resolve(ceph_t) + +# added 2015-06-17, need review + +allow ceph_t ceph_var_run_t:sock_file create; +allow ceph_t self:capability sys_rawio; + +allow ceph_t self:tcp_socket { accept listen }; +corenet_tcp_connect_cyphesis_port(ceph_t) +corenet_tcp_connect_generic_port(ceph_t) +files_list_tmp(ceph_t) +fstools_exec(ceph_t) +nis_use_ypbind_uncond(ceph_t) +storage_raw_rw_fixed_disk(ceph_t) + +# added 2015-07-28, needs review just as well +allow ceph_t ceph_var_run_t:sock_file unlink; 
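Once the policy sources above are packaged, selinux/Makefile.am builds ceph.pp with the SELinux devel Makefile; doing the same by hand and loading the module looks roughly like this (the restorecon paths follow ceph.fc):

    make -f /usr/share/selinux/devel/Makefile ceph.pp
    sudo semodule -i ceph.pp
    sudo restorecon -R /var/lib/ceph /var/log/ceph /var/run/ceph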
+allow ceph_t sysfs_t:dir read; +allow ceph_t sysfs_t:file { read getattr open }; +allow ceph_t sysfs_t:lnk_file read; + + +allow ceph_t random_device_t:chr_file getattr; +allow ceph_t self:process setpgid; +allow ceph_t var_run_t:dir { write create add_name }; +allow ceph_t var_run_t:file { write create open getattr }; + +fsadm_manage_pid(ceph_t) + +#============= setfiles_t ============== +allow setfiles_t ceph_var_lib_t:file write; diff --git a/selinux/ceph_selinux.8 b/selinux/ceph_selinux.8 new file mode 100644 index 0000000000000..de74807c8ed87 --- /dev/null +++ b/selinux/ceph_selinux.8 @@ -0,0 +1,324 @@ +.TH "ceph_selinux" "8" "15-06-17" "ceph" "SELinux Policy ceph" +.SH "NAME" +ceph_selinux \- Security Enhanced Linux Policy for the ceph processes +.SH "DESCRIPTION" + +Security-Enhanced Linux secures the ceph processes via flexible mandatory access control. + +The ceph processes execute with the ceph_t SELinux type. You can check if you have these processes running by executing the \fBps\fP command with the \fB\-Z\fP qualifier. + +For example: + +.B ps -eZ | grep ceph_t + + +.SH "ENTRYPOINTS" + +The ceph_t SELinux type can be entered via the \fBceph_exec_t\fP file type. + +The default entrypoint paths for the ceph_t domain are the following: + +/usr/bin/ceph-mon, /usr/bin/ceph-mds, /usr/bin/ceph-osd +.SH PROCESS TYPES +SELinux defines process types (domains) for each process running on the system. +.PP +You can see the context of a process using the \fB\-Z\fP option to \fBps\fP +.PP +Policy governs the access confined processes have to files. +SELinux ceph policy is very flexible allowing users to set up their ceph processes in as secure a method as possible. +.PP +The following process types are defined for ceph: + +.EX +.B ceph_t +.EE +.PP +Note: +.B semanage permissive -a ceph_t +can be used to make the process type ceph_t permissive. SELinux does not deny access to permissive process types, but the AVC (SELinux denials) messages are still generated. + +.SH BOOLEANS +SELinux policy is customizable based on least access required. ceph policy is extremely flexible and has several booleans that allow you to manipulate the policy and run ceph with the tightest access possible. + + +.PP +If you want to allow users to resolve user passwd entries directly from ldap rather than using an sssd server, you must turn on the authlogin_nsswitch_use_ldap boolean. Disabled by default. + +.EX +.B setsebool -P authlogin_nsswitch_use_ldap 1 + +.EE + +.PP +If you want to allow all daemons to write corefiles to /, you must turn on the daemons_dump_core boolean. Disabled by default. + +.EX +.B setsebool -P daemons_dump_core 1 + +.EE + +.PP +If you want to enable cluster mode for daemons, you must turn on the daemons_enable_cluster_mode boolean. Disabled by default. + +.EX +.B setsebool -P daemons_enable_cluster_mode 1 + +.EE + +.PP +If you want to allow all daemons to use tcp wrappers, you must turn on the daemons_use_tcp_wrapper boolean. Disabled by default. + +.EX +.B setsebool -P daemons_use_tcp_wrapper 1 + +.EE + +.PP +If you want to allow all daemons the ability to read/write terminals, you must turn on the daemons_use_tty boolean. Disabled by default. + +.EX +.B setsebool -P daemons_use_tty 1 + +.EE + +.PP +If you want to deny any process from ptracing or debugging any other processes, you must turn on the deny_ptrace boolean. Disabled by default.
+ +.EX +.B setsebool -P deny_ptrace 1 + +.EE + +.PP +If you want to allow all domains to use other domains' file descriptors, you must turn on the domain_fd_use boolean. Enabled by default. + +.EX +.B setsebool -P domain_fd_use 1 + +.EE + +.PP +If you want to allow all domains to have the kernel load modules, you must turn on the domain_kernel_load_modules boolean. Disabled by default. + +.EX +.B setsebool -P domain_kernel_load_modules 1 + +.EE + +.PP +If you want to allow all domains to execute in fips_mode, you must turn on the fips_mode boolean. Enabled by default. + +.EX +.B setsebool -P fips_mode 1 + +.EE + +.PP +If you want to enable reading of urandom for all domains, you must turn on the global_ssp boolean. Disabled by default. + +.EX +.B setsebool -P global_ssp 1 + +.EE + +.PP +If you want to allow confined applications to run with kerberos, you must turn on the kerberos_enabled boolean. Enabled by default. + +.EX +.B setsebool -P kerberos_enabled 1 + +.EE + +.PP +If you want to allow the system to run with NIS, you must turn on the nis_enabled boolean. Disabled by default. + +.EX +.B setsebool -P nis_enabled 1 + +.EE + +.PP +If you want to allow confined applications to use nscd shared memory, you must turn on the nscd_use_shm boolean. Enabled by default. + +.EX +.B setsebool -P nscd_use_shm 1 + +.EE + +.SH "MANAGED FILES" + +The SELinux process type ceph_t can manage files labeled with the following file types. The paths listed are the default paths for these file types. Note the process's UID still needs to have DAC permissions. + +.br +.B ceph_log_t + + /var/log/ceph(/.*)? +.br + +.br +.B ceph_var_lib_t + + /var/lib/ceph(/.*)? +.br + +.br +.B ceph_var_run_t + + /var/run/ceph(/.*)? +.br + +.br +.B cluster_conf_t + + /etc/cluster(/.*)? +.br + +.br +.B cluster_var_lib_t + + /var/lib/pcsd(/.*)? +.br + /var/lib/cluster(/.*)? +.br + /var/lib/openais(/.*)? +.br + /var/lib/pengine(/.*)? +.br + /var/lib/corosync(/.*)? +.br + /usr/lib/heartbeat(/.*)? +.br + /var/lib/heartbeat(/.*)? +.br + /var/lib/pacemaker(/.*)? +.br + +.br +.B cluster_var_run_t + + /var/run/crm(/.*)? +.br + /var/run/cman_.* +.br + /var/run/rsctmp(/.*)? +.br + /var/run/aisexec.* +.br + /var/run/heartbeat(/.*)? +.br + /var/run/cpglockd\.pid +.br + /var/run/corosync\.pid +.br + /var/run/rgmanager\.pid +.br + /var/run/cluster/rgmanager\.sk +.br + +.br +.B root_t + + / +.br + /initrd +.br + +.SH FILE CONTEXTS +SELinux requires files to have an extended attribute to define the file type. +.PP +You can see the context of a file using the \fB\-Z\fP option to \fBls\fP +.PP +Policy governs the access confined processes have to these files. +SELinux ceph policy is very flexible allowing users to set up their ceph processes in as secure a method as possible. +.PP + +.PP +.B STANDARD FILE CONTEXT + +SELinux defines the file context types for ceph; if you want to +store files with these types in different paths, you need to execute the semanage command to specify alternate labeling and then use restorecon to put the labels on disk. + +.B semanage fcontext -a -t ceph_var_run_t '/srv/myceph_content(/.*)?' +.br +.B restorecon -R -v /srv/myceph_content + +Note: SELinux often uses regular expressions to specify labels that match multiple files. + +.I The following file types are defined for ceph: + + +.EX +.PP +.B ceph_exec_t +.EE + +- Set files with the ceph_exec_t type, if you want to transition an executable to the ceph_t domain.
+ +.br +.TP 5 +Paths: +/usr/bin/ceph-mon, /usr/bin/ceph-mds, /usr/bin/ceph-osd + +.EX +.PP +.B ceph_initrc_exec_t +.EE + +- Set files with the ceph_initrc_exec_t type, if you want to transition an executable to the ceph_initrc_t domain. + + +.EX +.PP +.B ceph_log_t +.EE + +- Set files with the ceph_log_t type, if you want to treat the data as ceph log data, usually stored under the /var/log directory. + + +.EX +.PP +.B ceph_var_lib_t +.EE + +- Set files with the ceph_var_lib_t type, if you want to store the ceph files under the /var/lib directory. + + +.EX +.PP +.B ceph_var_run_t +.EE + +- Set files with the ceph_var_run_t type, if you want to store the ceph files under the /run or /var/run directory. + + +.PP +Note: File context can be temporarily modified with the chcon command. If you want to permanently change the file context you need to use the +.B semanage fcontext +command. This will modify the SELinux labeling database. You will need to use +.B restorecon +to apply the labels. + +.SH "COMMANDS" +.B semanage fcontext +can also be used to manipulate default file context mappings. +.PP +.B semanage permissive +can also be used to manipulate whether or not a process type is permissive. +.PP +.B semanage module +can also be used to enable/disable/install/remove policy modules. + +.B semanage boolean +can also be used to manipulate the booleans + +.PP +.B system-config-selinux +is a GUI tool available to customize SELinux policy settings. + +.SH AUTHOR +This manual page was auto-generated using +.B "sepolicy manpage". + +.SH "SEE ALSO" +selinux(8), ceph(8), semanage(8), restorecon(8), chcon(1), sepolicy(8) +, setsebool(8) \ No newline at end of file diff --git a/src/.gitignore b/src/.gitignore index 66b1f702a60e9..f657ab3122ebf 100644 --- a/src/.gitignore +++ b/src/.gitignore @@ -1,5 +1,6 @@ # generic entries Makefile +*.csv # local directory specific entries /.git_version @@ -11,6 +12,8 @@ Makefile /ceph-authtool /ceph-client-debug /cephfs-journal-tool +/cephfs-table-tool +/cephfs-data-scan /ceph-conf /ceph-coverage /ceph-crush-location @@ -24,12 +27,15 @@ Makefile /ceph-syn /ceph.conf /ceph_bench_log -/ceph_objectstore_tool -/ceph_mon_store_converter +/ceph-objectstore-tool /ceph_multi_stress_watch /ceph_erasure_code /ceph_erasure_code_benchmark /ceph_erasure_code_non_regression +/ceph_perf_local +/ceph_perf_msgr_server +/ceph_perf_msgr_client +/ceph_perf_objectstore /ceph_psim /ceph_radosacl /ceph_rgw_jsonparser @@ -51,6 +57,7 @@ Makefile /ceph-kvstore-tool /ceph_ver.h /dev +/get_command_descriptions /init-ceph /keyring /librados-config @@ -67,8 +74,9 @@ Makefile /rbd-fuse /rbd-replay /rbd-replay-prep -/rest-bench /sample.fetch_config +/simple_client +/simple_server /TAGS /tags /testmsgr @@ -76,9 +84,11 @@ Makefile /test-suite.log /cls_test_* /unittest_* -/get_command_descriptions -/ceph_perf_objectstore +/xio_client +/xio_server # old dir, may in use by older branches /leveldb /mkcephfs +/.ceph_port +/store_test_temp_dir diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 0000000000000..327a300012e57 --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,896 @@ +include(GetGitRevisionDescription) + +enable_language(C ASM) +set(bindir ${CMAKE_INSTALL_PREFIX}/bin) +set(sbindir ${CMAKE_INSTALL_PREFIX}/sbin) +set(libdir ${CMAKE_INSTALL_PREFIX}/lib) +set(sysconfdir ${CMAKE_INSTALL_PREFIX}/etc) +set(pkgdatadir ${CMAKE_INSTALL_PREFIX}/share) +set(prefix ${CMAKE_INSTALL_PREFIX}) + +add_definitions("-DCEPH_LIBDIR=\"${libdir}\"") 
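Everything in the install layout above derives from CMAKE_INSTALL_PREFIX, so an out-of-tree build normally only needs the prefix set at configure time. A minimal sketch, with an arbitrary build directory name and prefix:

    # configure and build in a separate tree so the source stays clean
    mkdir build && cd build
    cmake -DCMAKE_INSTALL_PREFIX=/usr ..
    make -j4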
+add_definitions("-DCEPH_PKGLIBDIR=\"${libdir}\"")
+add_definitions("-DHAVE_CONFIG_H -D__CEPH__ -D_FILE_OFFSET_BITS=64 -D_REENTRANT -D_THREAD_SAFE -D__STDC_FORMAT_MACROS -D_GNU_SOURCE")
+
+set(CMAKE_ASM_COMPILER ${PROJECT_SOURCE_DIR}/src/yasm-wrapper)
+message(STATUS "asm compiler ${CMAKE_ASM_COMPILER}")
+set(CMAKE_ASM_FLAGS "-f elf64")
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -rdynamic -Wall -Wtype-limits -Wignored-qualifiers -Winit-self -Wpointer-arith -Werror=format-security -fno-strict-aliasing -fsigned-char -fPIC")
+
+# note: the command must be passed as separate arguments (not one quoted
+# string) and the exit code captured with RESULT_VARIABLE for the yasm
+# probe to actually run
+execute_process(
+  COMMAND yasm -f elf64 ${CMAKE_SOURCE_DIR}/src/common/crc32c_intel_fast_asm.S -o /dev/null
+  RESULT_VARIABLE no_yasm
+  OUTPUT_QUIET)
+if(no_yasm)
+  message("we do not have a modern/working yasm")
+else(no_yasm)
+  message("we have a modern and working yasm")
+  execute_process(
+    COMMAND "arch"
+    OUTPUT_VARIABLE arch
+    OUTPUT_STRIP_TRAILING_WHITESPACE)
+  if(arch STREQUAL "x86_64")
+    message("we are x86_64")
+    include(CheckCXXSourceCompiles)
+    check_cxx_source_compiles("
+      #if defined(__x86_64__) && defined(__ILP32__)
+      #error x32
+      #endif
+      int main() {}
+      " not_arch_x32)
+    if(not_arch_x32)
+      message("we are not x32")
+      add_definitions("-DHAVE_GOOD_YASM_ELF64")
+    else(not_arch_x32)
+      message("we are x32; no yasm for you")
+    endif(not_arch_x32)
+  else(arch STREQUAL "x86_64")
+    message("we are not x86_64 && !x32")
+  endif(arch STREQUAL "x86_64")
+endif(no_yasm)
+
+execute_process(COMMAND yasm -f elf64 -i ${CMAKE_SOURCE_DIR}/src/erasure-code/isa/isa-l/include/ ${CMAKE_SOURCE_DIR}/src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_avx2.asm.s -o /dev/null
+  RESULT_VARIABLE rc
+  OUTPUT_QUIET)
+if(NOT rc)
+  set(HAVE_BETTER_YASM_ELF64 1)
+endif(NOT rc)
+MESSAGE("HAVE_BETTER_YASM_ELF64=" ${HAVE_BETTER_YASM_ELF64})
+
+set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS} -ftemplate-depth-1024 -Wno-invalid-offsetof -Wnon-virtual-dtor -Wstrict-null-sentinel -Woverloaded-virtual")
+
+# require c++11
+include(CheckCXXCompilerFlag)
+CHECK_CXX_COMPILER_FLAG("-std=c++11" COMPILER_SUPPORTS_CXX11)
+if (COMPILER_SUPPORTS_CXX11)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+else()
+  message(FATAL_ERROR "The compiler ${CMAKE_CXX_COMPILER} has no C++11 support.")
+endif()
+
+
+set(EXTRALIBS uuid rt dl ${Boost_LIBS})
+
+if(${WITH_PROFILER})
+  list(APPEND EXTRALIBS profiler)
+endif(${WITH_PROFILER})
+
+if(WITH_CDS)
+  list(APPEND EXTRALIBS ${CDS_LIBS})
+endif(WITH_CDS)
+
+if(USE_NSS)
+  if(NSS_FOUND)
+    if(NSPR_FOUND)
+      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I${NSS_INCLUDE_DIR} -I${NSPR_INCLUDE_DIR}")
+    endif(NSPR_FOUND)
+  endif(NSS_FOUND)
+endif(USE_NSS)
+
+if(${HAVE_ATOMIC_OPS})
+  set(EXTRALIBS
+    atomic_ops
+    ${EXTRALIBS})
+endif(${HAVE_ATOMIC_OPS})
+
+set(GCOV_PREFIX_STRIP 4)
+
+get_git_head_revision(GIT_REFSPEC CEPH_GIT_VER)
+git_describe(CEPH_GIT_NICE_VER --always)
+
+# Python stuff
+find_package(PythonInterp 2 QUIET)
+if(NOT PYTHONINTERP_FOUND)
+  message(FATAL_ERROR "Python 2 interpreter not found.")
+endif(NOT PYTHONINTERP_FOUND)
+
+# if CMAKE_INSTALL_PREFIX is an empty string, must replace
+# it with "/" to make PYTHON_INSTALL_TEMPLATE an absolute path to be
+# consistent with all other installation paths.
+if(CMAKE_INSTALL_PREFIX) + set(PYTHON_INSTALL_TEMPLATE "${CMAKE_INSTALL_PREFIX}") +else(CMAKE_INSTALL_PREFIX) + set(PYTHON_INSTALL_TEMPLATE "/") +endif(CMAKE_INSTALL_PREFIX) + +execute_process( + COMMAND + ${PYTHON_EXECUTABLE} -c "from distutils import sysconfig; print sysconfig.get_python_lib(1,0,prefix='${PYTHON_INSTALL_TEMPLATE}')" + OUTPUT_VARIABLE PYTHON_INSTDIR + OUTPUT_STRIP_TRAILING_WHITESPACE) + +if(HAVE_XIO) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I${Xio_INCLUDE_DIR}") + list(APPEND EXTRALIBS ${Xio_LIBRARY} ibverbs rdmacm pthread rt) +endif(HAVE_XIO) + +if(${WITH_TCMALLOC}) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free") + set(TCMALLOC_LIBS tcmalloc) + set(TCMALLOC_srcs perfglue/heap_profiler.cc) +else() + set(TCMALLOC_srcs perfglue/disabled_heap_profiler.cc) +endif(${WITH_TCMALLOC}) + +# tcmalloc heap profiler +set(heap_profiler_files ${TCMALLOC_srcs}) +add_library(heap_profiler_objs OBJECT ${heap_profiler_files}) + +set(LIBEDIT_LIBS edit) + +# Common infrastructure +configure_file( + ${CMAKE_SOURCE_DIR}/src/ceph_ver.h.in.cmake + ${CMAKE_BINARY_DIR}/src/include/ceph_ver.h + @ONLY) + +set(arch_files + arch/arm.c + arch/intel.c + arch/probe.cc) + +set(auth_files + auth/AuthAuthorizeHandler.cc + auth/AuthClientHandler.cc + auth/AuthSessionHandler.cc + auth/AuthMethodList.cc + auth/cephx/CephxAuthorizeHandler.cc + auth/cephx/CephxClientHandler.cc + auth/cephx/CephxProtocol.cc + auth/cephx/CephxSessionHandler.cc + auth/none/AuthNoneAuthorizeHandler.cc + auth/unknown/AuthUnknownAuthorizeHandler.cc + auth/Crypto.cc + auth/KeyRing.cc + auth/RotatingKeyRing.cc) + +set(mds_files) +list(APPEND mds_files + mds/MDSMap.cc + mds/inode_backtrace.cc + mds/mdstypes.cc) + +set(crush_srcs + crush/builder.c + crush/mapper.c + crush/crush.c + crush/hash.c + crush/CrushWrapper.cc + crush/CrushCompiler.cc + crush/CrushTester.cc) + +add_subdirectory(json_spirit) + +set(xio_common_srcs) +if(HAVE_XIO) + list(APPEND xio_common_srcs + msg/xio/XioConnection.cc + msg/xio/XioMsg.cc + msg/xio/XioPool.cc + msg/xio/XioMessenger.cc + msg/xio/XioPortal.cc + msg/xio/QueueStrategy.cc) +endif(HAVE_XIO) + +set(libcommon_files + ${CMAKE_BINARY_DIR}/src/include/ceph_ver.h + ceph_ver.c + common/DecayCounter.cc + common/LogClient.cc + common/LogEntry.cc + common/PrebufferedStreambuf.cc + common/BackTrace.cc + common/perf_counters.cc + common/Mutex.cc + common/OutputDataSocket.cc + common/admin_socket.cc + common/admin_socket_client.cc + common/bloom_filter.cc + common/Readahead.cc + ${crush_srcs} + common/cmdparse.cc + common/escape.c + common/io_priority.cc + common/Clock.cc + common/Throttle.cc + common/Timer.cc + common/Finisher.cc + common/environment.cc + common/sctp_crc32.c + common/crc32c.cc + common/crc32c_intel_baseline.c + common/crc32c_intel_fast.c + common/crc32c_intel_fast_asm.S + common/crc32c_intel_fast_zero_asm.S + common/assert.cc + common/run_cmd.cc + common/WorkQueue.cc + common/ConfUtils.cc + common/MemoryModel.cc + common/fd.cc + common/xattr.c + common/str_list.cc + common/str_map.cc + common/snap_types.cc + common/errno.cc + common/TrackedOp.cc + common/SloppyCRCMap.cc + common/types.cc + common/TextTable.cc + log/Log.cc + log/SubsystemMap.cc + mon/MonCap.cc + mon/MonClient.cc + mon/MonMap.cc + msg/simple/Accepter.cc + msg/simple/DispatchQueue.cc + msg/Message.cc + osd/ECMsgTypes.cc + osd/HitSet.cc + common/RefCountedObj.cc + msg/Messenger.cc + msg/simple/Pipe.cc + msg/simple/PipeConnection.cc + 
msg/simple/SimpleMessenger.cc + msg/async/AsyncConnection.cc + msg/async/AsyncMessenger.cc + msg/async/Event.cc + msg/async/EventEpoll.cc + msg/async/EventSelect.cc + msg/async/net_handler.cc + ${xio_common_srcs} + msg/msg_types.cc + common/hobject.cc + osd/OSDMap.cc + common/histogram.cc + osd/osd_types.cc + common/blkdev.cc + common/common_init.cc + common/pipe.c + common/ceph_argparse.cc + common/ceph_context.cc + common/buffer.cc + common/code_environment.cc + common/dout.cc + common/signal.cc + common/simple_spin.cc + common/Thread.cc + common/Formatter.cc + common/HeartbeatMap.cc + common/ceph_fs.cc + common/ceph_hash.cc + common/ceph_strings.cc + common/ceph_frag.cc + common/config.cc + common/utf8.c + common/mime.c + common/strtol.cc + common/page.cc + common/lockdep.cc + common/version.cc + common/hex.cc + common/entity_name.cc + common/ceph_crypto.cc + common/ceph_crypto_cms.cc + common/ceph_json.cc + common/ipaddr.cc + common/pick_address.cc + common/address_helper.cc + common/linux_version.c + osdc/Striper.cc + osdc/Objecter.cc + ${arch_files} + ${auth_files} + ${mds_files}) +set(mon_common_files + auth/AuthSessionHandler.cc + auth/cephx/CephxSessionHandler.cc + erasure-code/ErasureCodePlugin.cc) +add_library(mon_common_objs OBJECT ${mon_common_files}) +set(common_mountcephfs_files + common/armor.c + common/safe_io.c + common/module.c + common/addr_parsing.c) +add_library(common_mountcephfs_objs OBJECT + ${common_mountcephfs_files}) + +if(${WITH_PROFILER}) + list(APPEND libcommon_files + perfglue/cpu_profiler.cc) +else() + list(APPEND libcommon_files + perfglue/disabled_stubs.cc) +endif(${WITH_PROFILER}) + +if(${ENABLE_SHARED}) + list(APPEND libcommon_files + $) +endif(${ENABLE_SHARED}) + +add_library(common STATIC ${libcommon_files} + $ + $) + +set_source_files_properties(${CMAKE_SOURCE_DIR}/src/ceph_ver.c + ${CMAKE_SOURCE_DIR}/src/common/version.cc + ${CMAKE_SOURCE_DIR}/src/test/encoding/ceph_dencoder.cc + APPEND PROPERTY OBJECT_DEPENDS ${CMAKE_BINARY_DIR}/src/include/ceph_ver.h) + +if(${WITH_PROFILER}) + target_link_libraries(common profiler) +endif(${WITH_PROFILER}) + +add_library(common_utf8 STATIC common/utf8.c) + +target_link_libraries( common json_spirit common_utf8 erasure_code rt uuid ${CRYPTO_LIBS} ${Boost_LIBRARIES}) + +set(libglobal_srcs + global/global_init.cc + global/pidfile.cc + global/signal_handler.cc) +set(global_common_files + global/global_context.cc) +add_library(global_common_objs OBJECT ${global_common_files}) +add_library(global STATIC ${libglobal_srcs} + $) +target_link_libraries(global common ${CMAKE_THREAD_LIBS_INIT} ${CRYPTO_LIBS} + ${EXTRALIBS}) +if(${ENABLE_SHARED}) + set_target_properties(global PROPERTIES + OUTPUT_NAME ceph-global VERSION "1.0.0" SOVERSION "1") +endif(${ENABLE_SHARED}) + +# rados object classes +add_subdirectory(cls) + +# RADOS client/library +set(osdc_files + osdc/Objecter.cc + osdc/Filer.cc) +set(osdc_rbd_files + osdc/ObjectCacher.cc + osdc/Striper.cc) +add_library(osdc_rbd_objs OBJECT ${osdc_rbd_files}) +add_library(osdc STATIC ${osdc_files} $) + +set(librados_srcs + librados/librados.cc + librados/RadosClient.cc + librados/IoCtxImpl.cc + librados/snap_set_diff.cc + librados/RadosXattrIter.cc + ) +add_library(librados ${CEPH_SHARED} ${librados_srcs} + $ + $ + $) +add_dependencies(librados osdc) +target_link_libraries(librados PRIVATE osdc osd os global common cls_lock_client + ${BLKID_LIBRARIES} + ${CRYPTO_LIBS} ${EXTRALIBS} ${TCMALLOC_LIBS}) +if(${ENABLE_SHARED}) + set_target_properties(librados PROPERTIES OUTPUT_NAME rados 
VERSION 2.0.0 + SOVERSION 2) +endif(${ENABLE_SHARED}) +install(FILES include/rados/librados.h + include/rados/rados_types.h + include/rados/rados_types.hpp + include/rados/librados.hpp + include/buffer.h + include/page.h + include/crc32c.h + DESTINATION include/rados) +install(TARGETS librados DESTINATION lib) + +set(libradosstriper_srcs + libradosstriper/libradosstriper.cc + libradosstriper/RadosStriperImpl.cc + libradosstriper/MultiAioCompletionImpl.cc) +add_library(libradosstriper ${libradosstriper_srcs}) +target_link_libraries(libradosstriper librados cls_lock_client) + +set(rados_srcs + tools/rados/rados.cc + tools/RadosDump.cc + tools/rados/RadosImport.cc + tools/rados/PoolDump.cc + common/obj_bencher.cc) +add_executable(rados ${rados_srcs} $) +target_link_libraries(rados librados global ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS} libradosstriper) + +if (WITH_CEPHFS) + set(cephfs_journal_tool_srcs + tools/cephfs/cephfs-journal-tool.cc + tools/cephfs/JournalTool.cc + tools/cephfs/JournalFilter.cc + tools/cephfs/JournalScanner.cc + tools/cephfs/EventOutput.cc + tools/cephfs/Dumper.cc + tools/cephfs/Resetter.cc + tools/cephfs/MDSUtility.cc) + add_executable(cephfs-journal-tool ${cephfs_journal_tool_srcs} + $) + target_link_libraries(cephfs-journal-tool librados mds osdc global + ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS}) + + set(cephfs_table_tool_srcs + tools/cephfs/cephfs-table-tool.cc + tools/cephfs/TableTool.cc + tools/cephfs/MDSUtility.cc) + add_executable(cephfs-table-tool ${cephfs_table_tool_srcs} + $) + target_link_libraries(cephfs-table-tool librados mds osdc global + ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS}) + + set(cephfs_data_scan_srcs + tools/cephfs/cephfs-data-scan.cc + tools/cephfs/DataScan.cc + tools/cephfs/MDSUtility.cc) + add_executable(cephfs-data-scan ${cephfs_data_scan_srcs} + $) + + target_link_libraries(cephfs-data-scan librados mds osdc global + cls_cephfs_client + ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS}) +endif (WITH_CEPHFS) + +set(librados_config_srcs + librados-config.cc) +add_executable(librados-config ${librados_config_srcs} + $) +target_link_libraries(librados-config librados global ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS} + ${TCMALLOC_LIBS}) + +install(TARGETS rados librados-config DESTINATION bin) + +install(DIRECTORY ${CMAKE_SOURCE_DIR}/src/pybind/ + DESTINATION ${PYTHON_INSTDIR}) + +# Monitor +set(lib_mon_srcs + auth/cephx/CephxKeyServer.cc + auth/cephx/CephxServiceHandler.cc + auth/AuthServiceHandler.cc + ${osd_mon_files} mon/Paxos.cc + mon/PaxosService.cc + mon/OSDMonitor.cc + mon/MDSMonitor.cc + mon/MonmapMonitor.cc + mon/LogMonitor.cc + mon/AuthMonitor.cc + mon/Elector.cc + mon/HealthMonitor.cc + ${os_mon_files} + mon/DataHealthService.cc + mon/PGMonitor.cc + mon/PGMap.cc + mon/ConfigKeyService.cc) + +set(common_util_src + common/util.cc) +add_library(common_util_obj OBJECT ${common_util_src}) +add_library(mon STATIC ${lib_mon_srcs} $ + $ $ + $) + +set(ceph_mon_srcs + ceph_mon.cc + common/TextTable.cc) +add_executable(ceph-mon ${ceph_mon_srcs} $) +target_link_libraries(ceph-mon mon boost_thread common os global ${EXTRALIBS} + ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS}) +install(TARGETS ceph-mon DESTINATION bin) + +# OSD +set(libos_srcs + os/FileJournal.cc + os/FileStore.cc + os/chain_xattr.cc + os/ObjectStore.cc + os/JournalingObjectStore.cc + os/LFNIndex.cc + os/IndexManager.cc + os/LevelDBStore.cc + os/DBObjectMap.cc + os/Transaction.cc + os/WBThrottle.cc + os/GenericFileStoreBackend.cc + os/BtrfsFileStoreBackend.cc + 
os/ZFSFileStoreBackend.cc + os/XfsFileStoreBackend.cc + os/KeyValueStore.cc + os/KeyValueDB.cc + os/MemStore.cc + os/GenericObjectMap.cc + os/HashIndex.cc) +set(os_mon_files + os/LevelDBStore.cc) +add_library(os_mon_objs OBJECT ${os_mon_files}) +add_library(os STATIC ${libos_srcs} $) +if(${HAVE_LIBAIO}) + target_link_libraries(os aio) +endif(${HAVE_LIBAIO}) +target_link_libraries(os leveldb snappy) + +set(cls_references_files objclass/class_api.cc) +add_library(cls_references_objs OBJECT ${cls_references_files}) + +set(osdc_osd_srcs + osdc/Objecter.cc + osdc/Striper.cc) + +set(osd_srcs + osd/OSD.cc + osd/Watch.cc + osd/ClassHandler.cc + osd/OpRequest.cc + osd/PG.cc + osd/PGLog.cc + osd/ReplicatedPG.cc + osd/ReplicatedBackend.cc + osd/ECBackend.cc + osd/ECTransaction.cc + osd/PGBackend.cc + osd/OSD.cc + osd/OSDCap.cc + osd/Watch.cc + osd/ClassHandler.cc + osd/OpRequest.cc + common/TrackedOp.cc + osd/SnapMapper.cc + osd/osd_types.cc + osd/ECUtil.cc + objclass/class_api.cc + ${osdc_osd_srcs}) +set(osd_mon_files + mon/Monitor.cc) +add_library(osd_mon_objs OBJECT ${osd_mon_files}) +add_library(osd STATIC ${osd_srcs} $ + $) +target_link_libraries(osd dl leveldb) + +set(ceph_osd_srcs + ceph_osd.cc + objclass/class_api.cc) +add_executable(ceph-osd ${ceph_osd_srcs} + $ + $) +target_link_libraries(ceph-osd osd os global ${BLKID_LIBRARIES} ${TCMALLOC_LIBS}) +install(TARGETS ceph-osd DESTINATION bin) + +# MDS +if(${WITH_MDS}) + set(mds_srcs + mds/Capability.cc + mds/MDSDaemon.cc + mds/MDSRank.cc + mds/Beacon.cc + mds/flock.cc + mds/locks.c + mds/journal.cc + mds/Server.cc + mds/Mutation.cc + mds/MDCache.cc + mds/RecoveryQueue.cc + mds/StrayManager.cc + mds/Locker.cc + mds/Migrator.cc + mds/MDBalancer.cc + mds/CDentry.cc + mds/CDir.cc + mds/CInode.cc + mds/LogEvent.cc + mds/MDSTable.cc + mds/InoTable.cc + mds/JournalPointer.cc + mds/MDSTableClient.cc + mds/MDSTableServer.cc + mds/SimpleLock.cc + mds/SnapRealm.cc + mds/SnapServer.cc + mds/snap.cc + mds/SessionMap.cc + mds/MDSContext.cc + mds/MDSAuthCaps.cc + mds/MDLog.cc + ${CMAKE_SOURCE_DIR}/src/common/TrackedOp.cc + ${CMAKE_SOURCE_DIR}/src/osdc/Journaler.cc) + add_library(mds ${mds_srcs}) + set(ceph_mds_srcs + ceph_mds.cc) + add_executable(ceph-mds ${ceph_mds_srcs} + $ + $) + target_link_libraries(ceph-mds mds osdc ${CMAKE_DL_LIBS} global + ${TCMALLOC_LIBS} boost_thread) + install(TARGETS ceph-mds DESTINATION bin) +endif(${WITH_MDS}) + +add_subdirectory(erasure-code) + +set(crushtool_srcs + tools/crushtool.cc) +add_executable(crushtool ${crushtool_srcs}) +target_link_libraries(crushtool global) + +# Support/Tools +add_subdirectory(gmock) +add_subdirectory(test) +set(cephfs_srcs cephfs.cc) +add_executable(cephfstool ${cephfs_srcs}) +target_link_libraries(cephfstool common ${EXTRALIBS}) +set_target_properties(cephfstool PROPERTIES OUTPUT_NAME cephfs) +install(TARGETS cephfstool DESTINATION bin) + +#set(ceph_srcs tools/ceph.cc tools/common.cc) +#add_executable(ceph ${ceph_srcs}) +#target_link_libraries(ceph global ${LIBEDIT_LIBS}) + +set(ceph_conf_srcs + tools/ceph_conf.cc) +add_executable(ceph-conf ${ceph_conf_srcs}) +target_link_libraries(ceph-conf global) +install(TARGETS ceph-conf DESTINATION bin) + +set(monmaptool_srcs + tools/monmaptool.cc) +add_executable(monmaptool ${monmaptool_srcs}) +target_link_libraries(monmaptool global) +install(TARGETS monmaptool DESTINATION bin) + +set(osdomaptool_srcs + tools/osdmaptool.cc) +add_executable(osdmaptool ${osdomaptool_srcs}) +target_link_libraries(osdmaptool global ${CMAKE_DL_LIBS}) +install(TARGETS 
osdmaptool DESTINATION bin) + +set(ceph_authtool_srcs + tools/ceph_authtool.cc) +add_executable(ceph-authtool ${ceph_authtool_srcs}) +target_link_libraries(ceph-authtool global ${EXTRALIBS} ${CRYPTO_LIBS}) +install(TARGETS ceph-authtool DESTINATION bin) + +configure_file(${CMAKE_SOURCE_DIR}/src/ceph-coverage.in + ${CMAKE_BINARY_DIR}/ceph-coverage @ONLY) + +configure_file(${CMAKE_SOURCE_DIR}/src/ceph-debugpack.in + ${CMAKE_BINARY_DIR}/ceph-debugpack @ONLY) + +configure_file(${CMAKE_SOURCE_DIR}/src/ceph.in.cmake + ${CMAKE_BINARY_DIR}/ceph @ONLY) + +configure_file(${CMAKE_SOURCE_DIR}/src/ceph-crush-location.in + ${CMAKE_BINARY_DIR}/ceph-crush-location @ONLY) + +configure_file(${CMAKE_SOURCE_DIR}/src/init-ceph.in + ${CMAKE_BINARY_DIR}/init-ceph @ONLY) + +install(PROGRAMS + ${CMAKE_BINARY_DIR}/ceph + ${CMAKE_BINARY_DIR}/ceph-debugpack + ${CMAKE_BINARY_DIR}/ceph-coverage + ${CMAKE_BINARY_DIR}/init-ceph + ${CMAKE_SOURCE_DIR}/src/ceph-run + ${CMAKE_SOURCE_DIR}/src/vstart.sh + ${CMAKE_SOURCE_DIR}/src/ceph-clsinfo + DESTINATION bin) + +install(FILES + ${CMAKE_SOURCE_DIR}/doc/start/ceph.conf + DESTINATION ${sysconfdir}/ceph/ RENAME ceph.conf.example) + +install(PROGRAMS + ${CMAKE_SOURCE_DIR}/src/ceph_common.sh + DESTINATION ${CMAKE_INSTALL_PREFIX}/lib/ceph) + +install(PROGRAMS + ${CMAKE_SOURCE_DIR}/src/ceph-create-keys + ${CMAKE_SOURCE_DIR}/src/ceph-disk + ${CMAKE_SOURCE_DIR}/src/ceph-disk-udev + DESTINATION sbin) + +set(parse_secret_files + common/secret.c) +add_library(parse_secret_objs OBJECT ${parse_secret_files}) + +if(WITH_LIBCEPHFS) + set(libclient_srcs + client/Client.cc + client/Dentry.cc + client/Inode.cc + client/MetaRequest.cc + client/ClientSnapRealm.cc + client/MetaSession.cc + client/Trace.cc) + add_library(client ${libclient_srcs}) + target_link_libraries(client osdc mds ${LIBEDIT_LIBS}) + set(libcephfs_srcs libcephfs.cc) + add_library(cephfs SHARED ${libcephfs_srcs}) + target_link_libraries(cephfs client global) + install(TARGETS cephfs DESTINATION lib) + install(DIRECTORY + "${CMAKE_SOURCE_DIR}/src/include/cephfs" + DESTINATION include) + set(ceph_syn_srcs + ceph_syn.cc + client/SyntheticClient.cc) + add_executable(ceph-syn ${ceph_syn_srcs}) + target_link_libraries(ceph-syn client global) + + set(mount_ceph_srcs + mount/mount.ceph.c) + add_executable(mount.ceph ${mount_ceph_srcs} + $ + $) + target_link_libraries(mount.ceph keyutils) + + install(TARGETS ceph-syn DESTINATION bin) + install(TARGETS mount.ceph DESTINATION sbin) + + if(WITH_FUSE) + set(ceph_fuse_srcs + ceph_fuse.cc + client/fuse_ll.cc) + add_executable(ceph-fuse ${ceph_fuse_srcs}) + target_link_libraries(ceph-fuse fuse client global) + install(TARGETS ceph-fuse DESTINATION bin) + endif(WITH_FUSE) +endif(WITH_LIBCEPHFS) + +if(${WITH_RBD}) + set(librbd_srcs + krbd.cc + common/ContextCompletion.cc + librbd/AioCompletion.cc + librbd/AioRequest.cc + librbd/AsyncFlattenRequest.cc + librbd/AsyncObjectThrottle.cc + librbd/AsyncOperation.cc + librbd/AsyncRequest.cc + librbd/AsyncResizeRequest.cc + librbd/AsyncTrimRequest.cc + librbd/CopyupRequest.cc + librbd/ImageCtx.cc + librbd/ImageWatcher.cc + librbd/WatchNotifyTypes.cc + librbd/internal.cc + librbd/librbd.cc + librbd/LibrbdWriteback.cc + librbd/ObjectMap.cc + librbd/RebuildObjectMapRequest.cc) + add_library(librbd ${CEPH_SHARED} ${librbd_srcs} + $ + $) + target_link_libraries(librbd PRIVATE librados common cls_lock_client cls_rbd_client + ${CMAKE_DL_LIBS}) + if(${ENABLE_SHARED}) + set_target_properties(librbd PROPERTIES VERSION "1.0.0" SOVERSION "1" + OUTPUT_NAME rbd) + 
endif(${ENABLE_SHARED}) + install(TARGETS librados librbd DESTINATION lib) + set(rbd_srcs + rbd.cc common/TextTable.cc) + add_executable(rbd ${rbd_srcs} $ + $ + $) + set_target_properties(rbd PROPERTIES OUTPUT_NAME rbd) + target_link_libraries(rbd librbd librados global common keyutils udev + ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS} ${TCMALLOC_LIBS}) + install(TARGETS rbd DESTINATION bin) + install(PROGRAMS ${CMAKE_SOURCE_DIR}/src/ceph-rbdnamer DESTINATION bin) +endif(${WITH_RBD}) + +# RadosGW +if(${WITH_KVS}) + set(kvs_srcs + key_value_store/cls_kvs.cc) + add_library(cls_kvs SHARED ${kvs_srcs}) + set_target_properties(cls_kvs PROPERTIES VERSION "1.0.0" SOVERSION "1") + install(TARGETS cls_kvs DESTINATION lib/rados-classes) +endif(${WITH_KVS}) + +if(${WITH_RADOSGW}) + set(rgw_a_srcs + rgw/librgw.cc + rgw/rgw_acl.cc + rgw/rgw_acl_s3.cc + rgw/rgw_acl_swift.cc + rgw/rgw_client_io.cc + rgw/rgw_fcgi.cc + rgw/rgw_xml.cc + rgw/rgw_usage.cc + rgw/rgw_json_enc.cc + rgw/rgw_user.cc + rgw/rgw_bucket.cc + rgw/rgw_tools.cc + rgw/rgw_rados.cc + rgw/rgw_http_client.cc + rgw/rgw_rest_client.cc + rgw/rgw_rest_conn.cc + rgw/rgw_op.cc + rgw/rgw_common.cc + rgw/rgw_cache.cc + rgw/rgw_formats.cc + rgw/rgw_log.cc + rgw/rgw_multi.cc + rgw/rgw_policy_s3.cc + rgw/rgw_gc.cc + rgw/rgw_multi_del.cc + rgw/rgw_env.cc + rgw/rgw_cors.cc + rgw/rgw_cors_s3.cc + rgw/rgw_auth_s3.cc + rgw/rgw_metadata.cc + rgw/rgw_replica_log.cc + rgw/rgw_keystone.cc + rgw/rgw_quota.cc + rgw/rgw_dencoder.cc) + + add_library(rgw_a STATIC ${rgw_a_srcs}) + + include_directories("${CMAKE_SOURCE_DIR}/src/civetweb/include") + + set(radosgw_srcs + rgw/rgw_resolve.cc + rgw/rgw_rest.cc + rgw/rgw_rest_swift.cc + rgw/rgw_rest_s3.cc + rgw/rgw_rest_usage.cc + rgw/rgw_rest_user.cc + rgw/rgw_rest_bucket.cc + rgw/rgw_rest_metadata.cc + rgw/rgw_replica_log.cc + rgw/rgw_rest_log.cc + rgw/rgw_rest_opstate.cc + rgw/rgw_rest_replica_log.cc + rgw/rgw_rest_config.cc + rgw/rgw_http_client.cc + rgw/rgw_swift.cc + rgw/rgw_swift_auth.cc + rgw/rgw_loadgen.cc + rgw/rgw_civetweb.cc + rgw/rgw_civetweb_log.cc + civetweb/src/civetweb.c + rgw/rgw_main.cc) + + set(radosgw_admin_srcs + rgw/rgw_admin.cc + rgw/rgw_orphan.cc) + + add_executable(radosgw ${radosgw_srcs} $) + target_link_libraries(radosgw rgw_a librados + cls_rgw_client cls_lock_client cls_refcount_client + cls_log_client cls_statelog_client cls_version_client + cls_replica_log_client cls_user_client + curl expat global fcgi resolv ${BLKID_LIBRARIES} ${TCMALLOC_LIBS}) + install(TARGETS radosgw DESTINATION bin) + + add_executable(radosgw-admin ${radosgw_admin_srcs} $) + target_link_libraries(radosgw-admin rgw_a librados + cls_rgw_client cls_lock_client cls_refcount_client + cls_log_client cls_statelog_client cls_version_client + cls_replica_log_client cls_user_client + curl expat global fcgi resolv ${BLKID_LIBRARIES} ${TCMALLOC_LIBS}) + install(TARGETS radosgw-admin DESTINATION bin) +endif(${WITH_RADOSGW}) diff --git a/src/Makefile-client.am b/src/Makefile-client.am new file mode 100644 index 0000000000000..d68d70db3185a --- /dev/null +++ b/src/Makefile-client.am @@ -0,0 +1,116 @@ +bash_completiondir = $(sysconfdir)/bash_completion.d +bash_completion_DATA = $(srcdir)/bash_completion/ceph + +bin_SCRIPTS += \ + ceph \ + ceph-post-file + +python_PYTHON += \ + pybind/ceph_argparse.py \ + pybind/ceph_daemon.py + +ceph_syn_SOURCES = ceph_syn.cc +ceph_syn_SOURCES += client/SyntheticClient.cc # uses g_conf.. 
needs cleanup +ceph_syn_LDADD = $(LIBCLIENT) $(CEPH_GLOBAL) +bin_PROGRAMS += ceph-syn + +# assemble Python script with global version variables +# NB: depends on format of ceph_ver.h + +ceph: ceph.in ./ceph_ver.h Makefile + rm -f $@ $@.tmp + echo "#!/usr/bin/env python" >$@.tmp + grep "#define CEPH_GIT_NICE_VER" $(srcdir)/ceph_ver.h | \ + sed -e 's/#define \(.*VER\) /\1=/' >>$@.tmp + grep "#define CEPH_GIT_VER" $(srcdir)/ceph_ver.h | \ + sed -e 's/#define \(.*VER\) /\1=/' -e 's/=\(.*\)$$/="\1"/' >>$@.tmp + cat $(srcdir)/$@.in >>$@.tmp + chmod a+x $@.tmp + chmod a-w $@.tmp + mv $@.tmp $@ + + +if WITH_RADOS + +bash_completion_DATA += \ + $(srcdir)/bash_completion/rados \ + $(srcdir)/bash_completion/radosgw-admin + +python_PYTHON += pybind/rados.py + +librados_config_SOURCES = librados-config.cc +librados_config_LDADD = $(LIBRADOS) $(CEPH_GLOBAL) +bin_PROGRAMS += librados-config + +if WITH_RBD + +bash_completion_DATA += \ + $(srcdir)/bash_completion/rbd + +bin_SCRIPTS += \ + ceph-rbdnamer \ + rbd-replay-many + +python_PYTHON += pybind/rbd.py + +libkrbd_la_SOURCES = krbd.cc +libkrbd_la_LIBADD = $(LIBSECRET) $(LIBCOMMON) -lblkid -ludev +if LINUX +noinst_LTLIBRARIES += libkrbd.la +endif # LINUX + +rbd_SOURCES = rbd.cc +rbd_LDADD = $(LIBKRBD) $(LIBRBD) $(LIBRADOS) $(CEPH_GLOBAL) +if LINUX +bin_PROGRAMS += rbd +endif # LINUX + +endif # WITH_RBD + +# Fuse targets + +if WITH_FUSE +ceph_fuse_SOURCES = ceph_fuse.cc +ceph_fuse_LDADD = $(LIBCLIENT_FUSE) $(CEPH_GLOBAL) +bin_PROGRAMS += ceph-fuse + +if WITH_RBD +rbd_fuse_SOURCES = rbd_fuse/rbd-fuse.cc +rbd_fuse_LDADD = -lfuse $(LIBRBD) $(LIBRADOS) $(CEPH_GLOBAL) +bin_PROGRAMS += rbd-fuse +endif # WITH_RBD +endif # WITH_FUSE + + +if WITH_CEPHFS +cephfs_SOURCES = cephfs.cc +cephfs_LDADD = $(LIBCOMMON) +bin_PROGRAMS += cephfs + +python_PYTHON += pybind/cephfs.py + +# libcephfs (this should go somewhere else in the future) + +libcephfs_la_SOURCES = libcephfs.cc +libcephfs_la_LIBADD = $(LIBCLIENT) $(LIBCOMMON) $(PTHREAD_LIBS) $(CRYPTO_LIBS) $(EXTRALIBS) +libcephfs_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0 -export-symbols-regex '^ceph_.*' +libcephfs_la_LDFLAGS += -Xcompiler -Xlinker -Xcompiler '--exclude-libs=libcommon.a' +lib_LTLIBRARIES += libcephfs.la + +# jni library (java source is in src/java) + +if ENABLE_CEPHFS_JAVA +libcephfs_jni_la_SOURCES = \ + java/native/libcephfs_jni.cc \ + java/native/ScopedLocalRef.h \ + java/native/JniConstants.cpp \ + java/native/JniConstants.h +libcephfs_jni_la_LIBADD = $(LIBCEPHFS) $(EXTRALIBS) +libcephfs_jni_la_CPPFLAGS = $(JDK_CPPFLAGS) $(AM_CPPFLAGS) +libcephfs_jni_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0 +lib_LTLIBRARIES += libcephfs_jni.la +endif # ENABLE_CEPHFS_JAVA + +endif # WITH_CEPHFS + +endif # WITH_RADOS diff --git a/src/Makefile-env.am b/src/Makefile-env.am index f2ab6558bc845..e349b4f919647 100644 --- a/src/Makefile-env.am +++ b/src/Makefile-env.am @@ -29,12 +29,16 @@ ceph_sbindir = $(sbindir) # certain things go straight into /sbin, though! 
su_sbindir = /sbin -# C/C++ tests to build will be appended to this -check_PROGRAMS = +# C/C++ tests to build and executed will be appended to this +check_TESTPROGRAMS = +check_PROGRAMS = $(check_TESTPROGRAMS) # tests scripts will be appended to this check_SCRIPTS = +# display the output of failed check_SCRIPTS after a failed make check +export VERBOSE = true + # python unit tests need to know where the scripts are located export PYTHONPATH=$(top_srcdir)/src/pybind @@ -47,6 +51,31 @@ endif ################################## ## automake environment +HARDENING_CFLAGS = \ + -O2 \ + -g \ + -pipe \ + -Wall \ + -Wp,-D_FORTIFY_SOURCE=2 \ + -fexceptions \ + --param=ssp-buffer-size=4 \ + -fPIE + +SET_STACK_PROTECTOR_STRONG = $(shell expr `gcc -dumpversion` \>= 4.9) + + ifeq ($(SET_STACK_PROTECTOR_STRONG),1) + HARDENING_CFLAGS += -fstack-protector-strong + else + HARDENING_CFLAGS += -fstack-protector + endif + + +HARDENING_LDFLAGS = \ + -pie \ + -Wl,-z,relro \ + -Wl,-z,now + + AM_COMMON_CPPFLAGS = \ -D__CEPH__ \ -D_FILE_OFFSET_BITS=64 \ @@ -64,21 +93,21 @@ AM_COMMON_CFLAGS = \ ${WARN_IGNORED_QUALIFIERS} \ -Winit-self \ -Wpointer-arith \ - -Werror=format-security \ + ${WARN_ERROR_FORMAT_SECURITY} \ -fno-strict-aliasing \ -fsigned-char if !CLANG AM_COMMON_CFLAGS += -rdynamic endif -AM_CFLAGS = $(AM_COMMON_CFLAGS) +AM_CFLAGS = $(AM_COMMON_CFLAGS) $(HARDENING_CFLAGS) AM_CPPFLAGS = $(AM_COMMON_CPPFLAGS) AM_CXXFLAGS = \ @AM_CXXFLAGS@ \ $(AM_COMMON_CFLAGS) \ -ftemplate-depth-1024 \ -Wnon-virtual-dtor \ - -Wno-invalid-offsetof + -Wno-invalid-offsetof $(HARDENING_CFLAGS) if !CLANG AM_CXXFLAGS += -Wstrict-null-sentinel endif @@ -93,7 +122,7 @@ endif # http://sigquit.wordpress.com/2011/02/16/why-asneeded-doesnt-work-as-expected-for-your-libraries-on-your-autotools-project/ AM_LDFLAGS = if LINUX -AM_LDFLAGS += -Wl,--as-needed +AM_LDFLAGS += -Wl,--as-needed $(HARDENING_LDFLAGS) endif if USE_BOOST_SPIRIT_OLD_HDR @@ -137,6 +166,7 @@ LIBPERFGLUE = libperfglue.la LIBAUTH = libauth.la LIBMSG = libmsg.la LIBCRUSH = libcrush.la +LIBCOMPRESSOR = libcompressor.la -lsnappy LIBJSON_SPIRIT = libjson_spirit.la LIBLOG = liblog.la LIBOS = libos.la @@ -152,7 +182,9 @@ LIBCLIENT_FUSE = libclient_fuse.la LIBRADOS = librados.la LIBRADOSSTRIPER = libradosstriper.la LIBRGW = librgw.la +LIBCIVETWEB = libcivetweb.la LIBRBD = librbd.la +LIBRBD_TYPES = librbd_types.la LIBKRBD = libkrbd.la LIBCEPHFS = libcephfs.la LIBERASURE_CODE = liberasure_code.la @@ -173,10 +205,21 @@ if WITH_LIBROCKSDB LIBOS += libos_rocksdb.la endif # WITH_LIBROCKSDB +if WITH_TCMALLOC_MINIMAL +LIBPERFGLUE += -ltcmalloc_minimal +endif # WITH_TCMALLOC_MINIMAL + if WITH_TCMALLOC LIBPERFGLUE += -ltcmalloc endif # WITH_TCMALLOC +if WITH_JEMALLOC +LIBMON += -ljemalloc +LIBOSD += -ljemalloc +LIBMDS += -ljemalloc +LIBRGW += -ljemalloc +endif # WITH_JEMALLOC + if ENABLE_COVERAGE EXTRALIBS += -lgcov endif # ENABLE_COVERAGE diff --git a/src/Makefile-rocksdb.am b/src/Makefile-rocksdb.am new file mode 100644 index 0000000000000..fb642912a00ad --- /dev/null +++ b/src/Makefile-rocksdb.am @@ -0,0 +1,347 @@ +if WITH_SLIBROCKSDB + SUBDIRS += rocksdb +else + EXTRA_DIST += \ + rocksdb/.gitignore \ + rocksdb/CONTRIBUTING.md \ + rocksdb/HISTORY.md \ + rocksdb/INSTALL.md \ + rocksdb/LICENSE \ + rocksdb/Makefile.am \ + rocksdb/PATENTS \ + rocksdb/README.md \ + rocksdb/ROCKSDB_LITE.md \ + rocksdb/configure.ac \ + rocksdb/Makefile.am \ + rocksdb/db/builder.cc \ + rocksdb/db/builder.h \ + rocksdb/db/c.cc \ + rocksdb/db/column_family.cc \ + rocksdb/db/column_family.h \ + 
rocksdb/db/compaction.cc \ + rocksdb/db/compaction.h \ + rocksdb/db/compaction_job.cc \ + rocksdb/db/compaction_job.h \ + rocksdb/db/compaction_picker.cc \ + rocksdb/db/compaction_picker.h \ + rocksdb/db/db_filesnapshot.cc \ + rocksdb/db/dbformat.cc \ + rocksdb/db/dbformat.h \ + rocksdb/db/db_impl.cc \ + rocksdb/db/db_impl_debug.cc \ + rocksdb/db/db_impl.h \ + rocksdb/db/db_impl_readonly.cc \ + rocksdb/db/db_impl_readonly.h \ + rocksdb/db/db_iter.cc \ + rocksdb/db/db_iter.h \ + rocksdb/db/file_indexer.cc \ + rocksdb/db/file_indexer.h \ + rocksdb/db/filename.cc \ + rocksdb/db/filename.h \ + rocksdb/db/flush_job.cc \ + rocksdb/db/flush_job.h \ + rocksdb/db/flush_scheduler.cc \ + rocksdb/db/flush_scheduler.h \ + rocksdb/db/forward_iterator.cc \ + rocksdb/db/forward_iterator.h \ + rocksdb/db/internal_stats.cc \ + rocksdb/db/internal_stats.h \ + rocksdb/db/job_context.h \ + rocksdb/db/log_format.h \ + rocksdb/db/log_reader.cc \ + rocksdb/db/log_reader.h \ + rocksdb/db/log_writer.cc \ + rocksdb/db/log_writer.h \ + rocksdb/db/managed_iterator.cc \ + rocksdb/db/managed_iterator.h \ + rocksdb/db/memtable_allocator.cc \ + rocksdb/db/memtable_allocator.h \ + rocksdb/db/memtable.cc \ + rocksdb/db/memtable.h \ + rocksdb/db/memtable_list.cc \ + rocksdb/db/memtable_list.h \ + rocksdb/db/merge_context.h \ + rocksdb/db/merge_helper.cc \ + rocksdb/db/merge_helper.h \ + rocksdb/db/merge_operator.cc \ + rocksdb/db/repair.cc \ + rocksdb/db/skiplist.h \ + rocksdb/db/slice.cc \ + rocksdb/db/snapshot.h \ + rocksdb/db/table_cache.cc \ + rocksdb/db/table_cache.h \ + rocksdb/db/table_properties_collector.cc \ + rocksdb/db/table_properties_collector.h \ + rocksdb/db/transaction_log_impl.cc \ + rocksdb/db/transaction_log_impl.h \ + rocksdb/db/version_builder.cc \ + rocksdb/db/version_builder.h \ + rocksdb/db/version_edit.cc \ + rocksdb/db/version_edit.h \ + rocksdb/db/version_set.cc \ + rocksdb/db/version_set.h \ + rocksdb/db/wal_manager.cc \ + rocksdb/db/wal_manager.h \ + rocksdb/db/write_batch_base.cc \ + rocksdb/db/write_batch.cc \ + rocksdb/db/write_batch_internal.h \ + rocksdb/db/writebuffer.h \ + rocksdb/db/write_controller.cc \ + rocksdb/db/write_controller.h \ + rocksdb/db/write_thread.cc \ + rocksdb/db/write_thread.h \ + rocksdb/hdfs/README \ + rocksdb/hdfs/env_hdfs.h \ + rocksdb/hdfs/setup.sh \ + rocksdb/include/rocksdb/cache.h \ + rocksdb/include/rocksdb/c.h \ + rocksdb/include/rocksdb/compaction_filter.h \ + rocksdb/include/rocksdb/comparator.h \ + rocksdb/include/rocksdb/db.h \ + rocksdb/include/rocksdb/env.h \ + rocksdb/include/rocksdb/filter_policy.h \ + rocksdb/include/rocksdb/flush_block_policy.h \ + rocksdb/include/rocksdb/immutable_options.h \ + rocksdb/include/rocksdb/iostats_context.h \ + rocksdb/include/rocksdb/iterator.h \ + rocksdb/include/rocksdb/ldb_tool.h \ + rocksdb/include/rocksdb/listener.h \ + rocksdb/include/rocksdb/memtablerep.h \ + rocksdb/include/rocksdb/merge_operator.h \ + rocksdb/include/rocksdb/metadata.h \ + rocksdb/include/rocksdb/options.h \ + rocksdb/include/rocksdb/perf_context.h \ + rocksdb/include/rocksdb/rate_limiter.h \ + rocksdb/include/rocksdb/slice.h \ + rocksdb/include/rocksdb/slice_transform.h \ + rocksdb/include/rocksdb/sst_dump_tool.h \ + rocksdb/include/rocksdb/statistics.h \ + rocksdb/include/rocksdb/status.h \ + rocksdb/include/rocksdb/table.h \ + rocksdb/include/rocksdb/table_properties.h \ + rocksdb/include/rocksdb/thread_status.h \ + rocksdb/include/rocksdb/transaction_log.h \ + rocksdb/include/rocksdb/types.h \ + 
rocksdb/include/rocksdb/universal_compaction.h \ + rocksdb/include/rocksdb/utilities/backupable_db.h \ + rocksdb/include/rocksdb/utilities/checkpoint.h \ + rocksdb/include/rocksdb/utilities/convenience.h \ + rocksdb/include/rocksdb/utilities/db_ttl.h \ + rocksdb/include/rocksdb/utilities/document_db.h \ + rocksdb/include/rocksdb/utilities/geo_db.h \ + rocksdb/include/rocksdb/utilities/json_document.h \ + rocksdb/include/rocksdb/utilities/leveldb_options.h \ + rocksdb/include/rocksdb/utilities/spatial_db.h \ + rocksdb/include/rocksdb/utilities/stackable_db.h \ + rocksdb/include/rocksdb/utilities/utility_db.h \ + rocksdb/include/rocksdb/utilities/write_batch_with_index.h \ + rocksdb/include/rocksdb/version.h \ + rocksdb/include/rocksdb/write_batch_base.h \ + rocksdb/include/rocksdb/write_batch.h \ + rocksdb/include/utilities/backupable_db.h \ + rocksdb/include/utilities/db_ttl.h \ + rocksdb/include/utilities/geo_db.h \ + rocksdb/include/utilities/stackable_db.h \ + rocksdb/include/utilities/utility_db.h \ + rocksdb/m4/libtool.m4 \ + rocksdb/m4/lt~obsolete.m4 \ + rocksdb/m4/ltoptions.m4 \ + rocksdb/m4/ltsugar.m4 \ + rocksdb/m4/ltversion.m4 \ + rocksdb/port/likely.h \ + rocksdb/port/port.h \ + rocksdb/port/port_posix.cc \ + rocksdb/port/port_posix.h \ + rocksdb/port/stack_trace.cc \ + rocksdb/port/stack_trace.h \ + rocksdb/table/adaptive_table_factory.cc \ + rocksdb/table/adaptive_table_factory.h \ + rocksdb/table/block_based_filter_block.cc \ + rocksdb/table/block_based_filter_block.h \ + rocksdb/table/block_based_table_builder.cc \ + rocksdb/table/block_based_table_builder.h \ + rocksdb/table/block_based_table_factory.cc \ + rocksdb/table/block_based_table_factory.h \ + rocksdb/table/block_based_table_reader.cc \ + rocksdb/table/block_based_table_reader.h \ + rocksdb/table/block_builder.cc \ + rocksdb/table/block_builder.h \ + rocksdb/table/block.cc \ + rocksdb/table/block.h \ + rocksdb/table/block_hash_index.cc \ + rocksdb/table/block_hash_index.h \ + rocksdb/table/block_prefix_index.cc \ + rocksdb/table/block_prefix_index.h \ + rocksdb/table/bloom_block.cc \ + rocksdb/table/bloom_block.h \ + rocksdb/table/cuckoo_table_builder.cc \ + rocksdb/table/cuckoo_table_builder.h \ + rocksdb/table/cuckoo_table_factory.cc \ + rocksdb/table/cuckoo_table_factory.h \ + rocksdb/table/cuckoo_table_reader.cc \ + rocksdb/table/cuckoo_table_reader.h \ + rocksdb/table/filter_block.h \ + rocksdb/table/flush_block_policy.cc \ + rocksdb/table/format.cc \ + rocksdb/table/format.h \ + rocksdb/table/full_filter_block.cc \ + rocksdb/table/full_filter_block.h \ + rocksdb/table/get_context.cc \ + rocksdb/table/get_context.h \ + rocksdb/table/iterator.cc \ + rocksdb/table/iterator_wrapper.h \ + rocksdb/table/iter_heap.h \ + rocksdb/table/merger.cc \ + rocksdb/table/merger.h \ + rocksdb/table/meta_blocks.cc \ + rocksdb/table/meta_blocks.h \ + rocksdb/table/mock_table.h \ + rocksdb/table/plain_table_builder.cc \ + rocksdb/table/plain_table_builder.h \ + rocksdb/table/plain_table_factory.cc \ + rocksdb/table/plain_table_factory.h \ + rocksdb/table/plain_table_index.cc \ + rocksdb/table/plain_table_index.h \ + rocksdb/table/plain_table_key_coding.cc \ + rocksdb/table/plain_table_key_coding.h \ + rocksdb/table/plain_table_reader.cc \ + rocksdb/table/plain_table_reader.h \ + rocksdb/table/table_builder.h \ + rocksdb/table/table_properties.cc \ + rocksdb/table/table_properties_internal.h \ + rocksdb/table/table_reader.h \ + rocksdb/table/two_level_iterator.cc \ + rocksdb/table/two_level_iterator.h \ + 
rocksdb/third-party/gtest-1.7.0/fused-src/gtest/gtest.h \ + rocksdb/third-party/gtest-1.7.0/fused-src/gtest/gtest-all.cc \ + rocksdb/third-party/fbson/FbsonDocument.h \ + rocksdb/third-party/fbson/FbsonJsonParser.h \ + rocksdb/third-party/fbson/FbsonStream.h \ + rocksdb/third-party/fbson/FbsonUtil.h \ + rocksdb/third-party/fbson/FbsonWriter.h \ + rocksdb/util/allocator.h \ + rocksdb/util/arena.cc \ + rocksdb/util/arena.h \ + rocksdb/util/auto_roll_logger.cc \ + rocksdb/util/auto_roll_logger.h \ + rocksdb/util/autovector.h \ + rocksdb/util/benchharness.h \ + rocksdb/util/bloom.cc \ + rocksdb/util/build_version.h \ + rocksdb/util/cache.cc \ + rocksdb/util/coding.cc \ + rocksdb/util/coding.h \ + rocksdb/util/comparator.cc \ + rocksdb/util/compression.h \ + rocksdb/util/crc32c.cc \ + rocksdb/util/crc32c.h \ + rocksdb/util/db_info_dumper.cc \ + rocksdb/util/db_info_dumper.h \ + rocksdb/util/dynamic_bloom.cc \ + rocksdb/util/dynamic_bloom.h \ + rocksdb/util/env.cc \ + rocksdb/util/env_hdfs.cc \ + rocksdb/util/env_posix.cc \ + rocksdb/util/event_logger.cc \ + rocksdb/util/event_logger.h \ + rocksdb/util/file_util.cc \ + rocksdb/util/file_util.h \ + rocksdb/util/filter_policy.cc \ + rocksdb/util/hash.cc \ + rocksdb/util/hash_cuckoo_rep.cc \ + rocksdb/util/hash_cuckoo_rep.h \ + rocksdb/util/hash.h \ + rocksdb/util/hash_linklist_rep.cc \ + rocksdb/util/hash_linklist_rep.h \ + rocksdb/util/hash_skiplist_rep.cc \ + rocksdb/util/hash_skiplist_rep.h \ + rocksdb/util/histogram.cc \ + rocksdb/util/histogram.h \ + rocksdb/util/instrumented_mutex.cc \ + rocksdb/util/instrumented_mutex.h \ + rocksdb/util/iostats_context.cc \ + rocksdb/util/iostats_context_imp.h \ + rocksdb/utilities/backupable/backupable_db.cc \ + rocksdb/utilities/checkpoint/checkpoint.cc \ + rocksdb/utilities/compacted_db/compacted_db_impl.cc \ + rocksdb/utilities/compacted_db/compacted_db_impl.h \ + rocksdb/utilities/convenience/convenience.cc \ + rocksdb/utilities/document/document_db.cc \ + rocksdb/utilities/document/json_document_builder.cc \ + rocksdb/utilities/document/json_document.cc \ + rocksdb/utilities/geodb/geodb_impl.cc \ + rocksdb/utilities/geodb/geodb_impl.h \ + rocksdb/utilities/leveldb_options/leveldb_options.cc \ + rocksdb/utilities/merge_operators/put.cc \ + rocksdb/utilities/merge_operators/string_append/stringappend2.cc \ + rocksdb/utilities/merge_operators/string_append/stringappend2.h \ + rocksdb/utilities/merge_operators/string_append/stringappend.cc \ + rocksdb/utilities/merge_operators/string_append/stringappend.h \ + rocksdb/utilities/merge_operators/uint64add.cc \ + rocksdb/utilities/merge_operators.h \ + rocksdb/utilities/redis/redis_list_exception.h \ + rocksdb/utilities/redis/redis_list_iterator.h \ + rocksdb/utilities/redis/redis_lists.cc \ + rocksdb/utilities/redis/redis_lists.h \ + rocksdb/utilities/spatialdb/spatial_db.cc \ + rocksdb/utilities/spatialdb/utils.h \ + rocksdb/utilities/ttl/db_ttl_impl.cc \ + rocksdb/utilities/ttl/db_ttl_impl.h \ + rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc \ + rocksdb/util/ldb_cmd.cc \ + rocksdb/util/ldb_cmd_execute_result.h \ + rocksdb/util/ldb_cmd.h \ + rocksdb/util/ldb_tool.cc \ + rocksdb/util/log_buffer.cc \ + rocksdb/util/log_buffer.h \ + rocksdb/util/logging.cc \ + rocksdb/util/logging.h \ + rocksdb/util/memenv.cc \ + rocksdb/util/mock_env.h \ + rocksdb/util/murmurhash.cc \ + rocksdb/util/murmurhash.h \ + rocksdb/util/mutable_cf_options.cc \ + rocksdb/util/mutable_cf_options.h \ + rocksdb/util/mutexlock.h \ + rocksdb/util/options_builder.cc \ + 
rocksdb/util/options.cc \ + rocksdb/util/options_helper.cc \ + rocksdb/util/options_helper.h \ + rocksdb/util/perf_context.cc \ + rocksdb/util/perf_context_imp.h \ + rocksdb/util/posix_logger.h \ + rocksdb/util/random.h \ + rocksdb/util/rate_limiter.cc \ + rocksdb/util/rate_limiter.h \ + rocksdb/util/scoped_arena_iterator.h \ + rocksdb/util/skiplistrep.cc \ + rocksdb/util/slice.cc \ + rocksdb/util/sst_dump_tool.cc \ + rocksdb/util/sst_dump_tool_imp.h \ + rocksdb/util/statistics.cc \ + rocksdb/util/statistics.h \ + rocksdb/util/status.cc \ + rocksdb/util/stl_wrappers.h \ + rocksdb/util/stop_watch.h \ + rocksdb/util/string_util.cc \ + rocksdb/util/string_util.h \ + rocksdb/util/sync_point.cc \ + rocksdb/util/sync_point.h \ + rocksdb/util/testharness.h \ + rocksdb/util/testutil.h \ + rocksdb/util/thread_local.cc \ + rocksdb/util/thread_local.h \ + rocksdb/util/thread_operation.h \ + rocksdb/util/thread_status_impl.cc \ + rocksdb/util/thread_status_updater.cc \ + rocksdb/util/thread_status_updater_debug.cc \ + rocksdb/util/thread_status_updater.h \ + rocksdb/util/thread_status_util.cc \ + rocksdb/util/thread_status_util_debug.cc \ + rocksdb/util/thread_status_util.h \ + rocksdb/util/vectorrep.cc \ + rocksdb/util/xfunc.cc \ + rocksdb/util/xfunc.h \ + rocksdb/util/xxhash.cc \ + rocksdb/util/xxhash.h +endif # WITH_SLIBROCKSDB diff --git a/src/Makefile-server.am b/src/Makefile-server.am new file mode 100644 index 0000000000000..689b5c445f67e --- /dev/null +++ b/src/Makefile-server.am @@ -0,0 +1,67 @@ +ceph_sbin_SCRIPTS = ceph-create-keys + +bin_SCRIPTS += \ + ceph-run \ + ceph-rest-api \ + ceph-debugpack \ + ceph-crush-location + +python_PYTHON += pybind/ceph_rest_api.py + +shell_scripts += ceph-coverage + +bin_SCRIPTS += ceph-coverage + +BUILT_SOURCES += init-ceph + +shell_scripts += init-ceph + +mount_ceph_SOURCES = mount/mount.ceph.c +mount_ceph_LDADD = $(LIBSECRET) $(LIBCOMMON) +if LINUX +su_sbin_PROGRAMS += mount.ceph +endif # LINUX +su_sbin_SCRIPTS += mount.fuse.ceph + + +if WITH_MON + +ceph_mon_SOURCES = ceph_mon.cc +ceph_mon_LDADD = $(LIBMON) $(LIBOS) $(CEPH_GLOBAL) $(LIBCOMMON) +bin_PROGRAMS += ceph-mon + +endif # WITH_MON + + +if WITH_OSD + +ceph_sbin_SCRIPTS += \ + ceph-disk \ + ceph-disk-udev + +bin_SCRIPTS += \ + ceph-clsinfo + +ceph_osd_SOURCES = ceph_osd.cc +ceph_osd_LDADD = $(LIBOSD) $(CEPH_GLOBAL) $(LIBCOMMON) +bin_PROGRAMS += ceph-osd + +endif # WITH_OSD + + +if WITH_MDS + +ceph_mds_SOURCES = ceph_mds.cc +ceph_mds_LDADD = $(LIBMDS) $(LIBOSDC) $(CEPH_GLOBAL) $(LIBCOMMON) +bin_PROGRAMS += ceph-mds + +endif # WITH_MDS + + +if ENABLE_COVERAGE + +COV_DIR = $(DESTDIR)$(libdir)/ceph/coverage +COV_FILES = $(srcdir)/*.gcno +COV_LIB_FILES = $(srcdir)/.libs/*.gcno + +endif # ENABLE_COVERAGE diff --git a/src/Makefile.am b/src/Makefile.am index 1cc80f9e7c1db..502f83a014fc4 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1,15 +1,18 @@ include Makefile-env.am SUBDIRS += ocf java tracing -DIST_SUBDIRS += gtest ocf libs3 java tracing - +DIST_SUBDIRS += gmock ocf java tracing +if NO_GIT_VERSION +export NO_VERSION="yes" +endif # subdirs include arch/Makefile.am include auth/Makefile.am include brag/Makefile.am +include ceph-detect-init/Makefile.am include crush/Makefile.am include mon/Makefile.am include mds/Makefile.am @@ -35,95 +38,8 @@ include key_value_store/Makefile.am include rbd_replay/Makefile.am include test/Makefile.am include tools/Makefile.am - - -# core daemons - -ceph_mon_SOURCES = ceph_mon.cc -ceph_mon_LDADD = $(LIBMON) $(LIBOS) $(CEPH_GLOBAL) $(LIBCOMMON) -bin_PROGRAMS += 
ceph-mon - -ceph_osd_SOURCES = ceph_osd.cc -ceph_osd_LDADD = $(LIBOSD) $(CEPH_GLOBAL) $(LIBCOMMON) -bin_PROGRAMS += ceph-osd - -ceph_mds_SOURCES = ceph_mds.cc -ceph_mds_LDADD = $(LIBMDS) $(LIBOSDC) $(CEPH_GLOBAL) $(LIBCOMMON) -bin_PROGRAMS += ceph-mds - - -# admin tools - - -# user tools - -mount_ceph_SOURCES = mount/mount.ceph.c -mount_ceph_LDADD = $(LIBSECRET) $(LIBCOMMON) -if LINUX -su_sbin_PROGRAMS += mount.ceph -endif # LINUX -su_sbin_SCRIPTS += mount.fuse.ceph - -cephfs_SOURCES = cephfs.cc -cephfs_LDADD = $(LIBCOMMON) -bin_PROGRAMS += cephfs - -librados_config_SOURCES = librados-config.cc -librados_config_LDADD = $(LIBRADOS) $(CEPH_GLOBAL) -bin_PROGRAMS += librados-config - -ceph_syn_SOURCES = ceph_syn.cc -ceph_syn_SOURCES += client/SyntheticClient.cc # uses g_conf.. needs cleanup -ceph_syn_LDADD = $(LIBCLIENT) $(CEPH_GLOBAL) -bin_PROGRAMS += ceph-syn - - -libkrbd_la_SOURCES = krbd.cc -libkrbd_la_LIBADD = $(LIBSECRET) $(LIBCOMMON) -lblkid -ludev -if LINUX -noinst_LTLIBRARIES += libkrbd.la -endif LINUX - -rbd_SOURCES = rbd.cc -rbd_LDADD = $(LIBKRBD) $(LIBRBD) $(LIBRADOS) $(CEPH_GLOBAL) -if LINUX -bin_PROGRAMS += rbd -endif #LINUX - - -# Fuse targets - -if WITH_FUSE -ceph_fuse_SOURCES = ceph_fuse.cc -ceph_fuse_LDADD = $(LIBCLIENT_FUSE) $(CEPH_GLOBAL) -bin_PROGRAMS += ceph-fuse - -rbd_fuse_SOURCES = rbd_fuse/rbd-fuse.c -rbd_fuse_LDADD = -lfuse $(LIBRBD) $(LIBRADOS) $(CEPH_GLOBAL) -bin_PROGRAMS += rbd-fuse -endif # WITH_FUSE - - -# libcephfs (this should go somewhere else in the future) - -libcephfs_la_SOURCES = libcephfs.cc -libcephfs_la_LIBADD = $(LIBCLIENT) $(LIBCOMMON) $(PTHREAD_LIBS) $(CRYPTO_LIBS) $(EXTRALIBS) -libcephfs_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0 -export-symbols-regex '^ceph_.*' -lib_LTLIBRARIES += libcephfs.la - -# jni library (java source is in src/java) - -if ENABLE_CEPHFS_JAVA -libcephfs_jni_la_SOURCES = \ - java/native/libcephfs_jni.cc \ - java/native/ScopedLocalRef.h \ - java/native/JniConstants.cpp \ - java/native/JniConstants.h -libcephfs_jni_la_LIBADD = $(LIBCEPHFS) $(EXTRALIBS) -libcephfs_jni_la_CPPFLAGS = $(JDK_CPPFLAGS) $(AM_CPPFLAGS) -libcephfs_jni_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0 -lib_LTLIBRARIES += libcephfs_jni.la -endif +include Makefile-rocksdb.am +include compressor/Makefile.am # shell scripts @@ -152,21 +68,18 @@ CLEANFILES += $(shell_scripts) # extra bits EXTRA_DIST += \ - $(srcdir)/verify-mds-journal.sh \ $(srcdir)/vstart.sh \ $(srcdir)/stop.sh \ ceph-run \ $(srcdir)/ceph-osd-prestart.sh \ $(srcdir)/ceph_common.sh \ $(srcdir)/init-radosgw \ - $(srcdir)/init-radosgw.sysv \ $(srcdir)/init-rbdmap \ $(srcdir)/ceph-clsinfo \ $(srcdir)/make_version \ - $(srcdir)/check_version \ $(srcdir)/.git_version \ $(srcdir)/ceph-rbdnamer \ - $(srcdir)/test/encoding/readable.sh \ + $(srcdir)/tools/ceph-monstore-update-crush.sh \ $(srcdir)/upstart/ceph-all.conf \ $(srcdir)/upstart/ceph-mon.conf \ $(srcdir)/upstart/ceph-mon-all.conf \ @@ -184,8 +97,6 @@ EXTRA_DIST += \ $(srcdir)/upstart/rbdmap.conf \ ceph.in \ ceph-disk \ - ceph-disk-prepare \ - ceph-disk-activate \ ceph-disk-udev \ ceph-create-keys \ ceph-rest-api \ @@ -196,398 +107,8 @@ EXTRA_DIST += \ yasm-wrapper EXTRA_DIST += \ - libs3/COPYING \ - libs3/ChangeLog \ - libs3/GNUmakefile \ - libs3/GNUmakefile.mingw \ - libs3/GNUmakefile.osx \ - libs3/INSTALL \ - libs3/LICENSE \ - libs3/README \ - libs3/TODO \ - libs3/archlinux \ - libs3/debian \ - libs3/doxyfile \ - libs3/inc \ - libs3/libs3.spec \ - libs3/mswin \ - libs3/src \ - libs3/test \ unittest_bufferlist.sh -if WITH_SLIBROCKSDB - 
SUBDIRS += rocksdb -else - EXTRA_DIST += \ - rocksdb/.arcconfig \ - rocksdb/.clang-format \ - rocksdb/.gitignore \ - rocksdb/CONTRIBUTING.md \ - rocksdb/HISTORY.md \ - rocksdb/INSTALL.md \ - rocksdb/LICENSE \ - rocksdb/Makefile.am \ - rocksdb/PATENTS \ - rocksdb/README \ - rocksdb/ROCKSDB_LITE.md \ - rocksdb/build_tools/build_detect_platform \ - rocksdb/build_tools/build_detect_version \ - rocksdb/build_tools/fbcode.clang31.sh \ - rocksdb/build_tools/fbcode.gcc471.sh \ - rocksdb/build_tools/fbcode.gcc481.sh \ - rocksdb/build_tools/format-diff.sh \ - rocksdb/build_tools/mac-install-gflags.sh \ - rocksdb/build_tools/make_new_version.sh \ - rocksdb/build_tools/regression_build_test.sh \ - rocksdb/build_tools/valgrind_test.sh \ - rocksdb/configure.ac \ - rocksdb/coverage/coverage_test.sh \ - rocksdb/coverage/parse_gcov_output.py \ - rocksdb/db/builder.cc \ - rocksdb/db/builder.h \ - rocksdb/db/c.cc \ - rocksdb/db/c_test.c \ - rocksdb/db/column_family.cc \ - rocksdb/db/column_family.h \ - rocksdb/db/column_family_test.cc \ - rocksdb/db/compaction.cc \ - rocksdb/db/compaction.h \ - rocksdb/db/compaction_picker.cc \ - rocksdb/db/compaction_picker.h \ - rocksdb/db/corruption_test.cc \ - rocksdb/db/db_bench.cc \ - rocksdb/db/db_filesnapshot.cc \ - rocksdb/db/db_impl.cc \ - rocksdb/db/db_impl.h \ - rocksdb/db/db_impl_debug.cc \ - rocksdb/db/db_impl_readonly.cc \ - rocksdb/db/db_impl_readonly.h \ - rocksdb/db/db_iter.cc \ - rocksdb/db/db_iter.h \ - rocksdb/db/db_stats_logger.cc \ - rocksdb/db/db_test.cc \ - rocksdb/db/dbformat.cc \ - rocksdb/db/dbformat.h \ - rocksdb/db/dbformat_test.cc \ - rocksdb/db/deletefile_test.cc \ - rocksdb/db/file_indexer.cc \ - rocksdb/db/file_indexer.h \ - rocksdb/db/file_indexer_test.cc \ - rocksdb/db/filename.cc \ - rocksdb/db/filename.h \ - rocksdb/db/filename_test.cc \ - rocksdb/db/internal_stats.cc \ - rocksdb/db/internal_stats.h \ - rocksdb/db/log_and_apply_bench.cc \ - rocksdb/db/log_format.h \ - rocksdb/db/log_reader.cc \ - rocksdb/db/log_reader.h \ - rocksdb/db/log_test.cc \ - rocksdb/db/log_writer.cc \ - rocksdb/db/log_writer.h \ - rocksdb/db/memtable.cc \ - rocksdb/db/memtable.h \ - rocksdb/db/memtable_list.cc \ - rocksdb/db/memtable_list.h \ - rocksdb/db/merge_context.h \ - rocksdb/db/merge_helper.cc \ - rocksdb/db/merge_helper.h \ - rocksdb/db/merge_operator.cc \ - rocksdb/db/merge_test.cc \ - rocksdb/db/perf_context_test.cc \ - rocksdb/db/plain_table_db_test.cc \ - rocksdb/db/prefix_test.cc \ - rocksdb/db/repair.cc \ - rocksdb/db/simple_table_db_test.cc \ - rocksdb/db/skiplist.h \ - rocksdb/db/skiplist_test.cc \ - rocksdb/db/snapshot.h \ - rocksdb/db/table_cache.cc \ - rocksdb/db/table_cache.h \ - rocksdb/db/table_properties_collector.cc \ - rocksdb/db/table_properties_collector.h \ - rocksdb/db/table_properties_collector_test.cc \ - rocksdb/db/tailing_iter.cc \ - rocksdb/db/tailing_iter.h \ - rocksdb/db/transaction_log_impl.cc \ - rocksdb/db/transaction_log_impl.h \ - rocksdb/db/version_edit.cc \ - rocksdb/db/version_edit.h \ - rocksdb/db/version_edit_test.cc \ - rocksdb/db/version_set.cc \ - rocksdb/db/version_set.h \ - rocksdb/db/version_set_test.cc \ - rocksdb/db/write_batch.cc \ - rocksdb/db/write_batch_internal.h \ - rocksdb/db/write_batch_test.cc \ - rocksdb/doc/doc.css \ - rocksdb/doc/index.html \ - rocksdb/doc/log_format.txt \ - rocksdb/doc/rockslogo.jpg \ - rocksdb/doc/rockslogo.png \ - rocksdb/hdfs/README \ - rocksdb/hdfs/env_hdfs.h \ - rocksdb/hdfs/hdfs.h \ - rocksdb/hdfs/libhdfs.a \ - rocksdb/helpers/memenv/memenv.cc \ - 
rocksdb/helpers/memenv/memenv_test.cc \ - rocksdb/include/rocksdb/c.h \ - rocksdb/include/rocksdb/cache.h \ - rocksdb/include/rocksdb/compaction_filter.h \ - rocksdb/include/rocksdb/comparator.h \ - rocksdb/include/rocksdb/db.h \ - rocksdb/include/rocksdb/env.h \ - rocksdb/include/rocksdb/filter_policy.h \ - rocksdb/include/rocksdb/flush_block_policy.h \ - rocksdb/include/rocksdb/iterator.h \ - rocksdb/include/rocksdb/ldb_tool.h \ - rocksdb/include/rocksdb/memtablerep.h \ - rocksdb/include/rocksdb/merge_operator.h \ - rocksdb/include/rocksdb/options.h \ - rocksdb/include/rocksdb/perf_context.h \ - rocksdb/include/rocksdb/slice.h \ - rocksdb/include/rocksdb/slice_transform.h \ - rocksdb/include/rocksdb/statistics.h \ - rocksdb/include/rocksdb/status.h \ - rocksdb/include/rocksdb/table.h \ - rocksdb/include/rocksdb/table_properties.h \ - rocksdb/include/rocksdb/transaction_log.h \ - rocksdb/include/rocksdb/types.h \ - rocksdb/include/rocksdb/universal_compaction.h \ - rocksdb/include/rocksdb/version.h \ - rocksdb/include/rocksdb/write_batch.h \ - rocksdb/include/utilities/backupable_db.h \ - rocksdb/include/utilities/db_ttl.h \ - rocksdb/include/utilities/geo_db.h \ - rocksdb/include/utilities/stackable_db.h \ - rocksdb/include/utilities/utility_db.h \ - rocksdb/java/Makefile \ - rocksdb/java/RocksDBSample.java \ - rocksdb/java/jdb_bench.sh \ - rocksdb/java/org/rocksdb/BackupableDB.java \ - rocksdb/java/org/rocksdb/BackupableDBOptions.java \ - rocksdb/java/org/rocksdb/BloomFilter.java \ - rocksdb/java/org/rocksdb/Filter.java \ - rocksdb/java/org/rocksdb/HashLinkedListMemTableConfig.java \ - rocksdb/java/org/rocksdb/HashSkipListMemTableConfig.java \ - rocksdb/java/org/rocksdb/HistogramData.java \ - rocksdb/java/org/rocksdb/HistogramType.java \ - rocksdb/java/org/rocksdb/Iterator.java \ - rocksdb/java/org/rocksdb/MemTableConfig.java \ - rocksdb/java/org/rocksdb/Options.java \ - rocksdb/java/org/rocksdb/PlainTableConfig.java \ - rocksdb/java/org/rocksdb/ReadOptions.java \ - rocksdb/java/org/rocksdb/RocksDB.java \ - rocksdb/java/org/rocksdb/RocksDBException.java \ - rocksdb/java/org/rocksdb/RocksObject.java \ - rocksdb/java/org/rocksdb/SkipListMemTableConfig.java \ - rocksdb/java/org/rocksdb/Statistics.java \ - rocksdb/java/org/rocksdb/TableFormatConfig.java \ - rocksdb/java/org/rocksdb/TickerType.java \ - rocksdb/java/org/rocksdb/VectorMemTableConfig.java \ - rocksdb/java/org/rocksdb/WriteBatch.java \ - rocksdb/java/org/rocksdb/WriteBatchTest.java \ - rocksdb/java/org/rocksdb/WriteOptions.java \ - rocksdb/java/org/rocksdb/benchmark/DbBenchmark.java \ - rocksdb/java/org/rocksdb/test/BackupableDBTest.java \ - rocksdb/java/org/rocksdb/test/OptionsTest.java \ - rocksdb/java/org/rocksdb/test/ReadOptionsTest.java \ - rocksdb/java/org/rocksdb/util/Environment.java \ - rocksdb/java/org/rocksdb/util/SizeUnit.java \ - rocksdb/java/rocksjni/backupablejni.cc \ - rocksdb/java/rocksjni/filter.cc \ - rocksdb/java/rocksjni/iterator.cc \ - rocksdb/java/rocksjni/memtablejni.cc \ - rocksdb/java/rocksjni/options.cc \ - rocksdb/java/rocksjni/portal.h \ - rocksdb/java/rocksjni/rocksjni.cc \ - rocksdb/java/rocksjni/statistics.cc \ - rocksdb/java/rocksjni/table.cc \ - rocksdb/java/rocksjni/write_batch.cc \ - rocksdb/linters/__phutil_library_init__.php \ - rocksdb/linters/__phutil_library_map__.php \ - rocksdb/linters/cpp_linter/ArcanistCpplintLinter.php \ - rocksdb/linters/cpp_linter/FbcodeCppLinter.php \ - rocksdb/linters/cpp_linter/PfffCppLinter.php \ - rocksdb/linters/cpp_linter/cpplint.py \ - 
rocksdb/linters/lint_engine/FacebookFbcodeLintEngine.php \ - rocksdb/m4/libtool.m4 \ - rocksdb/m4/ltoptions.m4 \ - rocksdb/m4/ltsugar.m4 \ - rocksdb/m4/ltversion.m4 \ - rocksdb/m4/lt~obsolete.m4 \ - rocksdb/port/README \ - rocksdb/port/atomic_pointer.h \ - rocksdb/port/likely.h \ - rocksdb/port/port.h \ - rocksdb/port/port_example.h \ - rocksdb/port/port_posix.cc \ - rocksdb/port/port_posix.h \ - rocksdb/port/stack_trace.cc \ - rocksdb/port/stack_trace.h \ - rocksdb/port/win/stdint.h \ - rocksdb/table/block.cc \ - rocksdb/table/block.h \ - rocksdb/table/block_based_table_builder.cc \ - rocksdb/table/block_based_table_builder.h \ - rocksdb/table/block_based_table_factory.cc \ - rocksdb/table/block_based_table_factory.h \ - rocksdb/table/block_based_table_reader.cc \ - rocksdb/table/block_based_table_reader.h \ - rocksdb/table/block_builder.cc \ - rocksdb/table/block_builder.h \ - rocksdb/table/block_hash_index.cc \ - rocksdb/table/block_hash_index.h \ - rocksdb/table/block_hash_index_test.cc \ - rocksdb/table/block_test.cc \ - rocksdb/table/filter_block.cc \ - rocksdb/table/filter_block.h \ - rocksdb/table/filter_block_test.cc \ - rocksdb/table/flush_block_policy.cc \ - rocksdb/table/format.cc \ - rocksdb/table/format.h \ - rocksdb/table/iter_heap.h \ - rocksdb/table/iterator.cc \ - rocksdb/table/iterator_wrapper.h \ - rocksdb/table/merger.cc \ - rocksdb/table/merger.h \ - rocksdb/table/meta_blocks.cc \ - rocksdb/table/meta_blocks.h \ - rocksdb/table/plain_table_builder.cc \ - rocksdb/table/plain_table_builder.h \ - rocksdb/table/plain_table_factory.cc \ - rocksdb/table/plain_table_factory.h \ - rocksdb/table/plain_table_reader.cc \ - rocksdb/table/plain_table_reader.h \ - rocksdb/table/table_builder.h \ - rocksdb/table/table_properties.cc \ - rocksdb/table/table_reader.h \ - rocksdb/table/table_reader_bench.cc \ - rocksdb/table/table_test.cc \ - rocksdb/table/two_level_iterator.cc \ - rocksdb/table/two_level_iterator.h \ - rocksdb/tools/auto_sanity_test.sh \ - rocksdb/tools/blob_store_bench.cc \ - rocksdb/tools/db_crashtest.py \ - rocksdb/tools/db_crashtest2.py \ - rocksdb/tools/db_repl_stress.cc \ - rocksdb/tools/db_sanity_test.cc \ - rocksdb/tools/db_stress.cc \ - rocksdb/tools/ldb.cc \ - rocksdb/tools/ldb_test.py \ - rocksdb/tools/reduce_levels_test.cc \ - rocksdb/tools/sst_dump.cc \ - rocksdb/util/arena.cc \ - rocksdb/util/arena.h \ - rocksdb/util/arena_test.cc \ - rocksdb/util/auto_roll_logger.cc \ - rocksdb/util/auto_roll_logger.h \ - rocksdb/util/auto_roll_logger_test.cc \ - rocksdb/util/autovector.h \ - rocksdb/util/autovector_test.cc \ - rocksdb/util/benchharness.cc \ - rocksdb/util/benchharness.h \ - rocksdb/util/benchharness_test.cc \ - rocksdb/util/blob_store.cc \ - rocksdb/util/blob_store.h \ - rocksdb/util/blob_store_test.cc \ - rocksdb/util/bloom.cc \ - rocksdb/util/bloom_test.cc \ - rocksdb/util/build_version.h \ - rocksdb/util/cache.cc \ - rocksdb/util/cache_test.cc \ - rocksdb/util/coding.cc \ - rocksdb/util/coding.h \ - rocksdb/util/coding_test.cc \ - rocksdb/util/comparator.cc \ - rocksdb/util/crc32c.cc \ - rocksdb/util/crc32c.h \ - rocksdb/util/crc32c_test.cc \ - rocksdb/util/dynamic_bloom.cc \ - rocksdb/util/dynamic_bloom.h \ - rocksdb/util/dynamic_bloom_test.cc \ - rocksdb/util/env.cc \ - rocksdb/util/env_hdfs.cc \ - rocksdb/util/env_posix.cc \ - rocksdb/util/env_test.cc \ - rocksdb/util/filelock_test.cc \ - rocksdb/util/filter_policy.cc \ - rocksdb/util/hash.cc \ - rocksdb/util/hash.h \ - rocksdb/util/hash_cuckoo_rep.cc \ - rocksdb/util/hash_cuckoo_rep.h \ - 
rocksdb/util/hash_linklist_rep.cc \ - rocksdb/util/hash_linklist_rep.h \ - rocksdb/util/hash_skiplist_rep.cc \ - rocksdb/util/hash_skiplist_rep.h \ - rocksdb/util/histogram.cc \ - rocksdb/util/histogram.h \ - rocksdb/util/histogram_test.cc \ - rocksdb/util/ldb_cmd.cc \ - rocksdb/util/ldb_cmd.h \ - rocksdb/util/ldb_cmd_execute_result.h \ - rocksdb/util/ldb_tool.cc \ - rocksdb/util/log_buffer.cc \ - rocksdb/util/log_buffer.h \ - rocksdb/util/log_write_bench.cc \ - rocksdb/util/logging.cc \ - rocksdb/util/logging.h \ - rocksdb/util/manual_compaction_test.cc \ - rocksdb/util/murmurhash.cc \ - rocksdb/util/murmurhash.h \ - rocksdb/util/mutexlock.h \ - rocksdb/util/options.cc \ - rocksdb/util/perf_context.cc \ - rocksdb/util/perf_context_imp.h \ - rocksdb/util/posix_logger.h \ - rocksdb/util/random.h \ - rocksdb/util/signal_test.cc \ - rocksdb/util/skiplistrep.cc \ - rocksdb/util/slice.cc \ - rocksdb/util/statistics.cc \ - rocksdb/util/statistics.h \ - rocksdb/util/stats_logger.h \ - rocksdb/util/status.cc \ - rocksdb/util/stl_wrappers.h \ - rocksdb/util/stop_watch.h \ - rocksdb/util/string_util.cc \ - rocksdb/util/string_util.h \ - rocksdb/util/sync_point.cc \ - rocksdb/util/sync_point.h \ - rocksdb/util/testharness.cc \ - rocksdb/util/testharness.h \ - rocksdb/util/testutil.cc \ - rocksdb/util/testutil.h \ - rocksdb/util/thread_local.cc \ - rocksdb/util/thread_local.h \ - rocksdb/util/thread_local_test.cc \ - rocksdb/util/vectorrep.cc \ - rocksdb/util/xxhash.cc \ - rocksdb/util/xxhash.h \ - rocksdb/utilities/backupable/backupable_db.cc \ - rocksdb/utilities/backupable/backupable_db_test.cc \ - rocksdb/utilities/geodb/geodb_impl.cc \ - rocksdb/utilities/geodb/geodb_impl.h \ - rocksdb/utilities/geodb/geodb_test.cc \ - rocksdb/utilities/merge_operators.h \ - rocksdb/utilities/merge_operators/put.cc \ - rocksdb/utilities/merge_operators/string_append/stringappend.cc \ - rocksdb/utilities/merge_operators/string_append/stringappend.h \ - rocksdb/utilities/merge_operators/string_append/stringappend2.cc \ - rocksdb/utilities/merge_operators/string_append/stringappend2.h \ - rocksdb/utilities/merge_operators/string_append/stringappend_test.cc \ - rocksdb/utilities/merge_operators/uint64add.cc \ - rocksdb/utilities/redis/README \ - rocksdb/utilities/redis/redis_list_exception.h \ - rocksdb/utilities/redis/redis_list_iterator.h \ - rocksdb/utilities/redis/redis_lists.cc \ - rocksdb/utilities/redis/redis_lists.h \ - rocksdb/utilities/redis/redis_lists_test.cc \ - rocksdb/utilities/ttl/db_ttl_impl.cc \ - rocksdb/utilities/ttl/db_ttl_impl.h \ - rocksdb/utilities/ttl/ttl_test.cc -endif - # work around old versions of automake that don't define $docdir # NOTE: this won't work on suse, where docdir is /usr/share/doc/packages/$package. 
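The roster deleted above existed so that `make dist` could ship the vendored rocksdb tree even when it was not being built; with rocksdb handled as a submodule, the build reduces to the conditional-recursion pattern sketched here (the WITH_SLIBROCKSDB conditional name is an assumption for illustration, not taken from this patch):

    if WITH_SLIBROCKSDB
    # recurse into the submodule: automake then builds it and lets the
    # subdirectory's own Makefile.am take care of distributing its
    # sources, so no hand-maintained EXTRA_DIST roster is needed here
    SUBDIRS += rocksdb
    endif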
@@ -603,49 +124,18 @@ shell_common_SCRIPTS = ceph_common.sh ceph_libexecdir = $(libexecdir)/ceph ceph_libexec_SCRIPTS = ceph-osd-prestart.sh -bash_completiondir = $(sysconfdir)/bash_completion.d -bash_completion_DATA = $(srcdir)/bash_completion/ceph \ - $(srcdir)/bash_completion/rados \ - $(srcdir)/bash_completion/rbd \ - $(srcdir)/bash_completion/radosgw-admin - -ceph_sbin_SCRIPTS = \ - ceph-disk \ - ceph-disk-prepare \ - ceph-disk-activate \ - ceph-disk-udev \ - ceph-create-keys - -bin_SCRIPTS += \ - ceph \ - ceph-run \ - ceph-rest-api \ - ceph-clsinfo \ - ceph-debugpack \ - ceph-rbdnamer \ - ceph-post-file \ - ceph-crush-location \ - rbd-replay-many - -BUILT_SOURCES += init-ceph - -shell_scripts += init-ceph - - - - # tests to actually run on "make check"; if you need extra, non-test, # executables built, you need to replace this with manual assignments # target by target TESTS = \ - $(check_PROGRAMS) \ + $(check_TESTPROGRAMS) \ $(check_SCRIPTS) -check-local: - $(top_srcdir)/qa/workunits/erasure-code/encode-decode-non-regression.sh - $(srcdir)/test/encoding/readable.sh ../ceph-object-corpus +check_SCRIPTS += \ + ../qa/workunits/erasure-code/encode-decode-non-regression.sh \ + test/encoding/readable.sh if WITH_LTTNG # TODO: If we're running the parallel test harness (the preferred harness), this should be AM_TESTS_ENVIRONMENT instead. @@ -660,8 +150,8 @@ core-daemons: ceph-mon ceph-osd ceph-mds radosgw admin-tools: monmaptool osdmaptool crushtool ceph-authtool base: core-daemons admin-tools \ cephfs ceph-syn ceph-conf \ - rados librados-config \ - init-ceph ceph_mon_store_converter ceph-post-file \ + rados radosgw-admin librados-config \ + init-ceph ceph-post-file \ ceph @@ -669,19 +159,17 @@ base: core-daemons admin-tools \ FORCE: .git_version: FORCE - $(srcdir)/check_version $(srcdir)/.git_version + $(srcdir)/make_version -g $(srcdir)/.git_version # if NO_VERSION is set, only generate a new ceph_ver.h if there currently # is none, and call "make_version -n" to fill it with a fixed string. # Otherwise, set it from the contents of .git_version. -ceph_ver.h: .git_version +ceph_ver.h: .git_version FORCE if [ -n "$$NO_VERSION" ] ; then \ - if [ ! -f ./ceph_ver.h ] ; then \ - $(srcdir)/make_version -n ./ceph_ver.h ; \ - fi; \ - else \ - $(srcdir)/make_version $(srcdir)/.git_version ./ceph_ver.h ; \ + $(srcdir)/make_version -g $(srcdir)/.git_version -c $(srcdir)/ceph_ver.h -n ; \ + else \ + $(srcdir)/make_version -g $(srcdir)/.git_version -c $(srcdir)/ceph_ver.h ; \ fi ceph_ver.c: ./ceph_ver.h @@ -692,29 +180,14 @@ sample.fetch_config: fetch_config cp -f $(srcdir)/fetch_config ./sample.fetch_config dist-hook: - $(srcdir)/check_version $(srcdir)/.git_version + $(srcdir)/make_version -g $(srcdir)/.git_version CLEANFILES += ceph_ver.h sample.fetch_config -# assemble Python script with global version variables -# NB: depends on format of ceph_ver.h - -ceph: ceph.in ./ceph_ver.h Makefile - rm -f $@ $@.tmp - echo "#!/usr/bin/env python" >$@.tmp - grep "#define CEPH_GIT_NICE_VER" ./ceph_ver.h | \ - sed -e 's/#define \(.*VER\) /\1=/' >>$@.tmp - grep "#define CEPH_GIT_VER" ./ceph_ver.h | \ - sed -e 's/#define \(.*VER\) /\1=/' -e 's/=\(.*\)$$/="\1"/' >>$@.tmp - cat $(srcdir)/$@.in >>$@.tmp - chmod a+x $@.tmp - chmod a-w $@.tmp - mv $@.tmp $@ - # cleaning -clean-local: +clean-local:: rm -f *.so find . 
-name '*.gcno' -o -name '*.gcda' -o -name '*.lcov' -o -name "*.o" -o -name "*.lo" | xargs rm -f rm -f ceph java/java/com/ceph/crush/Bucket.class @@ -722,12 +195,7 @@ clean-local: # pybind -python_PYTHON = pybind/rados.py \ - pybind/rbd.py \ - pybind/cephfs.py \ - pybind/ceph_argparse.py \ - pybind/ceph_rest_api.py - +python_PYTHON = # everything else we want to include in a 'make dist' @@ -748,15 +216,6 @@ noinst_HEADERS += \ # coverage -shell_scripts += ceph-coverage -bin_SCRIPTS += ceph-coverage - - -if ENABLE_COVERAGE -COV_DIR = $(DESTDIR)$(libdir)/ceph/coverage -COV_FILES = $(srcdir)/*.gcno -COV_LIB_FILES = $(srcdir)/.libs/*.gcno -endif install-coverage: if ENABLE_COVERAGE @@ -778,12 +237,12 @@ if ENABLE_COVERAGE -test/coverage.sh -d $(srcdir) -o check-coverage make check endif -install-data-local: install-coverage +install-data-local:: install-coverage -mkdir -p $(DESTDIR)$(sysconfdir)/ceph -mkdir -p $(DESTDIR)$(localstatedir)/log/ceph -mkdir -p $(DESTDIR)$(localstatedir)/lib/ceph/tmp -uninstall-local: uninstall-coverage +uninstall-local:: uninstall-coverage -rmdir -p $(DESTDIR)$(sysconfdir)/ceph/ -rmdir -p $(DESTDIR)$(localstatedir)/log/ceph -rmdir -p $(DESTDIR)$(localstatedir)/lib/ceph/tmp @@ -804,3 +263,12 @@ project.tgz: clean coverity-submit: scp project.tgz ceph.com:/home/ceph_site/ceph.com/coverity/`git describe`.tgz curl --data "project=ceph&password=`cat ~/coverity.build.pass.txt`&email=sage@newdream.net&url=http://ceph.com/coverity/`git describe`.tgz" http://scan5.coverity.com/cgi-bin/submit_build.py + +if ENABLE_CLIENT +include Makefile-client.am +endif + +if ENABLE_SERVER +include Makefile-server.am +endif + diff --git a/src/arch/arm.c b/src/arch/arm.c index 93d079ade965a..3560eb7b6738f 100644 --- a/src/arch/arm.c +++ b/src/arch/arm.c @@ -2,6 +2,7 @@ /* flags we export */ int ceph_arch_neon = 0; +int ceph_arch_aarch64_crc32 = 0; #include @@ -47,6 +48,11 @@ int ceph_arch_arm_probe(void) ceph_arch_neon = (get_hwcap() & HWCAP_NEON) == HWCAP_NEON; #elif __aarch64__ && __linux__ ceph_arch_neon = (get_hwcap() & HWCAP_ASIMD) == HWCAP_ASIMD; +# ifdef HWCAP_CRC32 + ceph_arch_aarch64_crc32 = (get_hwcap() & HWCAP_CRC32) == HWCAP_CRC32; +# else + ceph_arch_aarch64_crc32 = false; // sorry! 
+# endif #else if (0) get_hwcap(); // make compiler shut up diff --git a/src/arch/arm.h b/src/arch/arm.h index f61343833d20f..1659b2e94dec4 100644 --- a/src/arch/arm.h +++ b/src/arch/arm.h @@ -6,6 +6,7 @@ extern "C" { #endif extern int ceph_arch_neon; /* true if we have ARM NEON or ASIMD abilities */ +extern int ceph_arch_aarch64_crc32; /* true if we have AArch64 CRC32/CRC32C abilities */ extern int ceph_arch_arm_probe(void); diff --git a/src/auth/Auth.h b/src/auth/Auth.h index 2d89a033e5ce8..12d49094b437b 100644 --- a/src/auth/Auth.h +++ b/src/auth/Auth.h @@ -136,7 +136,7 @@ struct AuthAuthorizer { bufferlist bl; CryptoKey session_key; - AuthAuthorizer(__u32 p) : protocol(p) {} + explicit AuthAuthorizer(__u32 p) : protocol(p) {} virtual ~AuthAuthorizer() {} virtual bool verify_reply(bufferlist::iterator& reply) = 0; }; diff --git a/src/auth/Crypto.cc b/src/auth/Crypto.cc index e401c9605b18b..24c4bd0d5617d 100644 --- a/src/auth/Crypto.cc +++ b/src/auth/Crypto.cc @@ -17,7 +17,7 @@ # include # include # include -#elif USE_NSS +#elif defined(USE_NSS) # include # include # include @@ -62,160 +62,273 @@ uint64_t get_random(uint64_t min_val, uint64_t max_val) return r; } + // --------------------------------------------------- -int CryptoNone::create(bufferptr& secret) -{ - return 0; -} +class CryptoNoneKeyHandler : public CryptoKeyHandler { +public: + int encrypt(const bufferlist& in, + bufferlist& out, std::string *error) const { + out = in; + return 0; + } + int decrypt(const bufferlist& in, + bufferlist& out, std::string *error) const { + out = in; + return 0; + } +}; + +class CryptoNone : public CryptoHandler { +public: + CryptoNone() { } + ~CryptoNone() {} + int get_type() const { + return CEPH_CRYPTO_NONE; + } + int create(bufferptr& secret) { + return 0; + } + int validate_secret(const bufferptr& secret) { + return 0; + } + CryptoKeyHandler *get_key_handler(const bufferptr& secret, string& error) { + return new CryptoNoneKeyHandler; + } +}; -int CryptoNone::validate_secret(bufferptr& secret) -{ - return 0; -} -void CryptoNone::encrypt(const bufferptr& secret, const bufferlist& in, - bufferlist& out, std::string &error) const -{ - out = in; -} +// --------------------------------------------------- -void CryptoNone::decrypt(const bufferptr& secret, const bufferlist& in, - bufferlist& out, std::string &error) const -{ - out = in; -} +class CryptoAES : public CryptoHandler { +public: + CryptoAES() { } + ~CryptoAES() {} + int get_type() const { + return CEPH_CRYPTO_AES; + } + int create(bufferptr& secret); + int validate_secret(const bufferptr& secret); + CryptoKeyHandler *get_key_handler(const bufferptr& secret, string& error); +}; -// --------------------------------------------------- #ifdef USE_CRYPTOPP # define AES_KEY_LEN ((size_t)CryptoPP::AES::DEFAULT_KEYLENGTH) # define AES_BLOCK_LEN ((size_t)CryptoPP::AES::BLOCKSIZE) -#elif USE_NSS -// when we say AES, we mean AES-128 -# define AES_KEY_LEN 16 -# define AES_BLOCK_LEN 16 -static void nss_aes_operation(CK_ATTRIBUTE_TYPE op, const bufferptr& secret, - const bufferlist& in, bufferlist& out, std::string &error) -{ - const CK_MECHANISM_TYPE mechanism = CKM_AES_CBC_PAD; +class CryptoAESKeyHandler : public CryptoKeyHandler { +public: + CryptoPP::AES::Encryption *enc_key; + CryptoPP::AES::Decryption *dec_key; + + CryptoAESKeyHandler() + : enc_key(NULL), + dec_key(NULL) {} + ~CryptoAESKeyHandler() { + delete enc_key; + delete dec_key; + } - // sample source said this has to be at least size of input + 8, - // but i see 15 still fail with 
SEC_ERROR_OUTPUT_LEN - bufferptr out_tmp(in.length()+16); + int init(const bufferptr& s, ostringstream& err) { + secret = s; - PK11SlotInfo *slot; + enc_key = new CryptoPP::AES::Encryption( + (byte*)secret.c_str(), CryptoPP::AES::DEFAULT_KEYLENGTH); + dec_key = new CryptoPP::AES::Decryption( + (byte*)secret.c_str(), CryptoPP::AES::DEFAULT_KEYLENGTH); - slot = PK11_GetBestSlot(mechanism, NULL); - if (!slot) { - ostringstream oss; - oss << "cannot find NSS slot to use: " << PR_GetError(); - error = oss.str(); - goto err; + return 0; } - SECItem keyItem; - - keyItem.type = siBuffer; - keyItem.data = (unsigned char*)secret.c_str(); - keyItem.len = secret.length(); - - PK11SymKey *key; + int encrypt(const bufferlist& in, + bufferlist& out, std::string *error) const { + string ciphertext; + CryptoPP::StringSink *sink = new CryptoPP::StringSink(ciphertext); + CryptoPP::CBC_Mode_ExternalCipher::Encryption cbc( + *enc_key, (const byte*)CEPH_AES_IV); + CryptoPP::StreamTransformationFilter stfEncryptor(cbc, sink); - key = PK11_ImportSymKey(slot, mechanism, PK11_OriginUnwrap, CKA_ENCRYPT, - &keyItem, NULL); - if (!key) { - ostringstream oss; - oss << "cannot convert AES key for NSS: " << PR_GetError(); - error = oss.str(); - goto err_slot; + for (std::list::const_iterator it = in.buffers().begin(); + it != in.buffers().end(); ++it) { + const unsigned char *in_buf = (const unsigned char *)it->c_str(); + stfEncryptor.Put(in_buf, it->length()); + } + try { + stfEncryptor.MessageEnd(); + } catch (CryptoPP::Exception& e) { + if (error) { + ostringstream oss; + oss << "encryptor.MessageEnd::Exception: " << e.GetWhat(); + *error = oss.str(); + } + return -1; + } + out.append((const char *)ciphertext.c_str(), ciphertext.length()); + return 0; } - SECItem ivItem; - - ivItem.type = siBuffer; - // losing constness due to SECItem.data; IV should never be - // modified, regardless - ivItem.data = (unsigned char*)CEPH_AES_IV; - ivItem.len = sizeof(CEPH_AES_IV); + int decrypt(const bufferlist& in, + bufferlist& out, std::string *error) const { + string decryptedtext; + CryptoPP::StringSink *sink = new CryptoPP::StringSink(decryptedtext); + CryptoPP::CBC_Mode_ExternalCipher::Decryption cbc( + *dec_key, (const byte*)CEPH_AES_IV ); + CryptoPP::StreamTransformationFilter stfDecryptor(cbc, sink); + for (std::list::const_iterator it = in.buffers().begin(); + it != in.buffers().end(); ++it) { + const unsigned char *in_buf = (const unsigned char *)it->c_str(); + stfDecryptor.Put(in_buf, it->length()); + } - SECItem *param; + try { + stfDecryptor.MessageEnd(); + } catch (CryptoPP::Exception& e) { + if (error) { + ostringstream oss; + oss << "decryptor.MessageEnd::Exception: " << e.GetWhat(); + *error = oss.str(); + } + return -1; + } - param = PK11_ParamFromIV(mechanism, &ivItem); - if (!param) { - ostringstream oss; - oss << "cannot set NSS IV param: " << PR_GetError(); - error = oss.str(); - goto err_key; + out.append((const char *)decryptedtext.c_str(), decryptedtext.length()); + return 0; } +}; - PK11Context *ctx; +#elif defined(USE_NSS) +// when we say AES, we mean AES-128 +# define AES_KEY_LEN 16 +# define AES_BLOCK_LEN 16 - ctx = PK11_CreateContextBySymKey(mechanism, op, key, param); - if (!ctx) { - ostringstream oss; - oss << "cannot create NSS context: " << PR_GetError(); - error = oss.str(); - goto err_param; - } +static int nss_aes_operation(CK_ATTRIBUTE_TYPE op, + CK_MECHANISM_TYPE mechanism, + PK11SymKey *key, + SECItem *param, + const bufferlist& in, bufferlist& out, + std::string *error) +{ + // sample source 
said this has to be at least size of input + 8, + // but i see 15 still fail with SEC_ERROR_OUTPUT_LEN + bufferptr out_tmp(in.length()+16); + bufferlist incopy; SECStatus ret; int written; - // in is const, and PK11_CipherOp is not; C++ makes this hard to cheat, - // so just copy it to a temp buffer, at least for now - unsigned in_len; unsigned char *in_buf; - in_len = in.length(); - in_buf = (unsigned char*)malloc(in_len); - if (!in_buf) - throw std::bad_alloc(); - in.copy(0, in_len, (char*)in_buf); - ret = PK11_CipherOp(ctx, (unsigned char*)out_tmp.c_str(), &written, out_tmp.length(), + + PK11Context *ectx; + ectx = PK11_CreateContextBySymKey(mechanism, op, key, param); + assert(ectx); + + incopy = in; // it's a shallow copy! + in_buf = (unsigned char*)incopy.c_str(); + ret = PK11_CipherOp(ectx, + (unsigned char*)out_tmp.c_str(), &written, out_tmp.length(), in_buf, in.length()); - free(in_buf); if (ret != SECSuccess) { - ostringstream oss; - oss << "NSS AES failed: " << PR_GetError(); - error = oss.str(); - goto err_op; + PK11_DestroyContext(ectx, PR_TRUE); + if (error) { + ostringstream oss; + oss << "NSS AES failed: " << PR_GetError(); + *error = oss.str(); + } + return -1; } unsigned int written2; - ret = PK11_DigestFinal(ctx, (unsigned char*)out_tmp.c_str()+written, &written2, + ret = PK11_DigestFinal(ectx, + (unsigned char*)out_tmp.c_str()+written, &written2, out_tmp.length()-written); + PK11_DestroyContext(ectx, PR_TRUE); if (ret != SECSuccess) { - ostringstream oss; - oss << "NSS AES final round failed: " << PR_GetError(); - error = oss.str(); - goto err_op; + if (error) { + ostringstream oss; + oss << "NSS AES final round failed: " << PR_GetError(); + *error = oss.str(); + } + return -1; } out_tmp.set_length(written + written2); out.append(out_tmp); - - PK11_DestroyContext(ctx, PR_TRUE); - SECITEM_FreeItem(param, PR_TRUE); - PK11_FreeSymKey(key); - PK11_FreeSlot(slot); - return; - - err_op: - PK11_DestroyContext(ctx, PR_TRUE); - err_param: - SECITEM_FreeItem(param, PR_TRUE); - err_key: - PK11_FreeSymKey(key); - err_slot: - PK11_FreeSlot(slot); - err: - ; + return 0; } +class CryptoAESKeyHandler : public CryptoKeyHandler { + CK_MECHANISM_TYPE mechanism; + PK11SlotInfo *slot; + PK11SymKey *key; + SECItem *param; + +public: + CryptoAESKeyHandler() + : mechanism(CKM_AES_CBC_PAD), + slot(NULL), + key(NULL), + param(NULL) {} + ~CryptoAESKeyHandler() { + SECITEM_FreeItem(param, PR_TRUE); + PK11_FreeSymKey(key); + PK11_FreeSlot(slot); + } + + int init(const bufferptr& s, ostringstream& err) { + secret = s; + + slot = PK11_GetBestSlot(mechanism, NULL); + if (!slot) { + err << "cannot find NSS slot to use: " << PR_GetError(); + return -1; + } + + SECItem keyItem; + keyItem.type = siBuffer; + keyItem.data = (unsigned char*)secret.c_str(); + keyItem.len = secret.length(); + key = PK11_ImportSymKey(slot, mechanism, PK11_OriginUnwrap, CKA_ENCRYPT, + &keyItem, NULL); + if (!key) { + err << "cannot convert AES key for NSS: " << PR_GetError(); + return -1; + } + + SECItem ivItem; + ivItem.type = siBuffer; + // losing constness due to SECItem.data; IV should never be + // modified, regardless + ivItem.data = (unsigned char*)CEPH_AES_IV; + ivItem.len = sizeof(CEPH_AES_IV); + + param = PK11_ParamFromIV(mechanism, &ivItem); + if (!param) { + err << "cannot set NSS IV param: " << PR_GetError(); + return -1; + } + + return 0; + } + + int encrypt(const bufferlist& in, + bufferlist& out, std::string *error) const { + return nss_aes_operation(CKA_ENCRYPT, mechanism,
key, param, in, out, error); + } + int decrypt(const bufferlist& in, + bufferlist& out, std::string *error) const { + return nss_aes_operation(CKA_DECRYPT, mechanism, key, param, in, out, error); + } +}; + #else # error "No supported crypto implementation found." #endif + + +// ------------------------------------------------------------ + int CryptoAES::create(bufferptr& secret) { bufferlist bl; @@ -226,7 +339,7 @@ int CryptoAES::create(bufferptr& secret) return 0; } -int CryptoAES::validate_secret(bufferptr& secret) +int CryptoAES::validate_secret(const bufferptr& secret) { if (secret.length() < (size_t)AES_KEY_LEN) { return -EINVAL; @@ -235,140 +348,103 @@ int CryptoAES::validate_secret(bufferptr& secret) return 0; } -void CryptoAES::encrypt(const bufferptr& secret, const bufferlist& in, bufferlist& out, - std::string &error) const +CryptoKeyHandler *CryptoAES::get_key_handler(const bufferptr& secret, + string& error) { - if (secret.length() < AES_KEY_LEN) { - error = "key is too short"; - return; - } -#ifdef USE_CRYPTOPP - { - const unsigned char *key = (const unsigned char *)secret.c_str(); - - string ciphertext; - CryptoPP::AES::Encryption aesEncryption(key, CryptoPP::AES::DEFAULT_KEYLENGTH); - CryptoPP::CBC_Mode_ExternalCipher::Encryption cbcEncryption( aesEncryption, (const byte*)CEPH_AES_IV ); - CryptoPP::StringSink *sink = new CryptoPP::StringSink(ciphertext); - CryptoPP::StreamTransformationFilter stfEncryptor(cbcEncryption, sink); - - for (std::list::const_iterator it = in.buffers().begin(); - it != in.buffers().end(); ++it) { - const unsigned char *in_buf = (const unsigned char *)it->c_str(); - stfEncryptor.Put(in_buf, it->length()); - } - try { - stfEncryptor.MessageEnd(); - } catch (CryptoPP::Exception& e) { - ostringstream oss; - oss << "encryptor.MessageEnd::Exception: " << e.GetWhat(); - error = oss.str(); - return; - } - out.append((const char *)ciphertext.c_str(), ciphertext.length()); + CryptoAESKeyHandler *ckh = new CryptoAESKeyHandler; + ostringstream oss; + if (ckh->init(secret, oss) < 0) { + error = oss.str(); + return NULL; } -#elif USE_NSS - nss_aes_operation(CKA_ENCRYPT, secret, in, out, error); -#else -# error "No supported crypto implementation found." -#endif + return ckh; } -void CryptoAES::decrypt(const bufferptr& secret, const bufferlist& in, - bufferlist& out, std::string &error) const -{ -#ifdef USE_CRYPTOPP - const unsigned char *key = (const unsigned char *)secret.c_str(); - - CryptoPP::AES::Decryption aesDecryption(key, CryptoPP::AES::DEFAULT_KEYLENGTH); - CryptoPP::CBC_Mode_ExternalCipher::Decryption cbcDecryption( aesDecryption, (const byte*)CEPH_AES_IV ); - string decryptedtext; - CryptoPP::StringSink *sink = new CryptoPP::StringSink(decryptedtext); - CryptoPP::StreamTransformationFilter stfDecryptor(cbcDecryption, sink); - for (std::list::const_iterator it = in.buffers().begin(); - it != in.buffers().end(); ++it) { - const unsigned char *in_buf = (const unsigned char *)it->c_str(); - stfDecryptor.Put(in_buf, it->length()); - } - try { - stfDecryptor.MessageEnd(); - } catch (CryptoPP::Exception& e) { - ostringstream oss; - oss << "decryptor.MessageEnd::Exception: " << e.GetWhat(); - error = oss.str(); - return; - } - out.append((const char *)decryptedtext.c_str(), decryptedtext.length()); -#elif USE_NSS - nss_aes_operation(CKA_DECRYPT, secret, in, out, error); -#else -# error "No supported crypto implementation found." 
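The heart of this refactor is visible in get_key_handler() above: validation and the per-key setup (slot, imported key, IV param) happen once, and the returned CryptoKeyHandler is then reused for every encrypt/decrypt call. A compilable toy sketch of the same shape; the XOR cipher is a placeholder so the sketch needs neither NSS nor Crypto++, not the AES the patch uses:

    #include <iostream>
    #include <memory>
    #include <string>

    struct KeyHandler {                        // cf. CryptoKeyHandler
      virtual ~KeyHandler() {}
      virtual int encrypt(const std::string& in, std::string& out,
                          std::string *error) const = 0;
      virtual int decrypt(const std::string& in, std::string& out,
                          std::string *error) const = 0;
    };

    struct XorKeyHandler : public KeyHandler { // stand-in for CryptoAESKeyHandler
      std::string secret;
      explicit XorKeyHandler(const std::string& s) : secret(s) {}
      int encrypt(const std::string& in, std::string& out,
                  std::string *error) const {
        out = in;
        for (size_t i = 0; i < out.size(); ++i)
          out[i] ^= secret[i % secret.size()];
        return 0;
      }
      int decrypt(const std::string& in, std::string& out,
                  std::string *error) const {
        return encrypt(in, out, error);        // XOR is its own inverse
      }
    };

    class Key {                                // cf. CryptoKey
      std::shared_ptr<KeyHandler> ckh;         // built once, then cached
    public:
      int set_secret(const std::string& s) {
        if (s.empty())
          return -1;                           // cf. validate_secret()
        ckh.reset(new XorKeyHandler(s));       // cf. get_key_handler()
        return 0;
      }
      // assumes set_secret() succeeded; no per-call handler lookup
      int encrypt(const std::string& in, std::string& out) const {
        return ckh->encrypt(in, out, nullptr);
      }
      int decrypt(const std::string& in, std::string& out) const {
        return ckh->decrypt(in, out, nullptr);
      }
    };

    int main() {
      Key k;
      k.set_secret("sixteen byte key");
      std::string c, p;
      k.encrypt("hello", c);
      k.decrypt(c, p);
      std::cout << p << std::endl;             // prints "hello"
    }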
-#endif -} +// -- // --------------------------------------------------- -int CryptoKey::set_secret(CephContext *cct, int type, bufferptr& s) -{ - this->type = type; - created = ceph_clock_now(cct); - - CryptoHandler *h = cct->get_crypto_handler(type); - if (!h) { - lderr(cct) << "ERROR: cct->get_crypto_handler(type=" << type << ") returned NULL" << dendl; - return -EOPNOTSUPP; - } - int ret = h->validate_secret(s); - if (ret < 0) - return ret; +void CryptoKey::encode(bufferlist& bl) const +{ + ::encode(type, bl); + ::encode(created, bl); + __u16 len = secret.length(); + ::encode(len, bl); + bl.append(secret); +} - secret = s; +void CryptoKey::decode(bufferlist::iterator& bl) +{ + ::decode(type, bl); + ::decode(created, bl); + __u16 len; + ::decode(len, bl); + bufferptr tmp; + bl.copy(len, tmp); + if (_set_secret(type, tmp) < 0) + throw buffer::malformed_input("malformed secret"); +} +int CryptoKey::set_secret(int type, const bufferptr& s, utime_t c) +{ + int r = _set_secret(type, s); + if (r < 0) + return r; + this->created = c; return 0; } -int CryptoKey::create(CephContext *cct, int t) +int CryptoKey::_set_secret(int t, const bufferptr& s) { - type = t; - created = ceph_clock_now(cct); - - CryptoHandler *h = cct->get_crypto_handler(type); - if (!h) { - lderr(cct) << "ERROR: cct->get_crypto_handler(type=" << type << ") returned NULL" << dendl; - return -EOPNOTSUPP; + if (s.length() == 0) { + secret = s; + ckh.reset(); + return 0; } - return h->create(secret); -} -void CryptoKey::encrypt(CephContext *cct, const bufferlist& in, bufferlist& out, std::string &error) const -{ - if (!ch || ch->get_type() != type) { - ch = cct->get_crypto_handler(type); - if (!ch) { - ostringstream oss; - oss << "CryptoKey::encrypt: key type " << type << " not supported."; - return; + CryptoHandler *ch = CryptoHandler::create(t); + if (ch) { + int ret = ch->validate_secret(s); + if (ret < 0) { + delete ch; + return ret; + } + string error; + ckh.reset(ch->get_key_handler(s, error)); + delete ch; + if (error.length()) { + return -EIO; } } - ch->encrypt(this->secret, in, out, error); + type = t; + secret = s; + return 0; } -void CryptoKey::decrypt(CephContext *cct, const bufferlist& in, bufferlist& out, std::string &error) const +int CryptoKey::create(CephContext *cct, int t) { - if (!ch || ch->get_type() != type) { - ch = cct->get_crypto_handler(type); - if (!ch) { - ostringstream oss; - oss << "CryptoKey::decrypt: key type " << type << " not supported."; - return; - } + CryptoHandler *ch = CryptoHandler::create(t); + if (!ch) { + if (cct) + lderr(cct) << "ERROR: cct->get_crypto_handler(type=" << t << ") returned NULL" << dendl; + return -EOPNOTSUPP; } - ch->decrypt(this->secret, in, out, error); + bufferptr s; + int r = ch->create(s); + delete ch; + if (r < 0) + return r; + + r = _set_secret(t, s); + if (r < 0) + return r; + created = ceph_clock_now(cct); + return r; } void CryptoKey::print(std::ostream &out) const @@ -396,3 +472,18 @@ void CryptoKey::encode_plaintext(bufferlist &bl) { bl.append(encode_base64()); } + + +// ------------------ + +CryptoHandler *CryptoHandler::create(int type) +{ + switch (type) { + case CEPH_CRYPTO_NONE: + return new CryptoNone; + case CEPH_CRYPTO_AES: + return new CryptoAES; + default: + return NULL; + } +} diff --git a/src/auth/Crypto.h b/src/auth/Crypto.h index c8112220aa6a3..3bfc5aabd17a8 100644 --- a/src/auth/Crypto.h +++ b/src/auth/Crypto.h @@ -17,6 +17,7 @@ #include "include/types.h" #include "include/utime.h" +#include "include/memory.h" #include "common/Formatter.h" 
#include "include/buffer.h" @@ -25,6 +26,22 @@ class CephContext; class CryptoHandler; +class CryptoKeyContext; + +/* + * some per-key context that is specific to a particular crypto backend + */ +class CryptoKeyHandler { +public: + bufferptr secret; + + virtual ~CryptoKeyHandler() {} + + virtual int encrypt(const bufferlist& in, + bufferlist& out, std::string *error) const = 0; + virtual int decrypt(const bufferlist& in, + bufferlist& out, std::string *error) const = 0; +}; /* * match encoding of struct ceph_secret @@ -33,38 +50,32 @@ class CryptoKey { protected: __u16 type; utime_t created; - bufferptr secret; + bufferptr secret; // must set this via set_secret()! - // cache a pointer to the handler, so we don't have to look it up - // for each crypto operation - mutable CryptoHandler *ch; + // cache a pointer to the implementation-specific key handler, so we + // don't have to create it for every crypto operation. + mutable ceph::shared_ptr ckh; + + int _set_secret(int type, const bufferptr& s); public: - CryptoKey() : type(0), ch(NULL) { } - CryptoKey(int t, utime_t c, bufferptr& s) : type(t), created(c), secret(s), ch(NULL) { } - - void encode(bufferlist& bl) const { - ::encode(type, bl); - ::encode(created, bl); - __u16 len = secret.length(); - ::encode(len, bl); - bl.append(secret); + CryptoKey() : type(0) { } + CryptoKey(int t, utime_t c, bufferptr& s) + : created(c) { + _set_secret(t, s); } - void decode(bufferlist::iterator& bl) { - ::decode(type, bl); - ::decode(created, bl); - __u16 len; - ::decode(len, bl); - bl.copy(len, secret); - secret.c_str(); // make sure it's a single buffer! + ~CryptoKey() { } + void encode(bufferlist& bl) const; + void decode(bufferlist::iterator& bl); + int get_type() const { return type; } utime_t get_created() const { return created; } void print(std::ostream& out) const; - int set_secret(CephContext *cct, int type, bufferptr& s); - bufferptr& get_secret() { return secret; } + int set_secret(int type, const bufferptr& s, utime_t created); + const bufferptr& get_secret() { return secret; } const bufferptr& get_secret() const { return secret; } void encode_base64(string& s) const { @@ -94,8 +105,14 @@ class CryptoKey { // -- int create(CephContext *cct, int type); - void encrypt(CephContext *cct, const bufferlist& in, bufferlist& out, std::string &error) const; - void decrypt(CephContext *cct, const bufferlist& in, bufferlist& out, std::string &error) const; + int encrypt(CephContext *cct, const bufferlist& in, bufferlist& out, + std::string *error) const { + return ckh->encrypt(in, out, error); + } + int decrypt(CephContext *cct, const bufferlist& in, bufferlist& out, + std::string *error) const { + return ckh->decrypt(in, out, error); + } void to_str(std::string& s) const; }; @@ -119,44 +136,14 @@ class CryptoHandler { virtual ~CryptoHandler() {} virtual int get_type() const = 0; virtual int create(bufferptr& secret) = 0; - virtual int validate_secret(bufferptr& secret) = 0; - virtual void encrypt(const bufferptr& secret, const bufferlist& in, - bufferlist& out, std::string &error) const = 0; - virtual void decrypt(const bufferptr& secret, const bufferlist& in, - bufferlist& out, std::string &error) const = 0; + virtual int validate_secret(const bufferptr& secret) = 0; + virtual CryptoKeyHandler *get_key_handler(const bufferptr& secret, + string& error) = 0; + + static CryptoHandler *create(int type); }; extern int get_random_bytes(char *buf, int len); extern uint64_t get_random(uint64_t min_val, uint64_t max_val); -class CryptoNone : public 
CryptoHandler { -public: - CryptoNone() { } - ~CryptoNone() {} - int get_type() const { - return CEPH_CRYPTO_NONE; - } - int create(bufferptr& secret); - int validate_secret(bufferptr& secret); - void encrypt(const bufferptr& secret, const bufferlist& in, - bufferlist& out, std::string &error) const; - void decrypt(const bufferptr& secret, const bufferlist& in, - bufferlist& out, std::string &error) const; -}; - -class CryptoAES : public CryptoHandler { -public: - CryptoAES() { } - ~CryptoAES() {} - int get_type() const { - return CEPH_CRYPTO_AES; - } - int create(bufferptr& secret); - int validate_secret(bufferptr& secret); - void encrypt(const bufferptr& secret, const bufferlist& in, - bufferlist& out, std::string &error) const; - void decrypt(const bufferptr& secret, const bufferlist& in, - bufferlist& out, std::string &error) const; -}; - #endif diff --git a/src/auth/KeyRing.cc b/src/auth/KeyRing.cc index 7aeb9e833a06f..f5793239e8534 100644 --- a/src/auth/KeyRing.cc +++ b/src/auth/KeyRing.cc @@ -33,21 +33,22 @@ #undef dout_prefix #define dout_prefix *_dout << "auth: " -using std::auto_ptr; using namespace std; int KeyRing::from_ceph_context(CephContext *cct) { const md_config_t *conf = cct->_conf; - - int ret = -ENOENT; string filename; - if (ceph_resolve_file_search(conf->keyring, filename)) { + int ret = ceph_resolve_file_search(conf->keyring, filename); + if (!ret) { ret = load(cct, filename); if (ret < 0) lderr(cct) << "failed to load " << filename << ": " << cpp_strerror(ret) << dendl; + } else { + lderr(cct) << "unable to find a keyring on " << conf->keyring + << ": " << cpp_strerror(ret) << dendl; } if (!conf->key.empty()) { diff --git a/src/auth/cephx/CephxClientHandler.cc b/src/auth/cephx/CephxClientHandler.cc index b6d3501ecd572..ff32a425757b6 100644 --- a/src/auth/cephx/CephxClientHandler.cc +++ b/src/auth/cephx/CephxClientHandler.cc @@ -40,7 +40,11 @@ int CephxClientHandler::build_request(bufferlist& bl) const ::encode(header, bl); CryptoKey secret; - keyring->get_secret(cct->_conf->name, secret); + const bool got = keyring->get_secret(cct->_conf->name, secret); + if (!got) { + ldout(cct, 20) << "no secret found for entity: " << cct->_conf->name << dendl; + return -ENOENT; + } CephXAuthenticate req; get_random_bytes((char *)&req.client_challenge, sizeof(req.client_challenge)); @@ -113,7 +117,11 @@ int CephxClientHandler::handle_response(int ret, bufferlist::iterator& indata) { ldout(cct, 10) << " get_auth_session_key" << dendl; CryptoKey secret; - keyring->get_secret(cct->_conf->name, secret); + const bool got = keyring->get_secret(cct->_conf->name, secret); + if (!got) { + ldout(cct, 0) << "key not found for " << cct->_conf->name << dendl; + return -ENOENT; + } if (!tickets.verify_service_ticket_reply(secret, indata)) { ldout(cct, 0) << "could not verify service_ticket reply" << dendl; @@ -150,7 +158,11 @@ int CephxClientHandler::handle_response(int ret, bufferlist::iterator& indata) if (rotating_secrets) { RotatingSecrets secrets; CryptoKey secret_key; - keyring->get_secret(cct->_conf->name, secret_key); + const bool got = keyring->get_secret(cct->_conf->name, secret_key); + if (!got) { + ldout(cct, 0) << "key not found for " << cct->_conf->name << dendl; + return -ENOENT; + } std::string error; if (decode_decrypt(cct, secrets, secret_key, indata, error)) { ldout(cct, 0) << "could not set rotating key: decode_decrypt failed. 
error:" diff --git a/src/auth/cephx/CephxKeyServer.cc b/src/auth/cephx/CephxKeyServer.cc index b2c0c672aedeb..81c0a66b679cc 100644 --- a/src/auth/cephx/CephxKeyServer.cc +++ b/src/auth/cephx/CephxKeyServer.cc @@ -268,7 +268,7 @@ bool KeyServer::generate_secret(CryptoKey& secret) if (crypto->create(bp) < 0) return false; - secret.set_secret(cct, CEPH_CRYPTO_AES, bp); + secret.set_secret(CEPH_CRYPTO_AES, bp, ceph_clock_now(NULL)); return true; } diff --git a/src/auth/cephx/CephxProtocol.cc b/src/auth/cephx/CephxProtocol.cc index f57f06358644d..f2a00dd75a05d 100644 --- a/src/auth/cephx/CephxProtocol.cc +++ b/src/auth/cephx/CephxProtocol.cc @@ -25,14 +25,13 @@ void cephx_calc_client_server_challenge(CephContext *cct, CryptoKey& secret, uint64_t server_challenge, - uint64_t client_challenge, uint64_t *key, std::string &ret) + uint64_t client_challenge, uint64_t *key, std::string &error) { CephXChallengeBlob b; b.server_challenge = server_challenge; b.client_challenge = client_challenge; bufferlist enc; - std::string error; if (encode_encrypt(cct, b, secret, enc, error)) return; diff --git a/src/auth/cephx/CephxProtocol.h b/src/auth/cephx/CephxProtocol.h index d72a23d87d61f..f08f07d84b9ff 100644 --- a/src/auth/cephx/CephxProtocol.h +++ b/src/auth/cephx/CephxProtocol.h @@ -433,8 +433,7 @@ void decode_decrypt_enc_bl(CephContext *cct, T& t, CryptoKey key, bufferlist& bl uint64_t magic; bufferlist bl; - key.decrypt(cct, bl_enc, bl, error); - if (!error.empty()) + if (key.decrypt(cct, bl_enc, bl, &error) < 0) return; bufferlist::iterator iter2 = bl.begin(); @@ -462,7 +461,7 @@ void encode_encrypt_enc_bl(CephContext *cct, const T& t, const CryptoKey& key, ::encode(magic, bl); ::encode(t, bl); - key.encrypt(cct, bl, out, error); + key.encrypt(cct, bl, out, &error); } template diff --git a/src/auth/cephx/CephxSessionHandler.cc b/src/auth/cephx/CephxSessionHandler.cc index b2d402d2af36b..eaebd152fb263 100644 --- a/src/auth/cephx/CephxSessionHandler.cc +++ b/src/auth/cephx/CephxSessionHandler.cc @@ -24,47 +24,65 @@ #define dout_subsys ceph_subsys_auth +int CephxSessionHandler::_calc_signature(Message *m, uint64_t *psig) +{ + const ceph_msg_header& header = m->get_header(); + const ceph_msg_footer& footer = m->get_footer(); + + // optimized signature calculation + // - avoid temporary allocated buffers from encode_encrypt[_enc_bl] + // - skip the leading 4 byte wrapper from encode_encrypt + struct { + __u8 v; + __le64 magic; + __le32 len; + __le32 header_crc; + __le32 front_crc; + __le32 middle_crc; + __le32 data_crc; + } __attribute__ ((packed)) sigblock = { + 1, AUTH_ENC_MAGIC, 4*4, + header.crc, footer.front_crc, footer.middle_crc, footer.data_crc + }; + bufferlist bl_plaintext; + bl_plaintext.append(buffer::create_static(sizeof(sigblock), (char*)&sigblock)); + + bufferlist bl_ciphertext; + if (key.encrypt(cct, bl_plaintext, bl_ciphertext, NULL) < 0) { + lderr(cct) << __func__ << " failed to encrypt signature block" << dendl; + return -1; + } + + bufferlist::iterator ci = bl_ciphertext.begin(); + ::decode(*psig, ci); + + ldout(cct, 10) << __func__ << " seq " << m->get_seq() + << " front_crc_ = " << footer.front_crc + << " middle_crc = " << footer.middle_crc + << " data_crc = " << footer.data_crc + << " sig = " << *psig + << dendl; + return 0; +} + int CephxSessionHandler::sign_message(Message *m) { // If runtime signing option is off, just return success without signing. 
if (!cct->_conf->cephx_sign_messages) { return 0; } - bufferlist bl_plaintext, bl_encrypted; - ceph_msg_header header = m->get_header(); - std::string error; - - ceph_msg_footer& en_footer = m->get_footer(); - - ::encode(header.crc, bl_plaintext); - ::encode(en_footer.front_crc, bl_plaintext); - ::encode(en_footer.middle_crc, bl_plaintext); - ::encode(en_footer.data_crc, bl_plaintext); - - ldout(cct, 10) << "sign_message: seq # " << header.seq << " CRCs are: header " << header.crc - << " front " << en_footer.front_crc << " middle " << en_footer.middle_crc - << " data " << en_footer.data_crc << dendl; - - if (encode_encrypt(cct, bl_plaintext, key, bl_encrypted, error)) { - ldout(cct, 0) << "error encrypting message signature: " << error << dendl; - ldout(cct, 0) << "no signature put on message" << dendl; - return SESSION_SIGNATURE_FAILURE; - } - - bufferlist::iterator ci = bl_encrypted.begin(); - // Skip the magic number up front. PLR - ci.advance(4); - ::decode(en_footer.sig, ci); - - // There's potentially an issue with whether the encoding and decoding done here will work - // properly when a big endian and little endian machine are talking. We think it's OK, - // but it should be tested to be sure. PLR - - // Receiver won't trust this flag to decide if msg should have been signed. It's primarily - // to debug problems where sender and receiver disagree on need to sign msg. PLR - en_footer.flags = (unsigned)en_footer.flags | CEPH_MSG_FOOTER_SIGNED; + + uint64_t sig; + int r = _calc_signature(m, &sig); + if (r < 0) + return r; + + ceph_msg_footer& f = m->get_footer(); + f.sig = sig; + f.flags = (unsigned)f.flags | CEPH_MSG_FOOTER_SIGNED; messages_signed++; - ldout(cct, 20) << "Putting signature in client message(seq # " << header.seq << "): sig = " << en_footer.sig << dendl; + ldout(cct, 20) << "Putting signature in client message(seq # " << m->get_seq() + << "): sig = " << sig << dendl; return 0; } @@ -74,57 +92,34 @@ int CephxSessionHandler::check_message_signature(Message *m) if (!cct->_conf->cephx_sign_messages) { return 0; } - - bufferlist bl_plaintext, bl_ciphertext; - std::string sig_error; - ceph_msg_header& header = m->get_header(); - ceph_msg_footer& footer = m->get_footer(); - if ((features & CEPH_FEATURE_MSG_AUTH) == 0) { // it's fine, we didn't negotiate this feature. return 0; } - signatures_checked++; + uint64_t sig; + int r = _calc_signature(m, &sig); + if (r < 0) + return r; - ldout(cct, 10) << "check_message_signature: seq # = " << m->get_seq() << " front_crc_ = " << footer.front_crc - << " middle_crc = " << footer.middle_crc << " data_crc = " << footer.data_crc << dendl; - ::encode(header.crc, bl_plaintext); - ::encode(footer.front_crc, bl_plaintext); - ::encode(footer.middle_crc, bl_plaintext); - ::encode(footer.data_crc, bl_plaintext); - - // Encrypt the buffer containing the checksums to calculate the signature. PLR - if (encode_encrypt(cct, bl_plaintext, key, bl_ciphertext, sig_error)) { - ldout(cct, 0) << "error in encryption for checking message signature: " << sig_error << dendl; - return (SESSION_SIGNATURE_FAILURE); - } - - bufferlist::iterator ci = bl_ciphertext.begin(); - // Skip the magic number at the front. PLR - ci.advance(4); - uint64_t sig_check; - ::decode(sig_check, ci); - - // There's potentially an issue with whether the encoding and decoding done here will work - // properly when a big endian and little endian machine are talking. We think it's OK, - // but it should be tested to be sure. 
PLR + signatures_checked++; - if (sig_check != footer.sig) { + if (sig != m->get_footer().sig) { // Should have been signed, but signature check failed. PLR - if (!(footer.flags & CEPH_MSG_FOOTER_SIGNED)) { - ldout(cct, 0) << "SIGN: MSG " << header.seq << " Sender did not set CEPH_MSG_FOOTER_SIGNED." << dendl; + if (!(m->get_footer().flags & CEPH_MSG_FOOTER_SIGNED)) { + ldout(cct, 0) << "SIGN: MSG " << m->get_seq() << " Sender did not set CEPH_MSG_FOOTER_SIGNED." << dendl; } - ldout(cct, 0) << "SIGN: MSG " << header.seq << " Message signature does not match contents." << dendl; - ldout(cct, 0) << "SIGN: MSG " << header.seq << "Signature on message:" << dendl; - ldout(cct, 0) << "SIGN: MSG " << header.seq << " sig: " << footer.sig << dendl; - ldout(cct, 0) << "SIGN: MSG " << header.seq << "Locally calculated signature:" << dendl; - ldout(cct, 0) << "SIGN: MSG " << header.seq << " sig_check:" << sig_check << dendl; - - // For the moment, printing an error message to the log and returning failure is sufficient. - // In the long term, we should probably have code parsing the log looking for this kind - // of security failure, particularly when there are large numbers of them, since the latter - // is a potential sign of an attack. PLR + ldout(cct, 0) << "SIGN: MSG " << m->get_seq() << " Message signature does not match contents." << dendl; + ldout(cct, 0) << "SIGN: MSG " << m->get_seq() << "Signature on message:" << dendl; + ldout(cct, 0) << "SIGN: MSG " << m->get_seq() << " sig: " << m->get_footer().sig << dendl; + ldout(cct, 0) << "SIGN: MSG " << m->get_seq() << "Locally calculated signature:" << dendl; + ldout(cct, 0) << "SIGN: MSG " << m->get_seq() << " sig_check:" << sig << dendl; + + // For the moment, printing an error message to the log and + // returning failure is sufficient. In the long term, we should + // probably have code parsing the log looking for this kind of + // security failure, particularly when there are large numbers of + // them, since the latter is a potential sign of an attack. PLR signatures_failed++; ldout(cct, 0) << "Signature failed." << dendl; diff --git a/src/auth/cephx/CephxSessionHandler.h b/src/auth/cephx/CephxSessionHandler.h index 52a112e29a29e..7b46e076b8a20 100644 --- a/src/auth/cephx/CephxSessionHandler.h +++ b/src/auth/cephx/CephxSessionHandler.h @@ -31,8 +31,9 @@ class CephxSessionHandler : public AuthSessionHandler { return false; } - int sign_message(Message *m); + int _calc_signature(Message *m, uint64_t *psig); + int sign_message(Message *m); int check_message_signature(Message *m) ; // Cephx does not currently encrypt messages, so just return 0 if called. 
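The packed sigblock built by _calc_signature() above reproduces, byte for byte, what the old code assembled through ::encode() calls plus encode_encrypt()'s leading version/magic/length wrapper, which is why no temporary bufferlists are needed. A standalone sketch of that layout; the magic value and the stubbed encryption step are illustrative (the real code uses __le types and encrypts the block with the session key, AES-CBC with a fixed IV):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    #define AUTH_ENC_MAGIC 0xff009cad8826aa55ull  // illustrative value; the
                                                   // real constant lives
                                                   // elsewhere in src/auth

    struct __attribute__((packed)) sigblock {
      uint8_t v;            // encoding version, always 1
      uint64_t magic;       // magic the old encode_encrypt path prepended
      uint32_t len;         // payload length: four 4-byte CRCs
      uint32_t header_crc, front_crc, middle_crc, data_crc;
    };

    int main() {
      struct sigblock sb = { 1, AUTH_ENC_MAGIC, 4 * 4,
                             0x1111, 0x2222, 0x3333, 0x4444 };
      printf("sigblock spans %zu bytes\n", sizeof(sb));  // 29, no padding
      unsigned char ciphertext[32];
      memset(ciphertext, 0xab, sizeof(ciphertext));      // stand-in for AES-CBC
      uint64_t sig;
      memcpy(&sig, ciphertext, sizeof(sig));  // first 8 ciphertext bytes
      printf("sig = 0x%016llx\n",             // become footer.sig
             (unsigned long long)sig);
      return 0;
    }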
PLR diff --git a/src/auth/none/AuthNoneServiceHandler.h b/src/auth/none/AuthNoneServiceHandler.h index 1c37d79e75514..c2bbbc8c8cf01 100644 --- a/src/auth/none/AuthNoneServiceHandler.h +++ b/src/auth/none/AuthNoneServiceHandler.h @@ -32,7 +32,6 @@ class AuthNoneServiceHandler : public AuthServiceHandler { return CEPH_AUTH_NONE; } int handle_request(bufferlist::iterator& indata, bufferlist& result_bl, uint64_t& global_id, AuthCapsInfo& caps, uint64_t *auid = NULL) { - assert(0); // shouldn't get called return 0; } void build_cephx_response_header(int request_type, int status, bufferlist& bl) { } diff --git a/src/brag/Makefile.am b/src/brag/Makefile.am index 35c735baf8fec..32456b01c751c 100644 --- a/src/brag/Makefile.am +++ b/src/brag/Makefile.am @@ -1,3 +1,4 @@ - +if ENABLE_CLIENT bin_SCRIPTS += brag/client/ceph-brag EXTRA_DIST += brag/server brag/README.md brag/client +endif # ENABLE_CLIENT diff --git a/src/ceph-create-keys b/src/ceph-create-keys index 0359228d5f851..57eaf1744c1a6 100755 --- a/src/ceph-create-keys +++ b/src/ceph-create-keys @@ -217,6 +217,10 @@ def main(): cluster=args.cluster, type_='osd', ) + bootstrap_key( + cluster=args.cluster, + type_='rgw', + ) bootstrap_key( cluster=args.cluster, type_='mds', diff --git a/src/ceph-debugpack.in b/src/ceph-debugpack.in index 7be14e4429edc..9ac8f3bba798b 100644 --- a/src/ceph-debugpack.in +++ b/src/ceph-debugpack.in @@ -22,7 +22,7 @@ usage_exit() { wait_pid_exit() { pid=$1 - for i in {1..10}; do + for i in $(seq 10); do [ -e /proc/$pid ] || return sleep 1 done diff --git a/src/ceph-detect-init/.gitignore b/src/ceph-detect-init/.gitignore new file mode 100644 index 0000000000000..8c130b316655f --- /dev/null +++ b/src/ceph-detect-init/.gitignore @@ -0,0 +1,13 @@ +*~ +*.pyc +*.pyo +.coverage +.tox +*.egg-info +*.egg +dist +build +wheelhouse* +*.log +*.trs + diff --git a/src/ceph-detect-init/AUTHORS.rst b/src/ceph-detect-init/AUTHORS.rst new file mode 100644 index 0000000000000..3818d35db430e --- /dev/null +++ b/src/ceph-detect-init/AUTHORS.rst @@ -0,0 +1,2 @@ +- Owen Synge +- Loic Dachary diff --git a/src/ceph-detect-init/MANIFEST.in b/src/ceph-detect-init/MANIFEST.in new file mode 100644 index 0000000000000..23abe0d38f4f1 --- /dev/null +++ b/src/ceph-detect-init/MANIFEST.in @@ -0,0 +1 @@ +include AUTHORS.rst diff --git a/src/ceph-detect-init/Makefile.am b/src/ceph-detect-init/Makefile.am new file mode 100644 index 0000000000000..a2c885aae697a --- /dev/null +++ b/src/ceph-detect-init/Makefile.am @@ -0,0 +1,72 @@ +# +# Copyright (C) 2015 SUSE LINUX GmbH +# Copyright (C) 2015 +# +# Author: Owen Synge +# Author: Loic Dachary +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see ``. 
+# +check_SCRIPTS += ceph-detect-init/run-tox.sh + +EXTRA_DIST += \ + ceph-detect-init/AUTHORS.rst \ + ceph-detect-init/ceph_detect_init/centos/__init__.py \ + ceph-detect-init/ceph_detect_init/exc.py \ + ceph-detect-init/ceph_detect_init/main.py \ + ceph-detect-init/ceph_detect_init/__init__.py \ + ceph-detect-init/ceph_detect_init/rhel/__init__.py \ + ceph-detect-init/ceph_detect_init/fedora/__init__.py \ + ceph-detect-init/ceph_detect_init/debian/__init__.py \ + ceph-detect-init/ceph_detect_init/suse/__init__.py \ + ceph-detect-init/integration/centos-6.dockerfile \ + ceph-detect-init/integration/debian-wheezy.dockerfile \ + ceph-detect-init/integration/debian-sid.dockerfile \ + ceph-detect-init/integration/debian-jessie.dockerfile \ + ceph-detect-init/integration/opensuse-13.1.dockerfile \ + ceph-detect-init/integration/fedora-21.dockerfile \ + ceph-detect-init/integration/ubuntu-14.04.dockerfile \ + ceph-detect-init/integration/test_main.py \ + ceph-detect-init/integration/opensuse-13.2.dockerfile \ + ceph-detect-init/integration/ubuntu-12.04.dockerfile \ + ceph-detect-init/integration/centos-7.dockerfile \ + ceph-detect-init/integration/ubuntu-15.04.dockerfile \ + ceph-detect-init/integration/debian-squeeze.dockerfile \ + ceph-detect-init/Makefile.am \ + ceph-detect-init/MANIFEST.in \ + ceph-detect-init/README.rst \ + ceph-detect-init/requirements.txt \ + ceph-detect-init/run-tox.sh \ + ceph-detect-init/setup.py \ + ceph-detect-init/test-requirements.txt \ + ceph-detect-init/tests/test_all.py \ + ceph-detect-init/tox.ini + +all-local:: + cd $(srcdir)/ceph-detect-init ; python setup.py build + +clean-local:: + cd $(srcdir)/ceph-detect-init ; python setup.py clean ; rm -fr wheelhouse .tox build .coverage *.egg-info + +install-data-local:: + cd $(srcdir)/ceph-detect-init ; \ + if test "$(DESTDIR)" ; then \ + if lsb_release -si | grep --quiet 'Ubuntu\|Debian\|Devuan' ; then \ + options=--install-layout=deb ; \ + else \ + options=--prefix=/usr ; \ + fi ; \ + root="--root=$(DESTDIR)" ; \ + fi ; \ + python setup.py install $$root $$options diff --git a/src/ceph-detect-init/README.rst b/src/ceph-detect-init/README.rst new file mode 100644 index 0000000000000..e40f22fd9aaf0 --- /dev/null +++ b/src/ceph-detect-init/README.rst @@ -0,0 +1,28 @@ +ceph-detect-init +================ + +ceph-detect-init is a command line tool that displays a normalized +string describing the init system of the host on which it is running: + +Home page : https://pypi.python.org/pypi/ceph-detect-init + +Hacking +======= + +* Get the code : git clone https://git.ceph.com/ceph.git +* Run the unit tests : tox +* Run the integration tests (requires docker) : tox -e integration +* Check the documentation : rst2html < README.rst > /tmp/a.html +* Prepare a new version + + - version=1.0.0 ; perl -pi -e "s/^version.*/version='$version',/" setup.py ; do python setup.py sdist ; amend=$(git log -1 --oneline | grep --quiet "version $version" && echo --amend) ; git commit $amend -m "version $version" setup.py ; git tag -a -f -m "version $version" $version ; done + +* Publish a new version + + - python setup.py sdist upload --sign + - git push ; git push --tags + +* pypi maintenance + + - python setup.py register # if the project does not yet exist + - trim old versions at https://pypi.python.org/pypi/ceph-detect-init diff --git a/src/ceph-detect-init/ceph_detect_init/__init__.py b/src/ceph-detect-init/ceph_detect_init/__init__.py new file mode 100644 index 0000000000000..cc9b2c0a28d64 --- /dev/null +++ 
b/src/ceph-detect-init/ceph_detect_init/__init__.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python +# +# Copyright (C) 2015 +# +# Author: Alfredo Deza +# Author: Loic Dachary +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +# GNU Library Public License for more details. +# +from ceph_detect_init import centos +from ceph_detect_init import debian +from ceph_detect_init import exc +from ceph_detect_init import fedora +from ceph_detect_init import rhel +from ceph_detect_init import suse +import logging +import platform + + +def get(use_rhceph=False): + distro_name, release, codename = platform_information() + if not codename or not _get_distro(distro_name): + raise exc.UnsupportedPlatform( + distro=distro_name, + codename=codename, + release=release) + + module = _get_distro(distro_name, use_rhceph=use_rhceph) + module.name = distro_name + module.normalized_name = _normalized_distro_name(distro_name) + module.distro = module.normalized_name + module.is_el = module.normalized_name in ['redhat', 'centos', + 'fedora', 'scientific'] + module.release = release + module.codename = codename + module.init = module.choose_init() + return module + + +def _get_distro(distro, use_rhceph=False): + if not distro: + return + + distro = _normalized_distro_name(distro) + distributions = { + 'debian': debian, + 'ubuntu': debian, + 'linuxmint': debian, + 'centos': centos, + 'scientific': centos, + 'redhat': centos, + 'fedora': fedora, + 'suse': suse, + } + + if distro == 'redhat' and use_rhceph: + return rhel + else: + return distributions.get(distro) + + +def _normalized_distro_name(distro): + distro = distro.lower() + if distro.startswith(('redhat', 'red hat')): + return 'redhat' + elif distro.startswith(('scientific', 'scientific linux')): + return 'scientific' + elif distro.startswith(('suse', 'opensuse')): + return 'suse' + elif distro.startswith('centos'): + return 'centos' + return distro + + +def platform_information(): + """detect platform information from the local host.""" + logging.debug('platform_information: linux_distribution = ' + + str(platform.linux_distribution())) + distro, release, codename = platform.linux_distribution() + # this could be an empty string in Debian + if not codename and 'debian' in distro.lower(): + debian_codenames = { + '8': 'jessie', + '7': 'wheezy', + '6': 'squeeze', + } + major_version = release.split('.')[0] + codename = debian_codenames.get(major_version, '') + + # In order to support newer jessie/sid or wheezy/sid strings, + # we test this: if sid is buried in the minor version, we should + # use sid anyway.
+ if not codename and '/' in release: + major, minor = release.split('/') + if minor == 'sid': + codename = minor + else: + codename = major + + return ( + str(distro).rstrip(), + str(release).rstrip(), + str(codename).rstrip() + ) diff --git a/src/ceph-detect-init/ceph_detect_init/centos/__init__.py b/src/ceph-detect-init/ceph_detect_init/centos/__init__.py new file mode 100644 index 0000000000000..f7bf85beda8c4 --- /dev/null +++ b/src/ceph-detect-init/ceph_detect_init/centos/__init__.py @@ -0,0 +1,11 @@ +distro = None +release = None +codename = None + + +def choose_init(): + """Select an init system + + Returns the name of an init system (upstart, sysvinit ...). + """ + return 'sysvinit' diff --git a/src/ceph-detect-init/ceph_detect_init/debian/__init__.py b/src/ceph-detect-init/ceph_detect_init/debian/__init__.py new file mode 100644 index 0000000000000..7518562e2f31c --- /dev/null +++ b/src/ceph-detect-init/ceph_detect_init/debian/__init__.py @@ -0,0 +1,13 @@ +distro = None +release = None +codename = None + + +def choose_init(): + """Select an init system + + Returns the name of an init system (upstart, sysvinit ...). + """ + if distro.lower() == 'ubuntu' or distro.lower() == 'linuxmint': + return 'upstart' + return 'sysvinit' diff --git a/src/ceph-detect-init/ceph_detect_init/exc.py b/src/ceph-detect-init/ceph_detect_init/exc.py new file mode 100644 index 0000000000000..61d9752ff7f87 --- /dev/null +++ b/src/ceph-detect-init/ceph_detect_init/exc.py @@ -0,0 +1,35 @@ +# +# Copyright (C) 2015 +# +# Author: Alfredo Deza +# Author: Loic Dachary +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program.  If not, see ``. +# + + +class UnsupportedPlatform(Exception): + """Platform is not supported.""" + def __init__(self, distro, codename, release): + self.distro = distro + self.codename = codename + self.release = release + + def __str__(self): + return '{doc}: {distro} {codename} {release}'.format( + doc=self.__doc__.strip(), + distro=self.distro, + codename=self.codename, + release=self.release, + ) diff --git a/src/ceph-detect-init/ceph_detect_init/fedora/__init__.py b/src/ceph-detect-init/ceph_detect_init/fedora/__init__.py new file mode 100644 index 0000000000000..f7bf85beda8c4 --- /dev/null +++ b/src/ceph-detect-init/ceph_detect_init/fedora/__init__.py @@ -0,0 +1,11 @@ +distro = None +release = None +codename = None + + +def choose_init(): + """Select an init system + + Returns the name of an init system (upstart, sysvinit ...). 
+ """ + return 'sysvinit' diff --git a/src/ceph-detect-init/ceph_detect_init/main.py b/src/ceph-detect-init/ceph_detect_init/main.py new file mode 100644 index 0000000000000..320ae1703206d --- /dev/null +++ b/src/ceph-detect-init/ceph_detect_init/main.py @@ -0,0 +1,63 @@ +#!/usr/bin/env python +# +# Copyright (C) 2015 +# Copyright (C) 2015 SUSE LINUX GmbH +# +# Author: Alfredo Deza +# Author: Owen Synge +# Author: Loic Dachary +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library Public License for more details. +# +import argparse +import logging + +import ceph_detect_init +from ceph_detect_init import exc + + +def parser(): + parser = argparse.ArgumentParser( + 'ceph-detect-init', + ) + parser.add_argument( + "-v", + "--verbose", + action="store_true", + default=None, + ) + parser.add_argument( + "--use-rhceph", + action="store_true", + default=False, + ) + parser.add_argument( + "--default", + default=None, + ) + return parser + + +def run(argv=None, namespace=None): + args = parser().parse_args(argv, namespace) + + if args.verbose: + logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s', + level=logging.DEBUG) + try: + print(ceph_detect_init.get(args.use_rhceph).init) + except exc.UnsupportedPlatform: + if args.default: + print(args.default) + else: + raise + + return 0 diff --git a/src/ceph-detect-init/ceph_detect_init/rhel/__init__.py b/src/ceph-detect-init/ceph_detect_init/rhel/__init__.py new file mode 100644 index 0000000000000..f7bf85beda8c4 --- /dev/null +++ b/src/ceph-detect-init/ceph_detect_init/rhel/__init__.py @@ -0,0 +1,11 @@ +distro = None +release = None +codename = None + + +def choose_init(): + """Select a init system + + Returns the name of a init system (upstart, sysvinit ...). + """ + return 'sysvinit' diff --git a/src/ceph-detect-init/ceph_detect_init/suse/__init__.py b/src/ceph-detect-init/ceph_detect_init/suse/__init__.py new file mode 100644 index 0000000000000..69bf7c481e72f --- /dev/null +++ b/src/ceph-detect-init/ceph_detect_init/suse/__init__.py @@ -0,0 +1,17 @@ +distro = None +release = None +codename = None + + +def choose_init(): + """Select a init system + + Returns the name of a init system (upstart, sysvinit ...). 
+ """ + init_mapping = { + '11': 'sysvinit', # SLE_11 + '12': 'systemd', # SLE_12 + '13.1': 'systemd', # openSUSE_13.1 + '13.2': 'systemd', # openSUSE_13.2 + } + return init_mapping.get(release, 'sysvinit') diff --git a/src/ceph-detect-init/integration/centos-6.dockerfile b/src/ceph-detect-init/integration/centos-6.dockerfile new file mode 100644 index 0000000000000..7cb5095fefbcc --- /dev/null +++ b/src/ceph-detect-init/integration/centos-6.dockerfile @@ -0,0 +1,4 @@ +FROM centos:6 + +RUN yum install -y yum-utils && yum-config-manager --add-repo https://dl.fedoraproject.org/pub/epel/6/x86_64/ && yum install --nogpgcheck -y epel-release && rpm --import /etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-6 && rm /etc/yum.repos.d/dl.fedoraproject.org* +RUN yum install -y python-pip python-virtualenv git diff --git a/src/ceph-detect-init/integration/centos-7.dockerfile b/src/ceph-detect-init/integration/centos-7.dockerfile new file mode 100644 index 0000000000000..59a5748ebaf91 --- /dev/null +++ b/src/ceph-detect-init/integration/centos-7.dockerfile @@ -0,0 +1,4 @@ +FROM centos:7 + +RUN yum install -y yum-utils && yum-config-manager --add-repo https://dl.fedoraproject.org/pub/epel/7/x86_64/ && yum install --nogpgcheck -y epel-release && rpm --import /etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-7 && rm /etc/yum.repos.d/dl.fedoraproject.org* +RUN yum install -y python-pip python-virtualenv git diff --git a/src/ceph-detect-init/integration/debian-jessie.dockerfile b/src/ceph-detect-init/integration/debian-jessie.dockerfile new file mode 100644 index 0000000000000..bca22d52009e1 --- /dev/null +++ b/src/ceph-detect-init/integration/debian-jessie.dockerfile @@ -0,0 +1,6 @@ +FROM debian:jessie + +RUN apt-get update +RUN apt-get install -y python-virtualenv python-pip git + + diff --git a/src/ceph-detect-init/integration/debian-sid.dockerfile b/src/ceph-detect-init/integration/debian-sid.dockerfile new file mode 100644 index 0000000000000..00e44721bd3ef --- /dev/null +++ b/src/ceph-detect-init/integration/debian-sid.dockerfile @@ -0,0 +1,4 @@ +FROM debian:sid + +RUN apt-get update +RUN apt-get install -y python-virtualenv python-pip git diff --git a/src/ceph-detect-init/integration/debian-squeeze.dockerfile b/src/ceph-detect-init/integration/debian-squeeze.dockerfile new file mode 100644 index 0000000000000..e5080f6654ed3 --- /dev/null +++ b/src/ceph-detect-init/integration/debian-squeeze.dockerfile @@ -0,0 +1,4 @@ +FROM debian:squeeze + +RUN apt-get update +RUN apt-get install -y python-virtualenv python-pip git diff --git a/src/ceph-detect-init/integration/debian-wheezy.dockerfile b/src/ceph-detect-init/integration/debian-wheezy.dockerfile new file mode 100644 index 0000000000000..e03e30e454bfb --- /dev/null +++ b/src/ceph-detect-init/integration/debian-wheezy.dockerfile @@ -0,0 +1,4 @@ +FROM debian:wheezy + +RUN apt-get update +RUN apt-get install -y python-virtualenv python-pip git diff --git a/src/ceph-detect-init/integration/fedora-21.dockerfile b/src/ceph-detect-init/integration/fedora-21.dockerfile new file mode 100644 index 0000000000000..ee2ac93695d03 --- /dev/null +++ b/src/ceph-detect-init/integration/fedora-21.dockerfile @@ -0,0 +1,3 @@ +FROM fedora:21 + +RUN yum install -y python-pip python-virtualenv git diff --git a/src/ceph-detect-init/integration/opensuse-13.1.dockerfile b/src/ceph-detect-init/integration/opensuse-13.1.dockerfile new file mode 100644 index 0000000000000..00a5a28569bc3 --- /dev/null +++ b/src/ceph-detect-init/integration/opensuse-13.1.dockerfile @@ -0,0 +1,3 @@ +FROM opensuse:13.1 + +RUN zypper 
--non-interactive --gpg-auto-import-keys install lsb python-pip python-virtualenv git diff --git a/src/ceph-detect-init/integration/opensuse-13.2.dockerfile b/src/ceph-detect-init/integration/opensuse-13.2.dockerfile new file mode 100644 index 0000000000000..26f591b73c429 --- /dev/null +++ b/src/ceph-detect-init/integration/opensuse-13.2.dockerfile @@ -0,0 +1,3 @@ +FROM opensuse:13.2 + +RUN zypper --non-interactive --gpg-auto-import-keys install python-pip python-virtualenv git diff --git a/src/ceph-detect-init/integration/test_main.py b/src/ceph-detect-init/integration/test_main.py new file mode 100644 index 0000000000000..e7a620e7b28c5 --- /dev/null +++ b/src/ceph-detect-init/integration/test_main.py @@ -0,0 +1,94 @@ +#!/usr/bin/env python +# +# Copyright (C) 2015 SUSE LINUX GmbH +# Copyright (C) 2015 +# +# Author: Owen Synge +# Author: Loic Dachary +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library Public License for more details. +# +import logging +import shutil +import subprocess +import testtools + +from ceph_detect_init import main + +logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s', + level=logging.DEBUG) + + +def run(os): + name = 'ceph-detect-init-' + os + shutil.rmtree(name, ignore_errors=True) + script = """\ +docker build -t {name} --file integration/{os}.dockerfile . +toplevel=$(git rev-parse --show-toplevel) +mkdir {name} +cat > {name}/try.sh < {name}/init +EOF + +docker run -v $toplevel:$toplevel -w $(pwd) --user $(id -u) {name} bash -x {name}/try.sh +""".format(name=name, + os=os) + subprocess.check_call(script, shell=True) + init = open(name + '/init').read().strip() + shutil.rmtree(name) + return init + + +class TestCephDetectInit(testtools.TestCase): + + def test_centos_6(self): + self.assertEqual('sysvinit', run('centos-6')) + + def test_centos_7(self): + self.assertEqual('sysvinit', run('centos-7')) + + def test_ubuntu_12_04(self): + self.assertEqual('upstart', run('ubuntu-12.04')) + + def test_ubuntu_14_04(self): + self.assertEqual('upstart', run('ubuntu-14.04')) + + def test_ubuntu_15_04(self): + self.assertEqual('upstart', run('ubuntu-15.04')) + + def test_debian_squeeze(self): + self.assertEqual('sysvinit', run('debian-squeeze')) + + def test_debian_wheezy(self): + self.assertEqual('sysvinit', run('debian-wheezy')) + + def test_debian_jessie(self): + self.assertEqual('sysvinit', run('debian-jessie')) + + def test_debian_sid(self): + self.assertEqual('sysvinit', run('debian-sid')) + + def test_fedora_21(self): + self.assertEqual('sysvinit', run('fedora-21')) + + def test_opensuse_13_1(self): + self.assertEqual('systemd', run('opensuse-13.1')) + + def test_opensuse_13_2(self): + self.assertEqual('systemd', run('opensuse-13.2')) + +# Local Variables: +# compile-command: "cd .. 
; .tox/py27/bin/py.test integration/test_main.py" +# End: diff --git a/src/ceph-detect-init/integration/ubuntu-12.04.dockerfile b/src/ceph-detect-init/integration/ubuntu-12.04.dockerfile new file mode 100644 index 0000000000000..dda1a627460ba --- /dev/null +++ b/src/ceph-detect-init/integration/ubuntu-12.04.dockerfile @@ -0,0 +1,4 @@ +FROM ubuntu:12.04 + +RUN apt-get update +RUN apt-get install -y python-virtualenv python-pip git diff --git a/src/ceph-detect-init/integration/ubuntu-14.04.dockerfile b/src/ceph-detect-init/integration/ubuntu-14.04.dockerfile new file mode 100644 index 0000000000000..4f7a698227fd0 --- /dev/null +++ b/src/ceph-detect-init/integration/ubuntu-14.04.dockerfile @@ -0,0 +1,6 @@ +FROM ubuntu:14.04 + +RUN apt-get update +# http://stackoverflow.com/questions/27341064/how-do-i-fix-importerror-cannot-import-name-incompleteread +RUN apt-get install -y python-setuptools && easy_install -U pip +RUN apt-get install -y python-virtualenv git diff --git a/src/ceph-detect-init/integration/ubuntu-15.04.dockerfile b/src/ceph-detect-init/integration/ubuntu-15.04.dockerfile new file mode 100644 index 0000000000000..29b5776054497 --- /dev/null +++ b/src/ceph-detect-init/integration/ubuntu-15.04.dockerfile @@ -0,0 +1,4 @@ +FROM ubuntu:15.04 + +RUN apt-get update +RUN apt-get install -y python-pip python-virtualenv git diff --git a/src/ceph-detect-init/requirements.txt b/src/ceph-detect-init/requirements.txt new file mode 100644 index 0000000000000..1352d5e6f1935 --- /dev/null +++ b/src/ceph-detect-init/requirements.txt @@ -0,0 +1 @@ +argparse diff --git a/src/ceph-detect-init/run-tox.sh b/src/ceph-detect-init/run-tox.sh new file mode 100755 index 0000000000000..206938e287677 --- /dev/null +++ b/src/ceph-detect-init/run-tox.sh @@ -0,0 +1,36 @@ +#!/bin/bash +# +# Copyright (C) 2015 SUSE LINUX GmbH +# Copyright (C) 2015 +# +# Author: Owen Synge +# Author: Loic Dachary +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library Public License for more details. +# + +# run from the ceph-detect-init directory or from its parent +test -d ceph-detect-init && cd ceph-detect-init +trap "rm -fr make-check" EXIT +virtualenv make-check +. make-check/bin/activate +# older versions of pip will not install wrap_console scripts +# when using wheel packages +pip --log make-check/log.txt install --upgrade 'pip >= 6.1' +if test -d wheelhouse ; then + export NO_INDEX=--no-index +fi +pip --log make-check/log.txt install $NO_INDEX --use-wheel --find-links=file://$(pwd)/wheelhouse --upgrade distribute +pip --log make-check/log.txt install $NO_INDEX --use-wheel --find-links=file://$(pwd)/wheelhouse 'tox >=1.9' +tox > make-check/tox.out 2>&1 +status=$? 
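The docker-based cases in integration/test_main.py above boil down to a fixed OS-to-init matrix. The same expectations can be expressed as one parametrized test (a sketch, assuming pytest is available and the integration directory is importable as a package):

    import pytest

    from integration.test_main import run  # the docker helper shown above

    EXPECTED_INIT = {
        'centos-6': 'sysvinit',
        'centos-7': 'sysvinit',
        'ubuntu-14.04': 'upstart',
        'debian-jessie': 'sysvinit',
        'fedora-21': 'sysvinit',
        'opensuse-13.2': 'systemd',
    }

    @pytest.mark.parametrize('os_name,init', sorted(EXPECTED_INIT.items()))
    def test_detected_init(os_name, init):
        assert init == run(os_name)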
+grep -v InterpreterNotFound < make-check/tox.out +exit $status diff --git a/src/ceph-detect-init/setup.py b/src/ceph-detect-init/setup.py new file mode 100644 index 0000000000000..dea9637d60ad3 --- /dev/null +++ b/src/ceph-detect-init/setup.py @@ -0,0 +1,79 @@ +# +# Copyright (C) 2015 SUSE LINUX GmbH +# Copyright (C) 2015 +# +# Author: Owen Synge +# Author: Loic Dachary +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see ``. +# +import os +import sys +from setuptools import setup +from setuptools import find_packages + +def read(fname): + path = os.path.join(os.path.dirname(__file__), fname) + f = open(path) + return f.read() + + +def filter_included_modules(*m): + modules = sum(m, []) + if sys.version_info[0] == 2 and sys.version_info[1] <= 6: + return modules + included_modules = set(['argparse', 'importlib', 'sysconfig']) + return list(set(modules) - included_modules) + + +install_requires = read('requirements.txt').split() +tests_require = read('test-requirements.txt').split() + +setup( + name='ceph-detect-init', + version='1.0.1', + packages=find_packages(), + + author='Owen Synge, Loic Dachary', + author_email='osynge@suse.de, loic@dachary.org', + description='display the normalized name of the init system', + long_description=read('README.rst'), + license='LGPLv2+', + keywords='ceph', + url="https://git.ceph.com/?p=ceph.git;a=summary", + + install_requires=filter_included_modules(['setuptools'], + install_requires), + tests_require=filter_included_modules(tests_require), + + classifiers=[ + 'Environment :: Console', + 'Intended Audience :: Information Technology', + 'Intended Audience :: System Administrators', + 'Operating System :: POSIX :: Linux', + 'License :: OSI Approved :: GNU Lesser General Public License v2 or later (LGPLv2+)', + 'Programming Language :: Python', + 'Programming Language :: Python :: 2', + 'Programming Language :: Python :: 3', + 'Topic :: Utilities', + ], + + entry_points={ + + 'console_scripts': [ + 'ceph-detect-init = ceph_detect_init.main:run', + ], + + }, + ) diff --git a/src/ceph-detect-init/test-requirements.txt b/src/ceph-detect-init/test-requirements.txt new file mode 100644 index 0000000000000..5a0761c27abc1 --- /dev/null +++ b/src/ceph-detect-init/test-requirements.txt @@ -0,0 +1,10 @@ +coverage>=3.6 +discover +fixtures>=0.3.14 +python-subunit +testrepository>=0.0.17 +testtools>=0.9.32 +mock +pytest +tox +flake8 diff --git a/src/ceph-detect-init/tests/test_all.py b/src/ceph-detect-init/tests/test_all.py new file mode 100644 index 0000000000000..68189bf0187b8 --- /dev/null +++ b/src/ceph-detect-init/tests/test_all.py @@ -0,0 +1,162 @@ +# +# Copyright (C) 2015 SUSE LINUX GmbH +# Copyright (C) 2015 +# +# Author: Owen Synge +# Author: Loic Dachary +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your 
option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see ``. +# +import logging +import mock +import testtools + +import ceph_detect_init +from ceph_detect_init import centos +from ceph_detect_init import debian +from ceph_detect_init import exc +from ceph_detect_init import fedora +from ceph_detect_init import main +from ceph_detect_init import rhel +from ceph_detect_init import suse + +logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s', + level=logging.DEBUG) + + +class TestCephDetectInit(testtools.TestCase): + + def test_centos(self): + self.assertEqual('sysvinit', centos.choose_init()) + + def test_debian(self): + with mock.patch('ceph_detect_init.debian.distro', + 'debian'): + self.assertEqual('sysvinit', debian.choose_init()) + with mock.patch('ceph_detect_init.debian.distro', + 'ubuntu'): + self.assertEqual('upstart', debian.choose_init()) + + def test_fedora(self): + self.assertEqual('sysvinit', fedora.choose_init()) + + def test_rhel(self): + self.assertEqual('sysvinit', rhel.choose_init()) + + def test_suse(self): + with mock.patch('ceph_detect_init.suse.release', + '11'): + self.assertEqual('sysvinit', suse.choose_init()) + with mock.patch('ceph_detect_init.suse.release', + '12'): + self.assertEqual('systemd', suse.choose_init()) + with mock.patch('ceph_detect_init.suse.release', + '13.1'): + self.assertEqual('systemd', suse.choose_init()) + with mock.patch('ceph_detect_init.suse.release', + '13.2'): + self.assertEqual('systemd', suse.choose_init()) + + def test_get(self): + g = ceph_detect_init.get + with mock.patch('platform.linux_distribution', + lambda: (('unknown', '', ''))): + self.assertRaises(exc.UnsupportedPlatform, g) + try: + g() + except exc.UnsupportedPlatform as e: + self.assertIn('Platform is not supported', str(e)) + + with mock.patch('platform.linux_distribution', + lambda: (('debian', '6.0', ''))): + distro = ceph_detect_init.get() + self.assertEqual(debian, distro) + self.assertEqual('debian', distro.name) + self.assertEqual('debian', distro.normalized_name) + self.assertEqual('debian', distro.distro) + self.assertEqual(False, distro.is_el) + self.assertEqual('6.0', distro.release) + self.assertEqual('squeeze', distro.codename) + self.assertEqual('sysvinit', distro.init) + + def test_get_distro(self): + g = ceph_detect_init._get_distro + self.assertEqual(None, g(None)) + self.assertEqual(debian, g('debian')) + self.assertEqual(debian, g('ubuntu')) + self.assertEqual(centos, g('centos')) + self.assertEqual(centos, g('scientific')) + self.assertEqual(fedora, g('fedora')) + self.assertEqual(suse, g('suse')) + self.assertEqual(rhel, g('redhat', use_rhceph=True)) + + def test_normalized_distro_name(self): + n = ceph_detect_init._normalized_distro_name + self.assertEqual('redhat', n('RedHat')) + self.assertEqual('redhat', n('redhat')) + self.assertEqual('redhat', n('Red Hat')) + self.assertEqual('redhat', n('red hat')) + self.assertEqual('scientific', n('scientific')) + self.assertEqual('scientific', n('Scientific')) + self.assertEqual('scientific', n('Scientific Linux')) + self.assertEqual('scientific', n('scientific linux')) + self.assertEqual('suse', n('SUSE')) + self.assertEqual('suse', n('suse')) + 
self.assertEqual('suse', n('openSUSE')) + self.assertEqual('suse', n('opensuse')) + self.assertEqual('centos', n('CentOS')) + self.assertEqual('centos', n('centos')) + self.assertEqual('debian', n('Debian')) + self.assertEqual('debian', n('debian')) + self.assertEqual('ubuntu', n('Ubuntu')) + self.assertEqual('ubuntu', n('ubuntu')) + + def test_platform_information(self): + with mock.patch('platform.linux_distribution', + lambda: (('debian', '6.0', ''))): + self.assertEqual(('debian', '6.0', 'squeeze'), + ceph_detect_init.platform_information()) + + with mock.patch('platform.linux_distribution', + lambda: (('debian', '7.0', ''))): + self.assertEqual(('debian', '7.0', 'wheezy'), + ceph_detect_init.platform_information()) + + with mock.patch('platform.linux_distribution', + lambda: (('debian', '8.0', ''))): + self.assertEqual(('debian', '8.0', 'jessie'), + ceph_detect_init.platform_information()) + + with mock.patch('platform.linux_distribution', + lambda: (('debian', 'jessie/sid', ''))): + self.assertEqual(('debian', 'jessie/sid', 'sid'), + ceph_detect_init.platform_information()) + + with mock.patch('platform.linux_distribution', + lambda: (('debian', 'sid/jessie', ''))): + self.assertEqual(('debian', 'sid/jessie', 'sid'), + ceph_detect_init.platform_information()) + + def test_run(self): + argv = ['--use-rhceph', '--verbose'] + self.assertEqual(0, main.run(argv)) + + with mock.patch('platform.linux_distribution', + lambda: (('unknown', '', ''))): + self.assertRaises(exc.UnsupportedPlatform, main.run, argv) + self.assertEqual(0, main.run(argv + ['--default=sysvinit'])) + +# Local Variables: +# compile-command: "cd .. ; .tox/py27/bin/py.test tests/test_all.py" +# End: diff --git a/src/ceph-detect-init/tox.ini b/src/ceph-detect-init/tox.ini new file mode 100644 index 0000000000000..3da7065864307 --- /dev/null +++ b/src/ceph-detect-init/tox.ini @@ -0,0 +1,31 @@ +[tox] +envlist = pep8,py27,py3 +skip_missing_interpreters = True + +[testenv] +basepython = + py27: python2.7 + py3: python3 +setenv = VIRTUAL_ENV={envdir} +usedevelop = true +deps = + {env:NO_INDEX:} + --use-wheel + --find-links=file://{toxinidir}/wheelhouse + -r{toxinidir}/requirements.txt + -r{toxinidir}/test-requirements.txt + +commands = coverage run --source=ceph_detect_init {envbindir}/py.test -v tests + coverage report --omit=*test*,*tox* --show-missing --fail-under=100 + +[testenv:pep8] +basepython = python2 +commands = flake8 ceph_detect_init tests + +[testenv:integration] +basepython = python2 +setenv = VIRTUAL_ENV={envdir} +deps = -r{toxinidir}/requirements.txt + -r{toxinidir}/test-requirements.txt + +commands = {envbindir}/py.test -v integration/test_main.py diff --git a/src/ceph-disk b/src/ceph-disk index 6072c7a38501f..cb19cafc28e39 100755 --- a/src/ceph-disk +++ b/src/ceph-disk @@ -2,6 +2,7 @@ # # Copyright (C) 2014 Inktank # Copyright (C) 2014 Cloudwatt +# Copyright (C) 2014 Catalyst.net Ltd # # Author: Loic Dachary # @@ -30,6 +31,7 @@ import sys import tempfile import uuid import time +import shlex """ Prepare: @@ -43,6 +45,7 @@ Prepare: - triggered by administrator or ceph-deploy, e.g. 'ceph-disk [journal disk] Activate: + - if encrypted, map the dmcrypt volume - mount the volume in a temp location - allocate an osd id (if needed) - remount in the correct location /var/lib/ceph/osd/$cluster-$id @@ -74,12 +77,15 @@ knew the GPT partition type. 
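The ceph-disk hunks that follow extend the GPT type-GUID scheme with LUKS variants. Collected in one place for reference (GUIDs copied from the constants below; the journal/data role wording follows the 'ceph-disk list' output added later in this patch, and the 'in preparation' labels are descriptive only):

    # GPT partition type GUID -> role; '...5ec00...' marks dm-crypt plain,
    # '...35865...' marks dm-crypt LUKS, and the 89c57f98 GUIDs tag
    # partitions that are still being prepared ('tobe').
    PTYPE_ROLE = {
        '45b0969e-9b03-4f30-b4c6-b4b80ceff106': 'ceph journal',
        '45b0969e-9b03-4f30-b4c6-5ec00ceff106': 'ceph journal (dmcrypt plain)',
        '45b0969e-9b03-4f30-b4c6-35865ceff106': 'ceph journal (dmcrypt LUKS)',
        '4fbd7e29-9d25-41b8-afd0-062c0ceff05d': 'ceph data',
        '4fbd7e29-9d25-41b8-afd0-5ec00ceff05d': 'ceph data (dmcrypt plain)',
        '4fbd7e29-9d25-41b8-afd0-35865ceff05d': 'ceph data (dmcrypt LUKS)',
        '89c57f98-2fe5-4dc0-89c1-f3ad0ceff2be': 'in preparation (TOBE_UUID)',
        '89c57f98-2fe5-4dc0-89c1-5ec00ceff2be': 'in preparation (DMCRYPT_TOBE_UUID)',
        '89c57f98-2fe5-4dc0-89c1-35865ceff2be': 'in preparation (DMCRYPT_JOURNAL_TOBE_UUID)',
    }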
CEPH_OSD_ONDISK_MAGIC = 'ceph osd volume v026' -JOURNAL_UUID = '45b0969e-9b03-4f30-b4c6-b4b80ceff106' -DMCRYPT_JOURNAL_UUID = '45b0969e-9b03-4f30-b4c6-5ec00ceff106' -OSD_UUID = '4fbd7e29-9d25-41b8-afd0-062c0ceff05d' -DMCRYPT_OSD_UUID = '4fbd7e29-9d25-41b8-afd0-5ec00ceff05d' -TOBE_UUID = '89c57f98-2fe5-4dc0-89c1-f3ad0ceff2be' -DMCRYPT_TOBE_UUID = '89c57f98-2fe5-4dc0-89c1-5ec00ceff2be' +JOURNAL_UUID = '45b0969e-9b03-4f30-b4c6-b4b80ceff106' +DMCRYPT_JOURNAL_UUID = '45b0969e-9b03-4f30-b4c6-5ec00ceff106' +DMCRYPT_LUKS_JOURNAL_UUID = '45b0969e-9b03-4f30-b4c6-35865ceff106' +OSD_UUID = '4fbd7e29-9d25-41b8-afd0-062c0ceff05d' +DMCRYPT_OSD_UUID = '4fbd7e29-9d25-41b8-afd0-5ec00ceff05d' +DMCRYPT_LUKS_OSD_UUID = '4fbd7e29-9d25-41b8-afd0-35865ceff05d' +TOBE_UUID = '89c57f98-2fe5-4dc0-89c1-f3ad0ceff2be' +DMCRYPT_TOBE_UUID = '89c57f98-2fe5-4dc0-89c1-5ec00ceff2be' +DMCRYPT_JOURNAL_TOBE_UUID = '89c57f98-2fe5-4dc0-89c1-35865ceff2be' DEFAULT_FS_TYPE = 'xfs' @@ -257,7 +263,10 @@ def which(executable): for location in locations: executable_path = os.path.join(location, executable) - if os.path.exists(executable_path): + LOG.debug('does ' + executable_path + ' exist?') + if (os.path.isfile(executable_path) and + os.access(executable_path, os.X_OK)): + LOG.debug('yes, ' + executable_path + ' exists') return executable_path @@ -552,7 +561,7 @@ def verify_not_in_use(dev, check_partitions=False): raise Error('Device is mounted', dev) holders = is_held(dev) if holders: - raise Error('Device is in use by a device-mapper mapping (dm-crypt?)' % dev, ','.join(holders)) + raise Error('Device %s is in use by a device-mapper mapping (dm-crypt?)' % dev, ','.join(holders)) if check_partitions and not is_partition(dev): basename = get_dev_name(os.path.realpath(dev)) @@ -620,9 +629,24 @@ def write_one_line(parent, name, text): with file(tmp, 'wb') as tmp_file: tmp_file.write(text + '\n') os.fsync(tmp_file.fileno()) + path_set_context(tmp) os.rename(tmp, path) +def init_get(): + """ + Get an init system using 'ceph-detect-init' + """ + init = _check_output( + args=[ + 'ceph-detect-init', + '--default', 'sysvinit', + ], + ) + init = must_be_one_line(init) + return init + + def check_osd_magic(path): """ Check that this path has the Ceph OSD magic. @@ -686,6 +710,15 @@ def get_osd_id(path): check_osd_id(osd_id) return osd_id +def path_set_context(path): + # restore selinux context to default policy values + if which('restorecon'): + command( + [ + 'restorecon', '-R', + path, + ], + ) def _check_output(args=None, **kwargs): out, ret = command(args, **kwargs) @@ -774,29 +807,49 @@ def get_fsid(cluster): return fsid.lower() +def get_dmcrypt_key_path( + _uuid, + key_dir, + luks + ): + """ + Get path to dmcrypt key file. + + :return: Path to the dmcrypt key file, callers should check for existence. + """ + if luks: + path = os.path.join(key_dir, _uuid + ".luks.key") + else: + path = os.path.join(key_dir, _uuid) + + return path + + def get_or_create_dmcrypt_key( _uuid, key_dir, + key_size, + luks ): """ - Get path to dmcrypt key or create a new key file. + Get path to existing dmcrypt key or create a new key file. :return: Path to the dmcrypt key file. """ - path = os.path.join(key_dir, _uuid) - - # already have it?
+ path = get_dmcrypt_key_path(_uuid, key_dir, luks) if os.path.exists(path): return path # make a new key try: if not os.path.exists(key_dir): - os.makedirs(key_dir) + os.makedirs(key_dir, stat.S_IRUSR|stat.S_IWUSR|stat.S_IXUSR) with file('/dev/urandom', 'rb') as i: - key = i.read(256) - with file(path, 'wb') as key_file: - key_file.write(key) + key = i.read(key_size / 8) + fd = os.open(path, os.O_WRONLY|os.O_CREAT, + stat.S_IRUSR|stat.S_IWUSR) + assert os.write(fd, key) == len(key) + os.close(fd) return path except: raise Error('unable to read or create dm-crypt key', path) @@ -806,6 +859,8 @@ def dmcrypt_map( rawdev, keypath, _uuid, + cryptsetup_parameters, + luks ): """ Maps a device to a dmcrypt device. @@ -813,17 +868,40 @@ def dmcrypt_map( :return: Path to the dmcrypt device. """ dev = '/dev/mapper/' + _uuid - args = [ + luksFormat_args = [ + 'cryptsetup', + '--batch-mode', + '--key-file', + keypath, + 'luksFormat', + rawdev, + ] + cryptsetup_parameters + + luksOpen_args = [ + 'cryptsetup', + '--key-file', + keypath, + 'luksOpen', + rawdev, + _uuid, + ] + + create_args = [ 'cryptsetup', '--key-file', keypath, - '--key-size', '256', 'create', _uuid, rawdev, - ] + ] + cryptsetup_parameters + try: - command_check_call(args) + if luks: + command_check_call(luksFormat_args) + command_check_call(luksOpen_args) + else: + # Plain mode has no format function, nor any validation that the key is correct. + command_check_call(create_args) return dev except subprocess.CalledProcessError as e: @@ -885,6 +963,12 @@ def mount( path, ], ) + command( + [ + 'restorecon', + path, + ], + ) except subprocess.CalledProcessError as e: try: os.rmdir(path) @@ -926,6 +1010,9 @@ def unmount( ########################################### +def extract_parted_partition_numbers(partitions): + numbers_as_strings = re.findall('^\d+', partitions, re.MULTILINE) + return map(int, numbers_as_strings) def get_free_partition_index(dev): """ @@ -945,36 +1032,24 @@ def get_free_partition_index(dev): ], ) except subprocess.CalledProcessError as e: - print 'cannot read partition index; assume it isn\'t present\n (Error: %s)' % e + LOG.info('cannot read partition index; assume it isn\'t present\n (Error: %s)' % e) return 1 if not lines: raise Error('parted failed to output anything') - lines = str(lines).splitlines(True) - - # work around buggy libreadline(?) library in rhel/centos. - idiot_prefix = '\x1b\x5b\x3f\x31\x30\x33\x34\x68' - if lines[0].startswith(idiot_prefix): - lines[0] = lines[0][8:] - - if lines[0] not in ['CHS;\n', 'CYL;\n', 'BYT;\n']: - raise Error('weird parted units', lines[0]) - del lines[0] - - if not lines[0].startswith('/dev/'): - raise Error('weird parted disk entry', lines[0]) - del lines[0] - - seen = set() - for line in lines: - idx, _ = line.split(':', 1) - idx = int(idx) - seen.add(idx) - - num = 1 - while num in seen: - num += 1 - return num + if ('CHS;' not in lines and + 'CYL;' not in lines and + 'BYT;' not in lines): + raise Error('parted output expected to contain one of ' + + 'CHS; CYL; or BYT; : ' + lines) + if dev not in lines: + raise Error('parted output expected to contain ' + dev + ': ' + lines) + _, partitions = lines.split(dev) + partition_numbers = extract_parted_partition_numbers(partitions) + if partition_numbers: + return max(partition_numbers) + 1 + else: + return 1 def update_partition(action, dev, description): @@ -1010,6 +1085,9 @@ def zap(dev): """ Destroy the partition table and content of a given disk.
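The rewritten get_free_partition_index() above no longer fills holes in the partition numbering; it simply takes max(existing) + 1 from parted's machine-readable listing. A standalone sketch of the helper on typical 'parted -m' output (sample values invented):

    import re

    def extract_parted_partition_numbers(partitions):
        # partition lines in 'parted -m' output begin with the index
        numbers_as_strings = re.findall('^\d+', partitions, re.MULTILINE)
        return list(map(int, numbers_as_strings))

    # everything following the '/dev/sdb:...' device line of 'parted -m'
    sample = ('1:1049kB:106MB:105MB:fat32::boot;\n'
              '2:106MB:4001MB:3895MB:xfs::;\n')
    numbers = extract_parted_partition_numbers(sample)
    print(max(numbers) + 1 if numbers else 1)  # next free index: 3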
""" + dmode = os.stat(dev).st_mode + if not stat.S_ISBLK(dmode) or is_partition(dev): + raise Error('not full block device; cannot zap', dev) try: LOG.debug('Zapping partition table on %s', dev) @@ -1025,6 +1103,13 @@ def zap(dev): [ 'sgdisk', '--zap-all', + '--', + dev, + ], + ) + command_check_call( + [ + 'sgdisk', '--clear', '--mbrtogpt', '--', @@ -1044,30 +1129,60 @@ def prepare_journal_dev( journal_size, journal_uuid, journal_dm_keypath, - ): + cryptsetup_parameters, + luks + ): reusing_partition = False if is_partition(journal): + if journal_dm_keypath: + raise Error(journal + ' partition already exists' + ' and --dmcrypt specified') LOG.debug('Journal %s is a partition', journal) LOG.warning('OSD will not be hot-swappable if journal is not the same device as the osd data') if get_partition_type(journal) == JOURNAL_UUID: LOG.debug('Journal %s was previously prepared with ceph-disk. Reusing it.', journal) reusing_partition = True - base = get_partition_base(journal) - part = journal.replace(base,'') - journal = base # needed for later + # Read and reuse the partition uuid from this journal's previous life. + # We reuse the uuid instead of changing it because udev does not reliably + # notice changes to an existing partition's GUID. + # See http://tracker.ceph.com/issues/10146 + journal_uuid = get_partition_uuid(journal) + LOG.debug('Reusing journal with uuid %s', journal_uuid) else: LOG.warning('Journal %s was not prepared with ceph-disk. Symlinking directly.', journal) return (journal, None, None) + journal_symlink = '/dev/disk/by-partuuid/{journal_uuid}'.format( + journal_uuid=journal_uuid, + ) + + journal_dmcrypt = None + if journal_dm_keypath: + journal_dmcrypt = journal_symlink + journal_symlink = '/dev/mapper/{uuid}'.format(uuid=journal_uuid) + + if reusing_partition: + # confirm that the journal_symlink exists. It should since this was an active journal + # in the past. Continuing otherwise would be futile. + assert os.path.exists(journal_symlink) + return (journal_symlink, journal_dmcrypt, journal_uuid) + + # From here on we are creating a new journal device, not reusing. + ptype = JOURNAL_UUID + ptype_tobe = JOURNAL_UUID if journal_dm_keypath: - ptype = DMCRYPT_JOURNAL_UUID + if luks: + ptype = DMCRYPT_LUKS_JOURNAL_UUID + else: + ptype = DMCRYPT_JOURNAL_UUID + ptype_tobe = DMCRYPT_JOURNAL_TOBE_UUID # it is a whole disk. create a partition! 
num = None - if journal == data and not reusing_partition: + if journal == data: # we're sharing the disk between osd data and journal; # make journal be partition number 2, so it's pretty num = 2 @@ -1075,9 +1190,6 @@ def prepare_journal_dev( num=num, size=journal_size, ) - elif reusing_partition: - num = int(part) - journal_part = '' # not used in this case else: # sgdisk has no way for me to say "whatever is the next # free index number" when setting type guids etc, so we @@ -1092,10 +1204,7 @@ def prepare_journal_dev( ) LOG.warning('OSD will not be hot-swappable if journal is not the same device as the osd data') - if reusing_partition: - dev_size = get_dev_size(base+part) - else: - dev_size = get_dev_size(journal) + dev_size = get_dev_size(journal) if journal_size > dev_size: LOG.error('refusing to create journal on %s' % journal) @@ -1105,7 +1214,9 @@ def prepare_journal_dev( ) try: - sgdisk_call = [ + LOG.debug('Creating journal partition num %d size %d on %s', num, journal_size, journal) + command_check_call( + [ 'sgdisk', '--new={part}'.format(part=journal_part), '--change-name={num}:ceph journal'.format(num=num), @@ -1115,19 +1226,13 @@ def prepare_journal_dev( ), '--typecode={num}:{uuid}'.format( num=num, - uuid=ptype, + uuid=ptype_tobe, ), '--mbrtogpt', '--', journal, - ] - if reusing_partition: - action= 'Reusing' - del sgdisk_call[1] # don't add --new when reusing - else: - action = 'Creating' - LOG.debug('%s journal partition num %d size %d on %s', action, num, journal_size, journal) - command_check_call(sgdisk_call) + ] + ) update_partition('-a', journal, 'prepared') @@ -1139,14 +1244,38 @@ def prepare_journal_dev( ], ) - journal_symlink = '/dev/disk/by-partuuid/{journal_uuid}'.format( - journal_uuid=journal_uuid, - ) + LOG.debug('Journal is GPT partition %s', journal_symlink) - journal_dmcrypt = None if journal_dm_keypath: - journal_dmcrypt = journal_symlink - journal_symlink = '/dev/mapper/{uuid}'.format(uuid=journal_uuid) + if luks: + luksFormat_args = [ + 'cryptsetup', + '--batch-mode', + '--key-file', + journal_dm_keypath, + 'luksFormat', + journal_dmcrypt, + ] + cryptsetup_parameters + + try: + command_check_call(luksFormat_args) + except subprocess.CalledProcessError as e: + raise Error('unable to format device for LUKS', journal_symlink, e) + + try: + command_check_call( + [ + 'sgdisk', + '--typecode={num}:{uuid}'.format( + num=num, + uuid=ptype, + ), + '--', + journal, + ], + ) + except subprocess.CalledProcessError as e: + raise Error('unable to mark device as formatted for LUKS', journal_symlink, e) LOG.debug('Journal is GPT partition %s', journal_symlink) return (journal_symlink, journal_dmcrypt, journal_uuid) @@ -1175,6 +1304,8 @@ def prepare_journal( force_file, force_dev, journal_dm_keypath, + cryptsetup_parameters, + luks ): if journal is None: @@ -1196,7 +1327,7 @@ def prepare_journal( if stat.S_ISBLK(jmode): if force_file: raise Error('Journal is not a regular file', journal) - return prepare_journal_dev(data, journal, journal_size, journal_uuid, journal_dm_keypath) + return prepare_journal_dev(data, journal, journal_size, journal_uuid, journal_dm_keypath, cryptsetup_parameters, luks) raise Error('Journal %s is neither a block device nor regular file' % journal) @@ -1277,6 +1408,8 @@ def prepare_dev( journal_uuid, journal_dmcrypt, osd_dm_keypath, + cryptsetup_parameters, + luks ): """ Prepare a data/journal combination to be used for an OSD. 
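Note the ordering this hunk establishes for a fresh dm-crypt journal: the partition is created carrying the in-progress 'tobe' type code, the LUKS header is written, and only then is the final type code set, so the udev rules never try to activate a half-prepared device. Schematically (placeholder device, partition number and GUID; argument lists mirror the hunk above):

    DMCRYPT_JOURNAL_TOBE_UUID = '89c57f98-2fe5-4dc0-89c1-35865ceff2be'
    DMCRYPT_LUKS_JOURNAL_UUID = '45b0969e-9b03-4f30-b4c6-35865ceff106'

    steps = [
        # 1. create the partition, marked as in-progress
        ['sgdisk', '--new=2', '--typecode=2:' + DMCRYPT_JOURNAL_TOBE_UUID,
         '--', '/dev/sdb'],
        # 2. write the LUKS header onto the cyphertext partition
        ['cryptsetup', '--batch-mode',
         '--key-file', '/etc/ceph/dmcrypt-keys/UUID.luks.key',
         'luksFormat', '/dev/disk/by-partuuid/UUID'],
        # 3. flip the type code to its final value
        ['sgdisk', '--typecode=2:' + DMCRYPT_LUKS_JOURNAL_UUID,
         '--', '/dev/sdb'],
    ]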
@@ -1292,7 +1425,10 @@ def prepare_dev( ptype_osd = OSD_UUID if osd_dm_keypath: ptype_tobe = DMCRYPT_TOBE_UUID - ptype_osd = DMCRYPT_OSD_UUID + if luks: + ptype_osd = DMCRYPT_LUKS_OSD_UUID + else: + ptype_osd = DMCRYPT_OSD_UUID rawdev = None if is_partition(data): @@ -1329,7 +1465,7 @@ def prepare_dev( dev = None if osd_dm_keypath: - dev = dmcrypt_map(rawdev, osd_dm_keypath, osd_uuid) + dev = dmcrypt_map(rawdev, osd_dm_keypath, osd_uuid, cryptsetup_parameters, luks) else: dev = rawdev @@ -1371,6 +1507,7 @@ def prepare_dev( journal_dmcrypt=journal_dmcrypt, ) finally: + path_set_context(path) unmount(path) finally: if rawdev != dev: @@ -1389,12 +1526,32 @@ def prepare_dev( except subprocess.CalledProcessError as e: raise Error(e) +def check_journal_reqs(args): + _, allows_journal = command([ + 'ceph-osd', '--check-allows-journal', + '-i', '0', + '--cluster', args.cluster, + ]) + _, wants_journal = command([ + 'ceph-osd', '--check-wants-journal', + '-i', '0', + '--cluster', args.cluster, + ]) + _, needs_journal = command([ + 'ceph-osd', '--check-needs-journal', + '-i', '0', + '--cluster', args.cluster, + ]) + return (not allows_journal, not wants_journal, not needs_journal) def main_prepare(args): journal_dm_keypath = None osd_dm_keypath = None try: + # first learn what the osd allows/wants/needs + (allows_journal, wants_journal, needs_journal) = check_journal_reqs(args) + prepare_lock.acquire() # noqa if not os.path.exists(args.data): if args.data_dev: @@ -1408,16 +1565,16 @@ def main_prepare(args): if stat.S_ISBLK(dmode): verify_not_in_use(args.data, True) + if args.journal and not allows_journal: + raise Error('journal specified but not allowed by osd backend') + if args.journal and os.path.exists(args.journal): jmode = os.stat(args.journal).st_mode if stat.S_ISBLK(jmode): verify_not_in_use(args.journal, False) if args.zap_disk is not None: - if stat.S_ISBLK(dmode) and not is_partition(args.data): - zap(args.data) - else: - raise Error('not full block device; cannot zap', args.data) + zap(args.data) if args.cluster_uuid is None: args.cluster_uuid = get_fsid(cluster=args.cluster) @@ -1473,30 +1630,93 @@ def main_prepare(args): ) journal_size = int(journal_size) + cryptsetup_parameters_str = get_conf( + cluster=args.cluster, + variable='osd_cryptsetup_parameters', + ) + if cryptsetup_parameters_str is None: + cryptsetup_parameters = [] + else: + cryptsetup_parameters = shlex.split(cryptsetup_parameters_str) + + dmcrypt_keysize_str = get_conf( + cluster=args.cluster, + variable='osd_dmcrypt_key_size', + ) + + dmcrypt_type = get_conf( + cluster=args.cluster, + variable='osd_dmcrypt_type', + ) + + if dmcrypt_type is None: + dmcrypt_type = "luks" + + if dmcrypt_type == "plain": + if dmcrypt_keysize_str is None: + # This value is hard-coded in the udev script + dmcrypt_keysize = 256 + else: + dmcrypt_keysize = int(dmcrypt_keysize_str) + LOG.warning('''ensure the 95-ceph-osd.rules file has been copied to /etc/udev/rules.d + and modified to call cryptsetup with --key-size=%s''' + % dmcrypt_keysize_str) + + if len (cryptsetup_parameters) > 0: + LOG.warning('''ensure the 95-ceph-osd.rules file has been copied to /etc/udev/rules.d + and modified to call cryptsetup with %s''' + % cryptsetup_parameters_str) + + cryptsetup_parameters = ['--key-size', str(dmcrypt_keysize)] + cryptsetup_parameters + luks = False + elif dmcrypt_type == "luks": + if dmcrypt_keysize_str is None: + # As LUKS will hash the 'passphrase' in .luks.key into a key, set a large default + # so if not updated for some time, it is still 
a + # reasonable value. + # + # We don't force this into the cryptsetup_parameters, as we want the cryptsetup defaults + # to prevail for the actual LUKS key lengths. + dmcrypt_keysize = 1024 + else: + dmcrypt_keysize = int(dmcrypt_keysize_str) + cryptsetup_parameters = ['--key-size', str(dmcrypt_keysize)] + cryptsetup_parameters + + luks = True + else: + raise Error('invalid osd_dmcrypt_type parameter (must be luks or plain): ', dmcrypt_type) + # colocate journal with data? - if stat.S_ISBLK(dmode) and not is_partition(args.data) and args.journal is None and args.journal_file is None: + if wants_journal and stat.S_ISBLK(dmode) and not is_partition(args.data) and args.journal is None and args.journal_file is None: LOG.info('Will colocate journal with data on %s', args.data) args.journal = args.data - if args.journal_uuid is None: + if args.journal and args.journal_uuid is None: args.journal_uuid = str(uuid.uuid4()) if args.osd_uuid is None: args.osd_uuid = str(uuid.uuid4()) # dm-crypt keys? if args.dmcrypt: - journal_dm_keypath = get_or_create_dmcrypt_key(args.journal_uuid, args.dmcrypt_key_dir) - osd_dm_keypath = get_or_create_dmcrypt_key(args.osd_uuid, args.dmcrypt_key_dir) + if args.journal: + journal_dm_keypath = get_or_create_dmcrypt_key(args.journal_uuid, args.dmcrypt_key_dir, dmcrypt_keysize, luks) + osd_dm_keypath = get_or_create_dmcrypt_key(args.osd_uuid, args.dmcrypt_key_dir, dmcrypt_keysize, luks) # prepare journal - (journal_symlink, journal_dmcrypt, journal_uuid) = prepare_journal( - data=args.data, - journal=args.journal, - journal_size=journal_size, - journal_uuid=args.journal_uuid, - force_file=args.journal_file, - force_dev=args.journal_dev, - journal_dm_keypath=journal_dm_keypath, + journal_symlink = None + journal_dmcrypt = None + journal_uuid = None + if args.journal: + (journal_symlink, journal_dmcrypt, journal_uuid) = prepare_journal( + data=args.data, + journal=args.journal, + journal_size=journal_size, + journal_uuid=args.journal_uuid, + force_file=args.journal_file, + force_dev=args.journal_dev, + journal_dm_keypath=journal_dm_keypath, + cryptsetup_parameters=cryptsetup_parameters, + luks=luks ) # prepare data @@ -1525,6 +1745,8 @@ def main_prepare(args): journal_uuid=journal_uuid, journal_dmcrypt=journal_dmcrypt, osd_dm_keypath=osd_dm_keypath, + cryptsetup_parameters=cryptsetup_parameters, + luks=luks ) else: raise Error('not a dir or block device', args.data) @@ -1535,9 +1757,17 @@ except Error as e: if journal_dm_keypath: - os.unlink(journal_dm_keypath) + try: + os.unlink(journal_dm_keypath) + except OSError as e2: + if e2.errno != errno.ENOENT: # errno.ENOENT = no such file or directory + raise # re-raise exception if a different error occurred if osd_dm_keypath: - os.unlink(osd_dm_keypath) + try: + os.unlink(osd_dm_keypath) + except OSError as e2: + if e2.errno != errno.ENOENT: # errno.ENOENT = no such file or directory + raise # re-raise exception if a different error occurred prepare_lock.release() # noqa raise e @@ -1678,9 +1908,8 @@ def start_daemon( path = (STATEDIR + '/osd/{cluster}-{osd_id}').format( cluster=cluster, osd_id=osd_id) - # upstart? try: - if os.path.exists(os.path.join(path,'upstart')): + if os.path.exists(os.path.join(path, 'upstart')): command_check_call( [ '/sbin/initctl', @@ -1758,8 +1987,21 @@ def mount_activate( dev, activate_key_template, init, + dmcrypt, + dmcrypt_key_dir, ): + if dmcrypt: + # dev corresponds to a dmcrypt cyphertext device - map it before + # proceeding.
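Earlier in this hunk the new dm-crypt behavior is driven entirely from ceph.conf: osd_dmcrypt_type selects plain or LUKS (LUKS is the default), osd_dmcrypt_key_size falls back to 256 bits for plain and a deliberately generous 1024-bit /dev/urandom passphrase for LUKS, and osd_cryptsetup_parameters is tokenized with shlex so quoting survives. For instance (illustrative option values only, not a recommendation):

    import shlex

    # e.g. in ceph.conf:
    #   osd_cryptsetup_parameters = --cipher aes-xts-plain64 --hash sha256
    cryptsetup_parameters_str = '--cipher aes-xts-plain64 --hash sha256'
    print(shlex.split(cryptsetup_parameters_str))
    # ['--cipher', 'aes-xts-plain64', '--hash', 'sha256']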
+ rawdev = dev + ptype = get_partition_type(rawdev) + if ptype not in [DMCRYPT_OSD_UUID]: + raise Error('activate --dmcrypt called for invalid dev %s' % (dev)) + part_uuid = get_partition_uuid(rawdev) + dmcrypt_key_path = os.path.join(dmcrypt_key_dir, part_uuid) + dev = dmcrypt_map(rawdev, dmcrypt_key_path, part_uuid) + try: fstype = detect_fstype(dev=dev) except (subprocess.CalledProcessError, @@ -1971,11 +2213,7 @@ def activate( if conf_val is not None: init = conf_val else: - (distro, release, codename) = platform.dist() - if distro == 'Ubuntu': - init = 'upstart' - else: - init = 'sysvinit' + init = init_get() LOG.debug('Marking with init system %s', init) with file(os.path.join(path, init), 'w'): @@ -2021,6 +2259,8 @@ def main_activate(args): dev=args.path, activate_key_template=args.activate_key_template, init=args.mark_init, + dmcrypt=args.dmcrypt, + dmcrypt_key_dir=args.dmcrypt_key_dir, ) osd_data = get_mount_point(cluster, osd_id) @@ -2035,7 +2275,7 @@ else: raise Error('%s is not a directory or block device' % args.path) - if args.mark_init == 'none': + if (not args.no_start_daemon and args.mark_init == 'none'): command_check_call( [ 'ceph-osd', @@ -2046,7 +2286,8 @@ ], ) - if args.mark_init not in (None, 'none' ): + if (not args.no_start_daemon and + args.mark_init not in (None, 'none' )): start_daemon( cluster=cluster, @@ -2095,15 +2336,38 @@ def main_activate_journal(args): cluster = None osd_id = None osd_uuid = None + dev = None activate_lock.acquire() # noqa try: - osd_uuid = get_journal_osd_uuid(args.dev) + if args.dmcrypt: + # journal dev corresponds to a dmcrypt cyphertext device - map + # it before proceeding. + rawdev = args.dev + ptype = get_partition_type(rawdev) + if ptype not in [DMCRYPT_JOURNAL_UUID]: + raise Error('activate-journal --dmcrypt called for invalid dev %s' % (rawdev)) + part_uuid = get_partition_uuid(rawdev) + dmcrypt_key_path = os.path.join(args.dmcrypt_key_dir, part_uuid) + dev = dmcrypt_map(rawdev, dmcrypt_key_path, part_uuid) + else: + dev = args.dev + + # FIXME: For an encrypted journal dev, does this return the cyphertext + # or plaintext dev uuid!? Also, if the journal is encrypted, is the data + # partition also always encrypted, or are mixed pairs supported!?
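Activation works from on-disk metadata alone: the partition GUID doubles as the /dev/mapper name and as the key-file name under --dmcrypt-key-dir. The key naming convention comes from get_dmcrypt_key_path() earlier in the patch; a sketch:

    import os

    def dmcrypt_key_path(uuid, key_dir='/etc/ceph/dmcrypt-keys', luks=True):
        # mirrors get_dmcrypt_key_path(): LUKS passphrases carry a
        # '.luks.key' suffix, plain-mode keys are named by the bare GUID
        if luks:
            return os.path.join(key_dir, uuid + '.luks.key')
        return os.path.join(key_dir, uuid)

    print(dmcrypt_key_path('4fbd7e29-9d25-41b8-afd0-5ec00ceff05d', luks=False))
    # /etc/ceph/dmcrypt-keys/4fbd7e29-9d25-41b8-afd0-5ec00ceff05d

Consistently with that, the activate paths above join the key directory and the bare GUID and accept only the plain-mode partition types (DMCRYPT_OSD_UUID, DMCRYPT_JOURNAL_UUID); LUKS devices are mapped by the udev rules instead.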
+ osd_uuid = get_journal_osd_uuid(dev) path = os.path.join('/dev/disk/by-partuuid/', osd_uuid.lower()) + if is_suppressed(path): + LOG.info('suppressed activate request on %s', path) + return + (cluster, osd_id) = mount_activate( dev=path, activate_key_template=args.activate_key_template, init=args.mark_init, + dmcrypt=args.dmcrypt, + dmcrypt_key_dir=args.dmcrypt_key_dir, ) start_daemon( @@ -2129,20 +2393,27 @@ def main_activate_all(args): continue (tag, uuid) = name.split('.') - if tag == OSD_UUID or tag == DMCRYPT_OSD_UUID: + if tag == OSD_UUID or tag == DMCRYPT_OSD_UUID or tag == DMCRYPT_LUKS_OSD_UUID: - if tag == DMCRYPT_OSD_UUID: + if tag == DMCRYPT_OSD_UUID or tag == DMCRYPT_LUKS_OSD_UUID: path = os.path.join('/dev/mapper', uuid) else: path = os.path.join(dir, name) + if is_suppressed(path): + LOG.info('suppressed activate request on %s', path) + continue + LOG.info('Activating %s', path) activate_lock.acquire() # noqa try: + # never map dmcrypt cyphertext devices (cluster, osd_id) = mount_activate( dev=path, activate_key_template=args.activate_key_template, init=args.mark_init, + dmcrypt=False, + dmcrypt_key_dir='', ) start_daemon( cluster=cluster, @@ -2203,6 +2474,13 @@ def get_dev_fs(dev): return None +def split_dev_base_partnum(dev): + if 'loop' in dev or 'cciss' in dev or 'nvme' in dev: + return re.match('(.*\d+)p(\d+)', dev).group(1, 2) + else: + return re.match('(\D+)(\d+)', dev).group(1, 2) + + def get_partition_type(part): """ Get the GPT partition type UUID. If we have an old blkid and can't @@ -2252,7 +2530,7 @@ def get_partition_type(part): if 'blkid' not in warned_about: LOG.warning('Old blkid does not support ID_PART_ENTRY_* fields, trying sgdisk; may not correctly identify ceph volumes with dmcrypt') warned_about['blkid'] = True - (base, partnum) = re.match('(\D+)(\d+)', part).group(1, 2) + (base, partnum) = split_dev_base_partnum(part) sgdisk, _ = command( [ 'sgdisk', @@ -2278,7 +2556,7 @@ def get_partition_type(part): def get_partition_uuid(dev): - (base, partnum) = re.match('(\D+)(\d+)', dev).group(1, 2) + (base, partnum) = split_dev_base_partnum(dev) out, _ = command(['sgdisk', '-i', partnum, base]) for line in out.splitlines(): m = re.match('Partition unique GUID: (\S+)', line) @@ -2348,13 +2626,23 @@ def list_dev(dev, uuid_map, journal_map): elif ptype == DMCRYPT_OSD_UUID: holders = is_held(dev) if not holders: - desc = ['ceph data (dmcrypt)', 'not currently mapped'] + desc = ['ceph data (dmcrypt plain)', 'not currently mapped'] + elif len(holders) == 1: + holder = '/dev/' + holders[0] + fs_desc = list_dev_osd(holder, uuid_map) + desc = ['ceph data (dmcrypt plain %s)' % holder] + fs_desc + else: + desc = ['ceph data (dmcrypt plain)', 'holders: ' + ','.join(holders)] + elif ptype == DMCRYPT_LUKS_OSD_UUID: + holders = is_held(dev) + if not holders: + desc = ['ceph data (dmcrypt LUKS)', 'not currently mapped'] elif len(holders) == 1: holder = '/dev/' + holders[0] fs_desc = list_dev_osd(holder, uuid_map) - desc = ['ceph data (dmcrypt %s)' % holder] + fs_desc + desc = ['ceph data (dmcrypt LUKS %s)' % holder] + fs_desc else: - desc = ['ceph data (dmcrypt)', 'holders: ' + ','.join(holders)] + desc = ['ceph data (dmcrypt LUKS)', 'holders: ' + ','.join(holders)] elif ptype == JOURNAL_UUID: desc.append('ceph journal') part_uuid = get_partition_uuid(dev) @@ -2363,9 +2651,18 @@ def list_dev(dev, uuid_map, journal_map): elif ptype == DMCRYPT_JOURNAL_UUID: holders = is_held(dev) if len(holders) == 1: - desc = ['ceph journal (dmcrypt /dev/%s)' % holders[0]] + desc = ['ceph 
journal (dmcrypt plain /dev/%s)' % holders[0]] + else: + desc = ['ceph journal (dmcrypt plain)'] + part_uuid = get_partition_uuid(dev) + if part_uuid and part_uuid in journal_map: + desc.append('for %s' % journal_map[part_uuid]) + elif ptype == DMCRYPT_LUKS_JOURNAL_UUID: + holders = is_held(dev) + if len(holders) == 1: + desc = ['ceph journal (dmcrypt LUKS /dev/%s)' % holders[0]] else: - desc = ['ceph journal (dmcrypt)'] + desc = ['ceph journal (dmcrypt LUKS)'] part_uuid = get_partition_uuid(dev) if part_uuid and part_uuid in journal_map: desc.append('for %s' % journal_map[part_uuid]) @@ -2411,7 +2708,7 @@ def main_list(args): unmount(tpath) except MountError: pass - if ptype == DMCRYPT_OSD_UUID: + if ptype == DMCRYPT_OSD_UUID or ptype == DMCRYPT_LUKS_OSD_UUID: holders = is_held(dev) if len(holders) == 1: holder = '/dev/' + holders[0] @@ -2452,7 +2749,7 @@ def main_list(args): def is_suppressed(path): disk = os.path.realpath(path) try: - if not disk.startswith('/dev/') or not stat.S_ISBLK(os.lstat(path).st_mode): + if not disk.startswith('/dev/') or not stat.S_ISBLK(os.lstat(disk).st_mode): return False base = get_dev_name(disk) while len(base): @@ -2547,6 +2844,11 @@ def parse_args(): action='store_true', default=None, help='be more verbose', ) + parser.add_argument( + '--log-stdout', + action='store_true', default=None, + help='log to stdout', + ) parser.add_argument( '--prepend-to-path', metavar='PATH', @@ -2568,7 +2870,6 @@ def parse_args(): parser.set_defaults( # we want to hold on to this, for later prog=parser.prog, - cluster='ceph', ) subparsers = parser.add_subparsers( @@ -2581,6 +2882,7 @@ def parse_args(): prepare_parser.add_argument( '--cluster', metavar='NAME', + default='ceph', help='cluster name to assign this disk to', ) prepare_parser.add_argument( @@ -2673,12 +2975,28 @@ def parse_args(): default='auto', choices=INIT_SYSTEMS, ) + activate_parser.add_argument( + '--no-start-daemon', + action='store_true', default=None, + help='do not start the daemon', + ) activate_parser.add_argument( 'path', metavar='PATH', nargs='?', help='path to block device or directory', ) + activate_parser.add_argument( + '--dmcrypt', + action='store_true', default=None, + help='map DATA and/or JOURNAL devices with dm-crypt', + ) + activate_parser.add_argument( + '--dmcrypt-key-dir', + metavar='KEYDIR', + default='/etc/ceph/dmcrypt-keys', + help='directory where dm-crypt keys are stored', + ) activate_parser.set_defaults( activate_key_template='{statedir}/bootstrap-osd/{cluster}.keyring', func=main_activate, @@ -2703,6 +3021,17 @@ def parse_args(): default='auto', choices=INIT_SYSTEMS, ) + activate_journal_parser.add_argument( + '--dmcrypt', + action='store_true', default=None, + help='map DATA and/or JOURNAL devices with dm-crypt', + ) + activate_journal_parser.add_argument( + '--dmcrypt-key-dir', + metavar='KEYDIR', + default='/etc/ceph/dmcrypt-keys', + help='directory where dm-crypt keys are stored', + ) activate_journal_parser.set_defaults( activate_key_template='{statedir}/bootstrap-osd/{cluster}.keyring', func=main_activate_journal, @@ -2776,9 +3105,14 @@ def main(): if args.verbose: loglevel = logging.DEBUG - logging.basicConfig( - level=loglevel, - ) + if args.log_stdout: + ch = logging.StreamHandler(stream=sys.stdout) + ch.setLevel(loglevel) + LOG.addHandler(ch) + else: + logging.basicConfig( + level=loglevel, + ) if args.prepend_to_path != '': path = os.environ.get('PATH', os.defpath) diff --git a/src/ceph-disk-activate b/src/ceph-disk-activate deleted file mode 100755 index 
72e89f9af30be..0000000000000 --- a/src/ceph-disk-activate +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/sh -dir=`dirname $0` -$dir/ceph-disk activate $* diff --git a/src/ceph-disk-prepare b/src/ceph-disk-prepare deleted file mode 100755 index f9255eb883168..0000000000000 --- a/src/ceph-disk-prepare +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/sh -dir=`dirname $0` -$dir/ceph-disk prepare $* diff --git a/src/ceph-disk-udev b/src/ceph-disk-udev index bdf524e6aea2f..8846d26030e47 100755 --- a/src/ceph-disk-udev +++ b/src/ceph-disk-udev @@ -38,6 +38,12 @@ case $ID_PART_ENTRY_TYPE in /sbin/cryptsetup --key-file /etc/ceph/dmcrypt-keys/${ID_PART_ENTRY_UUID} --key-size 256 create ${ID_PART_ENTRY_UUID} /dev/${NAME} ;; +45b0969e-9b03-4f30-b4c6-35865ceff106) + # DMCRYPT_LUKS_JOURNAL_UUID + # Map journal if using dm-crypt + /sbin/cryptsetup --key-file /etc/ceph/dmcrypt-keys/${ID_PART_ENTRY_UUID} luksOpen /dev/${NAME} ${ID_PART_ENTRY_UUID} + ;; + 4fbd7e29-9d25-41b8-afd0-062c0ceff05d) # OSD_UUID # activate ceph-tagged partitions. @@ -50,7 +56,16 @@ case $ID_PART_ENTRY_TYPE in # for dm-crypted data devices /sbin/cryptsetup --key-file /etc/ceph/dmcrypt-keys/${ID_PART_ENTRY_UUID} --key-size 256 create ${ID_PART_ENTRY_UUID} /dev/${NAME} bash -c 'while [ ! -e /dev/mapper/${ID_PART_ENTRY_UUID} ];do sleep 1; done' - /usr/sbin/ceph-disk-activate /dev/mapper/${ID_PART_ENTRY_UUID} + /usr/sbin/ceph-disk activate /dev/mapper/${ID_PART_ENTRY_UUID} + ;; + +4fbd7e29-9d25-41b8-afd0-35865ceff05d) + # DMCRYPT_LUKS_OSD_UUID + # Map data device and activate ceph-tagged partitions + # for dm-crypted data devices + /sbin/cryptsetup --key-file /etc/ceph/dmcrypt-keys/${ID_PART_ENTRY_UUID} luksOpen /dev/${NAME} ${ID_PART_ENTRY_UUID} + bash -c 'while [ ! -e /dev/mapper/${ID_PART_ENTRY_UUID} ];do sleep 1; done' + /usr/sbin/ceph-disk activate /dev/mapper/${ID_PART_ENTRY_UUID} ;; 89c57f98-2fe5-4dc0-89c1-f3ad0ceff2be) diff --git a/src/ceph-osd-prestart.sh b/src/ceph-osd-prestart.sh index 77153c9b36917..79f2c132d9fc9 100644 --- a/src/ceph-osd-prestart.sh +++ b/src/ceph-osd-prestart.sh @@ -27,7 +27,7 @@ if [ "${update:-1}" = "1" -o "${update:-1}" = "true" ]; then fi location="$($hook --cluster ${cluster:-ceph} --id $id --type osd)" weight="$(ceph-conf --cluster=${cluster:-ceph} --name=osd.$id --lookup osd_crush_initial_weight || :)" - defaultweight=`df -P -k /var/lib/ceph/osd/${cluster:-ceph}-$id/ | tail -1 | awk '{ d= $2/1073741824 ; r = sprintf("%.2f", d); print r }'` + defaultweight=`df -P -k /var/lib/ceph/osd/${cluster:-ceph}-$id/ | tail -1 | awk '{ d= $2/1073741824 ; r = sprintf("%.4f", d); print r }'` ceph \ --cluster="${cluster:-ceph}" \ --name="osd.$id" \ diff --git a/src/ceph-post-file.in b/src/ceph-post-file.in index 9b922a6c94153..b278e8abd95c7 100755 --- a/src/ceph-post-file.in +++ b/src/ceph-post-file.in @@ -1,16 +1,16 @@ #!/bin/bash -e -# if we start up as ./$0, assume we are running from a source -# checkout. -if [ `dirname $0` = "." ] && [ $PWD != "/usr/bin" ]; then - known_hosts=../share/known_hosts_drop.ceph.com - ssh_key=../share/id_dsa_drop.ceph.com -else - known_hosts=@datadir@/known_hosts_drop.ceph.com - ssh_key=@datadir@/id_dsa_drop.ceph.com +# If these files exist, assume we are a source install. 
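The prestart change above is purely a precision bump: df -P -k reports size in KiB, and dividing by 1073741824 (1024^3) converts KiB to TiB for the default CRUSH weight. With %.2f, small disks rounded down to a weight of 0.00:

    kib = 5242880                # a 5 GiB filesystem, as df -P -k reports it
    tib = kib / 1073741824.0     # KiB -> TiB
    print('%.2f' % tib)          # 0.00   (old format: the weight is lost)
    print('%.4f' % tib)          # 0.0049 (new format keeps a usable value)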
+if [[ -f ../share/known_hosts_drop.ceph.com && -f ../share/id_dsa_drop.ceph.com ]] + then # running from source install + known_hosts=../share/known_hosts_drop.ceph.com + ssh_key=../share/id_dsa_drop.ceph.com + else # running from a pkg install + known_hosts=@datadir@/known_hosts_drop.ceph.com + ssh_key=@datadir@/id_dsa_drop.ceph.com fi -usage() { +function usage() { echo "Usage: $0 [options] file1 [dir2 ...] Easily upload files or directories to ceph.com for analysis by Ceph @@ -155,7 +155,7 @@ done cp "$ssh_key" "$t4" cp "${ssh_key}.pub" "$t4.pub" -sftp -i $t4 \ +sftp -o "IdentityFile=$t4" \ -C \ -oCheckHostIP=no \ -oGlobalKnownHostsFile=$known_hosts \ diff --git a/src/ceph-rbdnamer b/src/ceph-rbdnamer index efb68043cad0f..846f321e475b4 100755 --- a/src/ceph-rbdnamer +++ b/src/ceph-rbdnamer @@ -1,7 +1,7 @@ #!/bin/sh DEV=$1 -NUM=`echo $DEV | sed 's#p.*##g' | tr -d 'a-z'` +NUM=`echo $DEV | sed 's#p.*##g; s#[a-z]##g'` POOL=`cat /sys/devices/rbd/$NUM/pool` IMAGE=`cat /sys/devices/rbd/$NUM/name` SNAP=`cat /sys/devices/rbd/$NUM/current_snap` diff --git a/src/ceph.in b/src/ceph.in index 654148fba391f..f55f8c7df17b5 100755 --- a/src/ceph.in +++ b/src/ceph.in @@ -42,15 +42,18 @@ if MYDIR.endswith('src') and \ py_binary = os.environ.get("PYTHON", "python") MYLIBPATH = os.path.join(MYDIR, '.libs') + execv_cmd = ['python'] + if 'CEPH_DBG' in os.environ: + execv_cmd += ['-mpdb'] if lib_path_var in os.environ: if MYLIBPATH not in os.environ[lib_path_var]: os.environ[lib_path_var] += ':' + MYLIBPATH print >> sys.stderr, DEVMODEMSG - os.execvp(py_binary, ['python'] + sys.argv) + os.execvp(py_binary, execv_cmd + sys.argv) else: os.environ[lib_path_var] = MYLIBPATH print >> sys.stderr, DEVMODEMSG - os.execvp(py_binary, ['python'] + sys.argv) + os.execvp(py_binary, execv_cmd + sys.argv) sys.path.insert(0, os.path.join(MYDIR, 'pybind')) if os.environ.has_key('PATH') and MYDIR not in os.environ['PATH']: os.environ['PATH'] += ':' + MYDIR @@ -59,10 +62,9 @@ import argparse import errno import json import rados +import shlex import signal -import socket import string -import struct import subprocess from ceph_argparse import \ @@ -70,6 +72,8 @@ from ceph_argparse import \ matchnum, validate_command, find_cmd_target, \ send_command, json_command +from ceph_daemon import DaemonWatcher, admin_socket + # just a couple of globals verbose = False @@ -205,7 +209,7 @@ def do_extended_help(parser, args): def help_for_target(target, partial=None): ret, outbuf, outs = json_command(cluster_handle, target=target, - prefix='get_command_descriptions', + prefix='get_command_descriptions', timeout=10) if ret: print >> sys.stderr, \ @@ -300,60 +304,6 @@ def format_help(cmddict, partial=None): return fullusage -def admin_socket(asok_path, cmd, format=''): - """ - Send a daemon (--admin-daemon) command 'cmd'. asok_path is the - path to the admin socket; cmd is a list of strings; format may be - set to one of the formatted forms to get output in that form - (daemon commands don't support 'plain' output). 
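The admin_socket() helper removed here moves into the new ceph_daemon module imported above; the wire protocol it speaks is tiny. A minimal sketch of one raw exchange (Python 2 string semantics, as in the surrounding code; the socket path is an example):

    import json
    import socket
    import struct

    def admin_socket_raw(asok_path, cmd_dict):
        # send the JSON command NUL-terminated, then read a 4-byte
        # big-endian length followed by that many bytes of reply
        sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
        sock.connect(asok_path)
        sock.sendall(json.dumps(cmd_dict) + '\0')
        length, = struct.unpack('>I', sock.recv(4))
        ret = ''
        while len(ret) < length:
            ret += sock.recv(length - len(ret))
        return ret

    # admin_socket_raw('/var/run/ceph/ceph-osd.0.asok',
    #                  {'prefix': 'get_command_descriptions'})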
- """ - - def do_sockio(path, cmd): - """ helper: do all the actual low-level stream I/O """ - sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) - sock.connect(path) - try: - sock.sendall(cmd + '\0') - len_str = sock.recv(4) - if len(len_str) < 4: - raise RuntimeError("no data returned from admin socket") - l, = struct.unpack(">I", len_str) - ret = '' - - got = 0 - while got < l: - bit = sock.recv(l - got) - ret += bit - got += len(bit) - - except Exception as e: - raise RuntimeError('exception: ' + str(e)) - return ret - - try: - cmd_json = do_sockio(asok_path, - json.dumps({"prefix":"get_command_descriptions"})) - except Exception as e: - raise RuntimeError('exception getting command descriptions: ' + str(e)) - - if cmd == 'get_command_descriptions': - return cmd_json - - sigdict = parse_json_funcsigs(cmd_json, 'cli') - valid_dict = validate_command(sigdict, cmd) - if not valid_dict: - raise RuntimeError('invalid command') - - if format: - valid_dict['format'] = format - - try: - ret = do_sockio(asok_path, json.dumps(valid_dict)) - except Exception as e: - raise RuntimeError('exception: ' + str(e)) - - return ret - def ceph_conf(parsed_args, field, name): args=['ceph-conf'] @@ -381,6 +331,25 @@ def ceph_conf(parsed_args, field, name): PROMPT = 'ceph> ' +if sys.stdin.isatty(): + def read_input(): + while True: + line = raw_input(PROMPT).rstrip() + if line in ['q', 'quit', 'Q', 'exit']: + return None + if line: + return line +else: + def read_input(): + while True: + line = sys.stdin.readline() + if not line: + return None + line = line.rstrip() + if line: + return line + + def new_style_command(parsed_args, cmdargs, target, sigdict, inbuf, verbose): """ Do new-style command dance. @@ -408,21 +377,22 @@ def new_style_command(parsed_args, cmdargs, target, sigdict, inbuf, verbose): else: return -errno.EINVAL, '', 'invalid command' else: - # do the command-interpreter looping - # for raw_input to do readline cmd editing - import readline - if sys.stdin.isatty(): - prompt = PROMPT - else: - prompt = '' + # do the command-interpreter looping + # for raw_input to do readline cmd editing + import readline # noqa while True: - interactive_input = raw_input(prompt) - if interactive_input in ['q', 'quit', 'Q']: + interactive_input = read_input() + if interactive_input is None: return 0, '', '' - cmdargs = parse_cmdargs(interactive_input.split())[2] - target = find_cmd_target(cmdargs) + cmdargs = parse_cmdargs(shlex.split(interactive_input))[2] + try: + target = find_cmd_target(cmdargs) + except Exception as e: + print >> sys.stderr, \ + 'error handling command target: {0}'.format(e) + return 1, '', '' valid_dict = validate_command(sigdict, cmdargs, verbose) if valid_dict: if parsed_args.output_format: @@ -435,7 +405,7 @@ def new_style_command(parsed_args, cmdargs, target, sigdict, inbuf, verbose): if ret: ret = abs(ret) print >> sys.stderr, \ - 'Error: {0} {1}'.format(ret, errno.errorcode[ret]) + 'Error: {0} {1}'.format(ret, errno.errorcode.get(ret, 'Unknown')) if outbuf: print outbuf if outs: @@ -451,7 +421,7 @@ def new_style_command(parsed_args, cmdargs, target, sigdict, inbuf, verbose): def complete(sigdict, args, target): """ Command completion. Match as much of [args] as possible, - and print every possible match separated by newlines. + and print every possible match separated by newlines. Return exitcode. """ # XXX this looks a lot like the front of validate_command(). Refactor? 
@@ -510,14 +480,20 @@ def complete(sigdict, args, target): ### # ping a monitor ### -def ping_monitor(cluster_handle, name): +def ping_monitor(cluster_handle, name, timeout): if 'mon.' not in name: print >> sys.stderr, '"ping" expects a monitor to ping; try "ping mon."' return 1 mon_id = name[len('mon.'):] - s = cluster_handle.ping_monitor(mon_id) - print s + if (mon_id == '*') : + cluster_handle.connect(timeout=timeout) + for m in monids() : + s = cluster_handle.ping_monitor(m) + print "mon.{0}".format(m) + '\n' + s + else : + s = cluster_handle.ping_monitor(mon_id) + print s return 0 ### @@ -535,7 +511,7 @@ def main(): parser, parsed_args, childargs = parse_cmdargs() if parsed_args.version: - print 'ceph version {0} ({1})'.format(CEPH_GIT_NICE_VER, CEPH_GIT_VER) + print 'ceph version {0} ({1})'.format(CEPH_GIT_NICE_VER, CEPH_GIT_VER) # noqa return 0 global verbose @@ -565,12 +541,16 @@ def main(): format = parsed_args.output_format + daemon_perf = False sockpath = None if parsed_args.admin_socket: sockpath = parsed_args.admin_socket - elif len(childargs) > 0 and childargs[0] == "daemon": + elif len(childargs) > 0 and childargs[0] in ["daemon", "daemonperf"]: + daemon_perf = (childargs[0] == "daemonperf") # Treat "daemon " or "daemon " like --admin_daemon - if len(childargs) > 2: + # Handle "daemonperf " the same but requires no trailing args + require_args = 2 if daemon_perf else 3 + if len(childargs) >= require_args: if childargs[1].find('/') >= 0: sockpath = childargs[1] else: @@ -585,10 +565,29 @@ def main(): # for both: childargs = childargs[2:] else: - print >> sys.stderr, 'daemon requires at least 3 arguments' + print >> sys.stderr, '{0} requires at least {1} arguments'.format( + childargs[0], require_args) return errno.EINVAL - if sockpath: + if sockpath and daemon_perf: + interval = 1 + count = None + if len(childargs) > 0: + try: + interval = float(childargs[0]) + if interval < 0: + raise ValueError + except ValueError: + print >> sys.stderr, 'daemonperf: interval should be a positive number' + return errno.EINVAL + if len(childargs) > 1: + if not childargs[1].isdigit(): + print >> sys.stderr, 'daemonperf: count should be a positive integer' + return errno.EINVAL + count = int(childargs[1]) + DaemonWatcher(sockpath).run(interval, count) + return 0 + elif sockpath: try: print admin_socket(sockpath, childargs, format) except Exception as e: @@ -655,9 +654,12 @@ def main(): if len(childargs) >= 2 and \ childargs[0] in ['mon', 'osd'] and \ childargs[1] == 'tell': - print >> sys.stderr, '"{0} tell" is deprecated; try "tell {0}." instead (id can be "*") '.format(childargs[0]) + print >> sys.stderr, '"{0} tell" is deprecated; try "tell {0}. 
[options...]" instead (id can be "*") '.format(childargs[0]) return 1 + if childargs in [['mon'], ['osd']]: + parsed_args.help = True + if parsed_args.help: # short default timeout for -h if not timeout: @@ -673,7 +675,7 @@ def main(): try: if childargs and childargs[0] == 'ping': - return ping_monitor(cluster_handle, childargs[1]) + return ping_monitor(cluster_handle, childargs[1], timeout) cluster_handle.connect(timeout=timeout) except KeyboardInterrupt: print >> sys.stderr, 'Cluster connection aborted' @@ -717,7 +719,7 @@ def main(): # this instance keeps the watch connection alive, but is # otherwise unused - logwatch = rados.MonitorLog(cluster_handle, level, watch_cb, 0) + rados.MonitorLog(cluster_handle, level, watch_cb, 0) # loop forever letting watch_cb print lines try: @@ -750,7 +752,12 @@ def main(): if parsed_args.status: childargs.insert(0, 'status') - target = find_cmd_target(childargs) + try: + target = find_cmd_target(childargs) + except Exception as e: + print >> sys.stderr, \ + 'error handling command target: {0}'.format(e) + return 1 # Repulsive hack to handle tell: lop off 'tell' and target # and validate the rest of the command. 'target' is already @@ -765,7 +772,9 @@ def main(): childargs = injectargs if not len(childargs): print >> sys.stderr, \ - 'Cannot use \'tell\' with interactive mode' + 'Cannot use \'tell\' with interactive mode.', \ + 'For an interactive shell,', \ + 'please start "{0}" without non-option arguments.'.format(sys.argv[0]) return errno.EINVAL # fetch JSON sigs from command @@ -829,11 +838,11 @@ def main(): sigdict, inbuf, verbose) if ret < 0: ret = -ret - print >> sys.stderr, prefix + 'Second attempt of previously successful command failed with {0}: {1}'.format(errno.errorcode[ret], outs) + print >> sys.stderr, prefix + 'Second attempt of previously successful command failed with {0}: {1}'.format(errno.errorcode.get(ret, 'Unknown'), outs) if ret < 0: ret = -ret - print >> sys.stderr, prefix + 'Error {0}: {1}'.format(errno.errorcode[ret], outs) + print >> sys.stderr, prefix + 'Error {0}: {1}'.format(errno.errorcode.get(ret, 'Unknown'), outs) if len(targets) > 1: final_ret = ret else: @@ -877,4 +886,8 @@ def main(): return 0 if __name__ == '__main__': - sys.exit(main()) + retval = main() + # shutdown explicitly; Rados() does not + if cluster_handle: + cluster_handle.shutdown() + sys.exit(retval) diff --git a/src/ceph.in.cmake b/src/ceph.in.cmake new file mode 100755 index 0000000000000..805393e75a8ca --- /dev/null +++ b/src/ceph.in.cmake @@ -0,0 +1,827 @@ +#!@PYTHON_EXECUTABLE@ +# -*- mode:python -*- +# vim: ts=4 sw=4 smarttab expandtab +# +# Processed in Makefile to add python #! line and version variable +# +# + + +""" +ceph.in becomes ceph, the command-line management tool for Ceph clusters. +This is a replacement for tools/ceph.cc and tools/common.cc. + +Copyright (C) 2013 Inktank Storage, Inc. + +This is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public +License version 2, as published by the Free Software +Foundation. See file COPYING. 
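Pausing the new ceph.in.cmake listing for a note on the daemonperf mode added to ceph.in above: it takes an optional poll interval and sample count before handing control to DaemonWatcher. The argument validation reduces to this sketch, mirroring the checks above (error handling simplified to sys.exit):

    import sys

    def parse_daemonperf_args(args):
        interval, count = 1.0, None     # default: poll every second, forever
        if len(args) > 0:
            try:
                interval = float(args[0])
                if interval < 0:
                    raise ValueError
            except ValueError:
                sys.exit('daemonperf: interval should be a positive number')
        if len(args) > 1:
            if not args[1].isdigit():
                sys.exit('daemonperf: count should be a positive integer')
            count = int(args[1])
        return interval, count

    # e.g. "ceph daemonperf osd.0 2 10" yields (2.0, 10)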
+""" + +import os +import sys + +# Make life easier on developers: +# If in src/, and .libs and pybind exist here, assume we're running +# from a Ceph source dir and tweak PYTHONPATH and LD_LIBRARY_PATH +# to use local files + +MYPATH = os.path.abspath(__file__) +MYDIR = os.path.dirname(MYPATH) +DEVMODEMSG = '*** DEVELOPER MODE: setting PATH, PYTHONPATH and LD_LIBRARY_PATH ***' + +if MYDIR.endswith('src') and \ + os.path.exists(os.path.join(MYDIR, '.libs')) and \ + os.path.exists(os.path.join(MYDIR, 'pybind')): + MYLIBPATH = os.path.join(MYDIR, '.libs') + if 'LD_LIBRARY_PATH' in os.environ: + if MYLIBPATH not in os.environ['LD_LIBRARY_PATH']: + os.environ['LD_LIBRARY_PATH'] += ':' + MYLIBPATH + print >> sys.stderr, DEVMODEMSG + os.execvp('python', ['python'] + sys.argv) + else: + os.environ['LD_LIBRARY_PATH'] = MYLIBPATH + print >> sys.stderr, DEVMODEMSG + os.execvp('python', ['python'] + sys.argv) + sys.path.insert(0, os.path.join(MYDIR, 'pybind')) + if MYDIR not in os.environ['PATH']: + os.environ['PATH'] += ':' + MYDIR + +import argparse +import errno +import json +import rados +import signal +import socket +import string +import struct +import subprocess + +from ceph_argparse import \ + concise_sig, descsort, parse_json_funcsigs, \ + matchnum, validate_command, find_cmd_target, \ + send_command, json_command + +# just a couple of globals + +verbose = False +cluster_handle = None + +############################################################################ + +def osdids(): + ret, outbuf, outs = json_command(cluster_handle, prefix='osd ls') + if ret == -errno.EINVAL: + # try old mon + ret, outbuf, outs = send_command(cluster_handle, cmd=['osd', 'ls']) + if ret: + raise RuntimeError('Can\'t contact mon for osd list') + return [i for i in outbuf.split('\n') if i != ''] + +def monids(): + ret, outbuf, outs = json_command(cluster_handle, prefix='mon dump', + argdict={'format':'json'}) + if ret == -errno.EINVAL: + # try old mon + ret, outbuf, outs = send_command(cluster_handle, + cmd=['mon', 'dump', '--format=json']) + if ret: + raise RuntimeError('Can\'t contact mon for mon list') + d = json.loads(outbuf) + return [m['name'] for m in d['mons']] + +def mdsids(): + ret, outbuf, outs = json_command(cluster_handle, prefix='mds dump', + argdict={'format':'json'}) + if ret == -errno.EINVAL: + # try old mon + ret, outbuf, outs = send_command(cluster_handle, + cmd=['mds', 'dump', '--format=json']) + if ret: + raise RuntimeError('Can\'t contact mon for mds list') + d = json.loads(outbuf) + l = [] + infodict = d['info'] + for mdsdict in infodict.values(): + l.append(mdsdict['name']) + return l + +def parse_cmdargs(args=None, target=''): + # alias: let the line-wrapping be sane + AP = argparse.ArgumentParser + + # format our own help + parser = AP(description='Ceph administration tool', add_help=False) + + parser.add_argument('--completion', action='store_true', + help=argparse.SUPPRESS) + + parser.add_argument('-h', '--help', help='request mon help', + action='store_true') + + parser.add_argument('-c', '--conf', dest='cephconf', + help='ceph configuration file') + parser.add_argument('-i', '--in-file', dest='input_file', + help='input file') + parser.add_argument('-o', '--out-file', dest='output_file', + help='output file') + + parser.add_argument('--id', '--user', dest='client_id', + help='client id for authentication') + parser.add_argument('--name', '-n', dest='client_name', + help='client name for authentication') + parser.add_argument('--cluster', help='cluster name') + + 
parser.add_argument('--admin-daemon', dest='admin_socket', + help='submit admin-socket commands (\"help\" for help') + parser.add_argument('--admin-socket', dest='admin_socket_nope', + help='you probably mean --admin-daemon') + + parser.add_argument('-s', '--status', action='store_true', + help='show cluster status') + + parser.add_argument('-w', '--watch', action='store_true', + help='watch live cluster changes') + parser.add_argument('--watch-debug', action='store_true', + help='watch debug events') + parser.add_argument('--watch-info', action='store_true', + help='watch info events') + parser.add_argument('--watch-sec', action='store_true', + help='watch security events') + parser.add_argument('--watch-warn', action='store_true', + help='watch warn events') + parser.add_argument('--watch-error', action='store_true', + help='watch error events') + + parser.add_argument('--version', '-v', action="store_true", help="display version") + parser.add_argument('--verbose', action="store_true", help="make verbose") + parser.add_argument('--concise', dest='verbose', action="store_false", + help="make less verbose") + + parser.add_argument('-f', '--format', choices=['json', 'json-pretty', + 'xml', 'xml-pretty', 'plain'], dest='output_format') + + parser.add_argument('--connect-timeout', dest='cluster_timeout', + type=int, + help='set a timeout for connecting to the cluster') + + # returns a Namespace with the parsed args, and a list of all extras + parsed_args, extras = parser.parse_known_args(args) + + return parser, parsed_args, extras + + +def hdr(s): + print '\n', s, '\n', '=' * len(s) + +def do_basic_help(parser, args): + """ + Print basic parser help + If the cluster is available, get and print monitor help + """ + hdr('General usage:') + parser.print_help() + +def do_extended_help(parser, args): + def help_for_sigs(sigs, partial=None): + sys.stdout.write(format_help(parse_json_funcsigs(sigs, 'cli'), + partial=partial)) + + def help_for_target(target, partial=None): + ret, outbuf, outs = json_command(cluster_handle, target=target, + prefix='get_command_descriptions', + timeout=10) + if ret: + print >> sys.stderr, \ + "couldn't get command descriptions for {0}: {1}".\ + format(target, outs) + else: + help_for_sigs(outbuf, partial) + + partial = ' '.join(args) + if (cluster_handle.state == "connected"): + help_for_target(target=('mon', ''), partial=partial) + return 0 + +DONTSPLIT = string.letters + '{[<>]}' + +def wrap(s, width, indent): + """ + generator to transform s into a sequence of strings width or shorter, + for wrapping text to a specific column width. + Attempt to break on anything but DONTSPLIT characters. + indent is amount to indent 2nd-through-nth lines. + + so "long string long string long string" width=11 indent=1 becomes + 'long string', ' long string', ' long string' so that it can be printed + as + long string + long string + long string + + Consumes s. 
+ """ + result = '' + leader = '' + while len(s): + + if (len(s) <= width): + # no splitting; just possibly indent + result = leader + s + s = '' + yield result + + else: + splitpos = width + while (splitpos > 0) and (s[splitpos-1] in DONTSPLIT): + splitpos -= 1 + + if splitpos == 0: + splitpos = width + + if result: + # prior result means we're mid-iteration, indent + result = leader + else: + # first time, set leader and width for next + leader = ' ' * indent + width -= 1 # for subsequent space additions + + # remove any leading spaces in this chunk of s + result += s[:splitpos].lstrip() + s = s[splitpos:] + + yield result + + raise StopIteration + +def format_help(cmddict, partial=None): + """ + Formats all the cmdsigs and helptexts from cmddict into a sorted-by- + cmdsig 2-column display, with each column wrapped and indented to + fit into 40 characters. + """ + + fullusage = '' + for cmd in sorted(cmddict.itervalues(), cmp=descsort): + + if not cmd['help']: + continue + concise = concise_sig(cmd['sig']) + if partial and not concise.startswith(partial): + continue + siglines = [l for l in wrap(concise, 40, 1)] + helplines = [l for l in wrap(cmd['help'], 39, 1)] + + # make lists the same length + maxlen = max(len(siglines), len(helplines)) + siglines.extend([''] * (maxlen - len(siglines))) + helplines.extend([''] * (maxlen - len(helplines))) + + # so we can zip them for output + for (s, h) in zip(siglines, helplines): + fullusage += '{0:40s} {1}\n'.format(s, h) + + return fullusage + +def admin_socket(asok_path, cmd, format=''): + """ + Send a daemon (--admin-daemon) command 'cmd'. asok_path is the + path to the admin socket; cmd is a list of strings; format may be + set to one of the formatted forms to get output in that form + (daemon commands don't support 'plain' output). + """ + + def do_sockio(path, cmd): + """ helper: do all the actual low-level stream I/O """ + sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + sock.connect(path) + try: + sock.sendall(cmd + '\0') + len_str = sock.recv(4) + if len(len_str) < 4: + raise RuntimeError("no data returned from admin socket") + l, = struct.unpack(">I", len_str) + ret = '' + + got = 0 + while got < l: + bit = sock.recv(l - got) + ret += bit + got += len(bit) + + except Exception as e: + raise RuntimeError('exception: ' + str(e)) + return ret + + try: + cmd_json = do_sockio(asok_path, + json.dumps({"prefix":"get_command_descriptions"})) + except Exception as e: + raise RuntimeError('exception getting command descriptions: ' + str(e)) + + if cmd == 'get_command_descriptions': + return cmd_json + + sigdict = parse_json_funcsigs(cmd_json, 'cli') + valid_dict = validate_command(sigdict, cmd) + if not valid_dict: + raise RuntimeError('invalid command') + + if format: + valid_dict['format'] = format + + try: + ret = do_sockio(asok_path, json.dumps(valid_dict)) + except Exception as e: + raise RuntimeError('exception: ' + str(e)) + + return ret + + +def ceph_conf(field, name): + p = subprocess.Popen( + args=[ + 'ceph-conf', + '--show-config-value', + field, + '-n', + name, + ], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + outdata, errdata = p.communicate() + if (len(errdata)): + raise RuntimeError('unable to get conf option %s for %s: %s' % (field, name, errdata)) + return outdata.rstrip() + +def new_style_command(parsed_args, cmdargs, target, sigdict, inbuf, verbose): + """ + Do new-style command dance. 
+ target: daemon to receive command: mon (any) or osd.N + sigdict - the parsed output from the new monitor describing commands + inbuf - any -i input file data + verbose - bool + """ + if verbose: + for cmdtag in sorted(sigdict.keys()): + cmd = sigdict[cmdtag] + sig = cmd['sig'] + print '{0}: {1}'.format(cmdtag, concise_sig(sig)) + + got_command = False + + if not got_command: + if cmdargs: + # Validate input args against list of sigs + valid_dict = validate_command(sigdict, cmdargs, verbose) + if valid_dict: + got_command = True + if parsed_args.output_format: + valid_dict['format'] = parsed_args.output_format + else: + return -errno.EINVAL, '', 'invalid command' + else: + # do the command-interpreter looping + # for raw_input to do readline cmd editing + import readline + while True: + interactive_input = raw_input('ceph> ') + if interactive_input in ['q', 'quit', 'Q']: + return 0, '', '' + cmdargs = parse_cmdargs(interactive_input.split())[2] + target = find_cmd_target(cmdargs) + valid_dict = validate_command(sigdict, cmdargs, verbose) + if valid_dict: + if parsed_args.output_format: + valid_dict['format'] = parsed_args.output_format + if verbose: + print >> sys.stderr, "Submitting command ", valid_dict + ret, outbuf, outs = json_command(cluster_handle, + target=target, + argdict=valid_dict) + if ret: + ret = abs(ret) + print >> sys.stderr, \ + 'Error: {0} {1}'.format(ret, errno.errorcode[ret]) + if outbuf: + print outbuf + if outs: + print >> sys.stderr, 'Status:\n', outs + else: + print >> sys.stderr, "Invalid command" + + if verbose: + print >> sys.stderr, "Submitting command ", valid_dict + return json_command(cluster_handle, target=target, argdict=valid_dict, + inbuf=inbuf) + +def complete(sigdict, args, target): + """ + Command completion. Match as much of [args] as possible, + and print every possible match separated by newlines. + Return exitcode. + """ + # XXX this looks a lot like the front of validate_command(). Refactor? + + complete_verbose = 'COMPVERBOSE' in os.environ + + # Repulsive hack to handle tell: lop off 'tell' and target + # and validate the rest of the command. 'target' is already + # determined in our callers, so it's ok to remove it here. 
+ if len(args) and args[0] == 'tell': + args = args[2:] + # look for best match, accumulate possibles in bestcmds + # (so we can maybe give a more-useful error message) + best_match_cnt = 0 + bestcmds = [] + for cmdtag, cmd in sigdict.iteritems(): + sig = cmd['sig'] + matched = matchnum(args, sig, partial=True) + if (matched > best_match_cnt): + if complete_verbose: + print >> sys.stderr, \ + "better match: {0} > {1}: {2}:{3} ".format(matched, + best_match_cnt, cmdtag, concise_sig(sig)) + best_match_cnt = matched + bestcmds = [{cmdtag:cmd}] + elif matched == best_match_cnt: + if complete_verbose: + print >> sys.stderr, \ + "equal match: {0} > {1}: {2}:{3} ".format(matched, + best_match_cnt, cmdtag, concise_sig(sig)) + bestcmds.append({cmdtag:cmd}) + + # look through all matching sigs + comps = [] + for cmddict in bestcmds: + for cmd in cmddict.itervalues(): + sig = cmd['sig'] + # either: + # we match everything fully, so we want the next desc, or + # we match more partially, so we want the partial match + fullindex = matchnum(args, sig, partial=False) - 1 + partindex = matchnum(args, sig, partial=True) - 1 + if complete_verbose: + print >> sys.stderr, '{}: f {} p {} len {}'.format(sig, fullindex, partindex, len(sig)) + if fullindex == partindex and fullindex + 1 < len(sig): + d = sig[fullindex + 1] + else: + d = sig[partindex] + comps.append(str(d)) + if complete_verbose: + print >> sys.stderr, '\n'.join(comps) + print '\n'.join(comps) + + return 0 + +### +# ping a monitor +### +def ping_monitor(cluster_handle, name, timeout): + if 'mon.' not in name: + print >> sys.stderr, '"ping" expects a monitor to ping; try "ping mon."' + return 1 + + mon_id = name[len('mon.'):] + if (mon_id == '*') : + cluster_handle.connect(timeout=timeout) + for m in monids() : + s = cluster_handle.ping_monitor(m) + print "mon.{0}".format(m) + '\n' + s + else : + s = cluster_handle.ping_monitor(mon_id) + print s + return 0 + +### +# main +### + +def main(): + ceph_args = os.environ.get('CEPH_ARGS') + if ceph_args: + sys.argv.extend(ceph_args.split()) + + parser, parsed_args, childargs = parse_cmdargs() + + if parsed_args.version: + print 'ceph version {0} ({1})'.format(CEPH_GIT_NICE_VER, CEPH_GIT_VER) + return 0 + + global verbose + verbose = parsed_args.verbose + + if parsed_args.admin_socket_nope: + print >> sys.stderr, '--admin-socket is used by daemons; '\ + 'you probably mean --admin-daemon/daemon' + return 1 + + # pass on --id, --name, --conf + name = 'client.admin' + if parsed_args.client_id: + name = 'client.' + parsed_args.client_id + if parsed_args.client_name: + name = parsed_args.client_name + + # default '' means default conf search + conffile = '' + if parsed_args.cephconf: + conffile = parsed_args.cephconf + # For now, --admin-daemon is handled as usual. 
Try it + # first in case we can't connect() to the cluster + + format = parsed_args.output_format + + sockpath = None + if parsed_args.admin_socket: + sockpath = parsed_args.admin_socket + elif len(childargs) > 0 and childargs[0] == "daemon": + # Treat "daemon " or "daemon " like --admin_daemon + if len(childargs) > 2: + if childargs[1].find('/') >= 0: + sockpath = childargs[1] + else: + # try resolve daemon name + try: + sockpath = ceph_conf('admin_socket', childargs[1]) + except Exception as e: + print >> sys.stderr, \ + 'Can\'t get admin socket path: ' + str(e) + return errno.EINVAL + # for both: + childargs = childargs[2:] + else: + print >> sys.stderr, 'daemon requires at least 3 arguments' + return errno.EINVAL + + if sockpath: + try: + print admin_socket(sockpath, childargs, format) + except Exception as e: + print >> sys.stderr, 'admin_socket: {0}'.format(e) + return errno.EINVAL + return 0 + + timeout = None + if parsed_args.cluster_timeout: + timeout = parsed_args.cluster_timeout + + # basic help + if parsed_args.help: + do_basic_help(parser, childargs) + + # handle any 'generic' ceph arguments that we didn't parse here + global cluster_handle + + # rados.Rados() will call rados_create2, and then read the conf file, + # and then set the keys from the dict. So we must do these + # "pre-file defaults" first (see common_preinit in librados) + conf_defaults = { + 'log_to_stderr':'true', + 'err_to_stderr':'true', + 'log_flush_on_exit':'true', + } + + clustername = 'ceph' + if parsed_args.cluster: + clustername = parsed_args.cluster + + try: + cluster_handle = rados.Rados(name=name, clustername=clustername, + conf_defaults=conf_defaults, + conffile=conffile) + retargs = cluster_handle.conf_parse_argv(childargs) + except rados.Error as e: + print >> sys.stderr, 'Error initializing cluster client: {0}'.\ + format(e.__class__.__name__) + return 1 + + #tmp = childargs + childargs = retargs + if not childargs: + childargs = [] + + # -- means "stop parsing args", but we don't want to see it either + if '--' in childargs: + childargs.remove('--') + + # special deprecation warning for 'ceph tell' + # someday 'mds' will be here too + if len(childargs) >= 2 and \ + childargs[0] in ['mon', 'osd'] and \ + childargs[1] == 'tell': + print >> sys.stderr, '"{0} tell" is deprecated; try "tell {0}." instead (id can be "*") '.format(childargs[0]) + return 1 + + if parsed_args.help: + # short default timeout for -h + if not timeout: + timeout = 5 + + hdr('Monitor commands:') + print '[Contacting monitor, timeout after %d seconds]' % timeout + + if childargs and childargs[0] == 'ping': + if len(childargs) < 2: + print >> sys.stderr, '"ping" requires a monitor name as argument: "ping mon."' + return 1 + + try: + if childargs and childargs[0] == 'ping': + return ping_monitor(cluster_handle, childargs[1], timeout) + cluster_handle.connect(timeout=timeout) + except KeyboardInterrupt: + print >> sys.stderr, 'Cluster connection aborted' + return 1 + except Exception as e: + print >> sys.stderr, 'Error connecting to cluster: {0}'.\ + format(e.__class__.__name__) + return 1 + + if parsed_args.help: + return do_extended_help(parser, childargs) + + # implement -w/--watch_* + # This is ugly, but Namespace() isn't quite rich enough. 
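On the daemon-command plumbing in this stretch: "ceph daemon <who> <cmd>" treats an argument containing '/' as the admin-socket path itself and otherwise resolves the daemon name through ceph-conf, which is exactly the lookup ceph_conf() performs above. A standalone sketch, names illustrative:

    import subprocess

    def resolve_sockpath(who):
        if '/' in who:                      # already a path, use it verbatim
            return who
        out = subprocess.check_output(      # e.g. who = "osd.0"
            ['ceph-conf', '--show-config-value', 'admin_socket', '-n', who])
        return out.rstrip()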
+ level = '' + for k, v in parsed_args._get_kwargs(): + if k.startswith('watch') and v: + if k == 'watch': + level = 'info' + else: + level = k.replace('watch_', '') + if level: + + # an awfully simple callback + def watch_cb(arg, line, who, stamp_sec, stamp_nsec, seq, level, msg): + print line + sys.stdout.flush() + + # first do a ceph status + ret, outbuf, outs = json_command(cluster_handle, prefix='status') + if ret == -errno.EINVAL: + # try old mon + ret, outbuf, outs = send_command(cluster_handle, cmd=['status']) + # old mon returns status to outs...ick + if ret == 0: + outbuf += outs + if ret: + print >> sys.stderr, "status query failed: ", outs + return ret + print outbuf + + # this instance keeps the watch connection alive, but is + # otherwise unused + logwatch = rados.MonitorLog(cluster_handle, level, watch_cb, 0) + + # loop forever letting watch_cb print lines + try: + signal.pause() + except KeyboardInterrupt: + # or until ^C, at least + return 0 + + # read input file, if any + inbuf = '' + if parsed_args.input_file: + try: + with open(parsed_args.input_file, 'r') as f: + inbuf = f.read() + except Exception as e: + print >> sys.stderr, 'Can\'t open input file {0}: {1}'.format(parsed_args.input_file, e) + return 1 + + # prepare output file, if any + if parsed_args.output_file: + try: + outf = open(parsed_args.output_file, 'w') + except Exception as e: + print >> sys.stderr, \ + 'Can\'t open output file {0}: {1}'.\ + format(parsed_args.output_file, e) + return 1 + + # -s behaves like a command (ceph status). + if parsed_args.status: + childargs.insert(0, 'status') + + target = find_cmd_target(childargs) + + # Repulsive hack to handle tell: lop off 'tell' and target + # and validate the rest of the command. 'target' is already + # determined in our callers, so it's ok to remove it here. + if len(childargs) and childargs[0] == 'tell': + childargs = childargs[2:] + + # fetch JSON sigs from command + # each line contains one command signature (a placeholder name + # of the form 'cmdNNN' followed by an array of argument descriptors) + # as part of the validated argument JSON object + + targets = [target] + + if target[1] == '*': + if target[0] == 'osd': + targets = [(target[0], o) for o in osdids()] + elif target[0] == 'mon': + targets = [(target[0], m) for m in monids()] + + final_ret = 0 + for target in targets: + # prettify? prefix output with target, if there was a wildcard used + prefix = '' + suffix = '' + if not parsed_args.output_file and len(targets) > 1: + prefix = '{0}.{1}: '.format(*target) + suffix = '\n' + + ret, outbuf, outs = json_command(cluster_handle, target=target, + prefix='get_command_descriptions') + compat = False + if ret == -errno.EINVAL: + # send command to old monitor or OSD + if verbose: + print prefix + '{0} to old {1}'.format(' '.join(childargs), target[0]) + compat = True + if parsed_args.output_format: + childargs.extend(['--format', parsed_args.output_format]) + ret, outbuf, outs = send_command(cluster_handle, target, childargs, + inbuf) + + if ret == -errno.EINVAL: + # did we race with a mon upgrade? try again! 
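The hunk continuing below is the old/new-monitor compatibility dance in miniature: ask the target for JSON command descriptions, fall back to the legacy command encoding on -EINVAL, and, per the comment above, probe once more in case a monitor upgrade raced with the command. Schematically, with stand-in callables rather than the real signatures:

    import errno

    def send_with_fallback(new_style, old_style):
        ret, outbuf, outs = new_style()
        if ret == -errno.EINVAL:        # old peer: use legacy encoding
            ret, outbuf, outs = old_style()
            if ret == -errno.EINVAL:    # mon upgraded mid-flight? re-probe
                ret, outbuf, outs = new_style()
        return ret, outbuf, outs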
+ ret, outbuf, outs = json_command(cluster_handle, target=target, + prefix='get_command_descriptions') + if ret == 0: + compat = False # yep, carry on + if not compat: + if ret: + if ret < 0: + outs = 'problem getting command descriptions from {0}.{1}'.format(*target) + else: + sigdict = parse_json_funcsigs(outbuf, 'cli') + + if parsed_args.completion: + return complete(sigdict, childargs, target) + + ret, outbuf, outs = new_style_command(parsed_args, childargs, target, + sigdict, inbuf, verbose) + + # debug tool: send any successful command *again* to + # verify that it is idempotent. + if not ret and 'CEPH_CLI_TEST_DUP_COMMAND' in os.environ: + ret, outbuf, outs = new_style_command(parsed_args, childargs, target, + sigdict, inbuf, verbose) + if ret < 0: + ret = -ret + print >> sys.stderr, prefix + 'Second attempt of previously successful command failed with {0}: {1}'.format(errno.errorcode[ret], outs) + + if ret < 0: + ret = -ret + print >> sys.stderr, prefix + 'Error {0}: {1}'.format(errno.errorcode[ret], outs) + if len(targets) > 1: + final_ret = ret + else: + return ret + + # this assumes outs never has useful command output, only status + if compat: + if ret == 0: + # old cli/mon would send status string to stdout on non-error + print outs + else: + if outs: + print >> sys.stderr, prefix + outs + + if (parsed_args.output_file): + outf.write(outbuf) + else: + # hack: old code printed status line before many json outputs + # (osd dump, etc.) that consumers know to ignore. Add blank line + # to satisfy consumers that skip the first line, but not annoy + # consumers that don't. + if parsed_args.output_format and \ + parsed_args.output_format.startswith('json') and \ + not compat: + sys.stdout.write('\n') + + # if we are prettifying things, normalize newlines. sigh. + if suffix != '': + outbuf = outbuf.rstrip() + if outbuf != '': + sys.stdout.write(prefix + outbuf + suffix) + + sys.stdout.flush() + + if (parsed_args.output_file): + outf.close() + + if final_ret: + return final_ret + + return 0 + +if __name__ == '__main__': + sys.exit(main()) diff --git a/src/ceph_fuse.cc b/src/ceph_fuse.cc index f6a421b93bc17..4dde41772ff76 100644 --- a/src/ceph_fuse.cc +++ b/src/ceph_fuse.cc @@ -13,6 +13,7 @@ */ #include +#include #include #include using namespace std; @@ -29,6 +30,7 @@ using namespace std; #include "common/Timer.h" #include "common/ceph_argparse.h" +#include "common/linux_version.h" #include "global/global_init.h" #include "common/safe_io.h" @@ -84,6 +86,8 @@ int main(int argc, const char **argv, const char *envp[]) { // we need to handle the forking ourselves. 
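One more CLI note before the ceph_fuse.cc changes begin: the CEPH_CLI_TEST_DUP_COMMAND hook seen above is a debug aid that replays any successful command to verify it is idempotent. The pattern, as a sketch:

    import os

    def run_with_dup_check(run):
        ret, outbuf, outs = run()
        if not ret and 'CEPH_CLI_TEST_DUP_COMMAND' in os.environ:
            ret, outbuf, outs = run()   # the second attempt must also succeed
        return ret, outbuf, outs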
int fd[2] = {0, 0}; // parent's, child's pid_t childpid = 0; + int tester_r = 0; + void *tester_rp = NULL; bool restart_log = false; if (g_conf->daemonize) { int r = socketpair(AF_UNIX, SOCK_STREAM, 0, fd); @@ -107,27 +111,71 @@ int main(int argc, const char **argv, const char *envp[]) { if (restart_log) g_ceph_context->_log->start(); + class RemountTest : public Thread { + public: + CephFuse *cfuse; + Client *client; + RemountTest() : Thread() {} + void init(CephFuse *cf, Client *cl) { + cfuse = cf; + client = cl; + } + virtual ~RemountTest() {} + virtual void *entry() { + int ver = get_linux_version(); + assert(ver != 0); + bool can_invalidate_dentries = g_conf->client_try_dentry_invalidate && + ver < KERNEL_VERSION(3, 18, 0); + int tr = client->test_dentry_handling(can_invalidate_dentries); + if (tr != 0) { + cerr << "ceph-fuse[" << getpid() + << "]: fuse failed dentry invalidate/remount test with error " + << cpp_strerror(tr) << ", stopping" << std::endl; + + char buf[5050]; + string mountpoint = cfuse->get_mount_point(); + snprintf(buf, 5049, "fusermount -u -z %s", mountpoint.c_str()); + int umount_r = system(buf); + if (umount_r) { + if (umount_r != -1) { + if (WIFEXITED(umount_r)) { + umount_r = WEXITSTATUS(umount_r); + cerr << "got error " << umount_r + << " when unmounting Ceph on failed remount test!" << std::endl; + } else { + cerr << "attempt to umount on failed remount test failed (on a signal?)" << std::endl; + } + } else { + cerr << "system() invocation failed during remount test" << std::endl; + } + } + } + return reinterpret_cast(tr); + } + } tester; + + // get monmap Messenger *messenger = NULL; Client *client; CephFuse *cfuse; - MonClient mc(g_ceph_context); - int r = mc.build_initial_monmap(); + MonClient *mc = new MonClient(g_ceph_context); + int r = mc->build_initial_monmap(); if (r == -EINVAL) usage(); if (r < 0) goto out_mc_start_failed; // start up network - messenger = Messenger::create(g_ceph_context, + messenger = Messenger::create(g_ceph_context, g_conf->ms_type, entity_name_t::CLIENT(), "client", getpid()); messenger->set_default_policy(Messenger::Policy::lossy_client(0, 0)); messenger->set_policy(entity_name_t::TYPE_MDS, Messenger::Policy::lossless_client(0, 0)); - client = new Client(messenger, &mc); + client = new Client(messenger, mc); if (filer_flags) { client->set_filer_flags(filer_flags); } @@ -169,17 +217,24 @@ int main(int argc, const char **argv, const char *envp[]) { cerr << "ceph-fuse[" << getpid() << "]: fuse failed to start" << std::endl; goto out_client_unmount; } + cerr << "ceph-fuse[" << getpid() << "]: starting fuse" << std::endl; + tester.init(cfuse, client); + tester.create(); r = cfuse->loop(); - cerr << "ceph-fuse[" << getpid() << "]: fuse finished with error " << r << std::endl; - + tester.join(&tester_rp); + tester_r = static_cast(reinterpret_cast(tester_rp)); + cerr << "ceph-fuse[" << getpid() << "]: fuse finished with error " << r + << " and tester_r " << tester_r <unmount(); //cout << "unmounted" << std::endl; - + cfuse->finalize(); delete cfuse; - + out_shutdown: client->shutdown(); out_init_failed: @@ -189,17 +244,19 @@ int main(int argc, const char **argv, const char *envp[]) { out_messenger_start_failed: delete client; out_mc_start_failed: - + if (g_conf->daemonize) { //cout << "child signalling parent with " << r << std::endl; static int foo = 0; foo += ::write(fd[1], &r, sizeof(r)); } - + delete messenger; g_ceph_context->put(); free(newargv); - + + delete mc; + //cout << "child done" << std::endl; return r; } else { diff --git 
a/src/ceph_mds.cc b/src/ceph_mds.cc index fca21c40130ee..91ff002e1edd2 100644 --- a/src/ceph_mds.cc +++ b/src/ceph_mds.cc @@ -26,7 +26,7 @@ using namespace std; #include "common/strtol.h" #include "mon/MonMap.h" -#include "mds/MDS.h" +#include "mds/MDSDaemon.h" #include "msg/Messenger.h" @@ -78,7 +78,7 @@ static int parse_rank(const char *opt_name, const std::string &val) -MDS *mds = NULL; +MDSDaemon *mds = NULL; static void handle_mds_signal(int signum) @@ -145,18 +145,19 @@ int main(int argc, const char **argv) usage(); } - if (g_conf->name.get_id().empty() || (g_conf->name.get_id()[0] >= '0' && g_conf->name.get_id()[0] <= '9')) { + if (g_conf->name.get_id().empty() || + (g_conf->name.get_id()[0] >= '0' && g_conf->name.get_id()[0] <= '9')) { derr << "deprecation warning: MDS id '" << g_conf->name << "' is invalid and will be forbidden in a future version. " "MDS names may not start with a numeric digit." << dendl; } - Messenger *messenger = Messenger::create(g_ceph_context, - entity_name_t::MDS(-1), "mds", - getpid()); - messenger->set_cluster_protocol(CEPH_MDS_PROTOCOL); + Messenger *msgr = Messenger::create(g_ceph_context, g_conf->ms_type, + entity_name_t::MDS(-1), "mds", + getpid()); + msgr->set_cluster_protocol(CEPH_MDS_PROTOCOL); - cout << "starting " << g_conf->name << " at " << messenger->get_myaddr() + cout << "starting " << g_conf->name << " at " << msgr->get_myaddr() << std::endl; uint64_t supported = CEPH_FEATURE_UID | @@ -165,21 +166,23 @@ int main(int argc, const char **argv) CEPH_FEATURE_MDS_INLINE_DATA | CEPH_FEATURE_PGID64 | CEPH_FEATURE_MSG_AUTH | - CEPH_FEATURE_EXPORT_PEER; + CEPH_FEATURE_EXPORT_PEER | + CEPH_FEATURE_MDS_QUOTA; uint64_t required = CEPH_FEATURE_OSDREPLYMUX; - messenger->set_default_policy(Messenger::Policy::lossy_client(supported, required)); - messenger->set_policy(entity_name_t::TYPE_MON, - Messenger::Policy::lossy_client(supported, - CEPH_FEATURE_UID | - CEPH_FEATURE_PGID64)); - messenger->set_policy(entity_name_t::TYPE_MDS, - Messenger::Policy::lossless_peer(supported, - CEPH_FEATURE_UID)); - messenger->set_policy(entity_name_t::TYPE_CLIENT, - Messenger::Policy::stateful_server(supported, 0)); - - int r = messenger->bind(g_conf->public_addr); + + msgr->set_default_policy(Messenger::Policy::lossy_client(supported, required)); + msgr->set_policy(entity_name_t::TYPE_MON, + Messenger::Policy::lossy_client(supported, + CEPH_FEATURE_UID | + CEPH_FEATURE_PGID64)); + msgr->set_policy(entity_name_t::TYPE_MDS, + Messenger::Policy::lossless_peer(supported, + CEPH_FEATURE_UID)); + msgr->set_policy(entity_name_t::TYPE_CLIENT, + Messenger::Policy::stateful_server(supported, 0)); + + int r = msgr->bind(g_conf->public_addr); if (r < 0) exit(1); @@ -193,16 +196,16 @@ int main(int argc, const char **argv) return -1; global_init_chdir(g_ceph_context); - messenger->start(); + msgr->start(); // start mds - mds = new MDS(g_conf->name.get_id().c_str(), messenger, &mc); + mds = new MDSDaemon(g_conf->name.get_id().c_str(), msgr, &mc); // in case we have to respawn... 
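On CEPH_FEATURE_MDS_QUOTA joining the supported mask above: Ceph feature flags are bits in a 64-bit mask exchanged when peers connect, and a peer is acceptable only if it advertises every required bit. A toy illustration in Python; the bit positions here are invented for the example, not the real values:

    FEATURE_UID       = 1 << 0   # illustrative positions only
    FEATURE_MDS_QUOTA = 1 << 1

    def peer_acceptable(peer_features, required):
        return (peer_features & required) == required

    assert peer_acceptable(FEATURE_UID | FEATURE_MDS_QUOTA, FEATURE_UID)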
mds->orig_argc = argc; mds->orig_argv = argv; - if (shadow) + if (shadow != MDSMap::STATE_NULL) r = mds->init(shadow); else r = mds->init(); @@ -218,7 +221,7 @@ int main(int argc, const char **argv) if (g_conf->inject_early_sigterm) kill(getpid(), SIGTERM); - messenger->wait(); + msgr->wait(); unregister_async_signal_handler(SIGHUP, sighup_handler); unregister_async_signal_handler(SIGINT, handle_mds_signal); @@ -235,8 +238,10 @@ int main(int argc, const char **argv) // only delete if it was a clean shutdown (to aid memory leak // detection, etc.). don't bother if it was a suicide. - if (mds->is_stopped()) + if (mds->is_clean_shutdown()) { delete mds; + delete msgr; + } g_ceph_context->put(); diff --git a/src/ceph_mon.cc b/src/ceph_mon.cc index 3bd6022b81c9a..dcb07de615dd0 100644 --- a/src/ceph_mon.cc +++ b/src/ceph_mon.cc @@ -193,7 +193,7 @@ int preload_erasure_code() stringstream ss; int r = ErasureCodePluginRegistry::instance().preload(plugins, directory, - ss); + &ss); if (r) derr << ss.str() << dendl; else @@ -497,9 +497,17 @@ int main(int argc, const char **argv) Preforker prefork; if (!(flags & CINIT_FLAG_NO_DAEMON_ACTIONS)) { if (global_init_prefork(g_ceph_context, 0) >= 0) { - prefork.prefork(); + string err_msg; + err = prefork.prefork(err_msg); + if (err < 0) { + cerr << err_msg << std::endl; + prefork.exit(err); + } if (prefork.is_parent()) { - return prefork.parent_wait(); + err = prefork.parent_wait(err_msg); + if (err < 0) + cerr << err_msg << std::endl; + prefork.exit(err); } global_init_postfork_start(g_ceph_context); } @@ -510,40 +518,10 @@ int main(int argc, const char **argv) } MonitorDBStore *store = new MonitorDBStore(g_conf->mon_data); - - Monitor::StoreConverter converter(g_conf->mon_data, store); - if (store->open(std::cerr) < 0) { - int needs_conversion = converter.needs_conversion(); - if (needs_conversion < 0) { - if (needs_conversion == -ENOENT) { - derr << "monitor data directory at '" << g_conf->mon_data - << "' is not empty but has no valid store nor legacy monitor" - << " store." << dendl; - } else { - derr << "found errors while validating legacy unconverted" - << " monitor store: " << cpp_strerror(needs_conversion) << dendl; - } - prefork.exit(1); - } - - int ret = store->create_and_open(std::cerr); - if (ret < 0) { - derr << "failed to create new leveldb store" << dendl; - prefork.exit(1); - } - - if (needs_conversion > 0) { - dout(0) << "converting monitor store, please do not interrupt..." << dendl; - int r = converter.convert(); - if (r) { - derr << "failed to convert monitor store: " << cpp_strerror(r) << dendl; - prefork.exit(1); - } - } - } else if (converter.is_converting()) { - derr << "there is an on-going (maybe aborted?) conversion." 
<< dendl; - derr << "you should check what happened" << dendl; - derr << "remove store.db to restart conversion" << dendl; + err = store->open(std::cerr); + if (err < 0) { + derr << "error opening mon data directory at '" + << g_conf->mon_data << "': " << cpp_strerror(err) << dendl; prefork.exit(1); } @@ -685,12 +663,12 @@ int main(int argc, const char **argv) // bind int rank = monmap.get_rank(g_conf->name.get_id()); - Messenger *messenger = Messenger::create(g_ceph_context, - entity_name_t::MON(rank), - "mon", - 0); - messenger->set_cluster_protocol(CEPH_MON_PROTOCOL); - messenger->set_default_send_priority(CEPH_MSG_PRIO_HIGH); + Messenger *msgr = Messenger::create(g_ceph_context, g_conf->ms_type, + entity_name_t::MON(rank), + "mon", + 0); + msgr->set_cluster_protocol(CEPH_MON_PROTOCOL); + msgr->set_default_send_priority(CEPH_MSG_PRIO_HIGH); uint64_t supported = CEPH_FEATURE_UID | @@ -698,34 +676,38 @@ int main(int argc, const char **argv) CEPH_FEATURE_MONCLOCKCHECK | CEPH_FEATURE_PGID64 | CEPH_FEATURE_MSG_AUTH; - messenger->set_default_policy(Messenger::Policy::stateless_server(supported, 0)); - messenger->set_policy(entity_name_t::TYPE_MON, - Messenger::Policy::lossless_peer_reuse(supported, - CEPH_FEATURE_UID | - CEPH_FEATURE_PGID64 | - CEPH_FEATURE_MON_SINGLE_PAXOS)); - messenger->set_policy(entity_name_t::TYPE_OSD, - Messenger::Policy::stateless_server(supported, - CEPH_FEATURE_PGID64 | - CEPH_FEATURE_OSDENC)); - messenger->set_policy(entity_name_t::TYPE_CLIENT, - Messenger::Policy::stateless_server(supported, 0)); - messenger->set_policy(entity_name_t::TYPE_MDS, - Messenger::Policy::stateless_server(supported, 0)); - + msgr->set_default_policy(Messenger::Policy::stateless_server(supported, 0)); + msgr->set_policy(entity_name_t::TYPE_MON, + Messenger::Policy::lossless_peer_reuse( + supported, + CEPH_FEATURE_UID | + CEPH_FEATURE_PGID64 | + CEPH_FEATURE_MON_SINGLE_PAXOS)); + msgr->set_policy(entity_name_t::TYPE_OSD, + Messenger::Policy::stateless_server( + supported, + CEPH_FEATURE_PGID64 | + CEPH_FEATURE_OSDENC)); + msgr->set_policy(entity_name_t::TYPE_CLIENT, + Messenger::Policy::stateless_server(supported, 0)); + msgr->set_policy(entity_name_t::TYPE_MDS, + Messenger::Policy::stateless_server(supported, 0)); // throttle client traffic Throttle *client_throttler = new Throttle(g_ceph_context, "mon_client_bytes", g_conf->mon_client_bytes); - messenger->set_policy_throttlers(entity_name_t::TYPE_CLIENT, client_throttler, NULL); + msgr->set_policy_throttlers(entity_name_t::TYPE_CLIENT, + client_throttler, NULL); // throttle daemon traffic // NOTE: actual usage on the leader may multiply by the number of // monitors if they forward large update messages from daemons. 
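For context on the throttlers being wired up here: mon_client_bytes and mon_daemon_bytes cap how many bytes of in-flight messages the monitor will accept per source before pushing back. A minimal sketch of that kind of byte budget (the real Throttle can also block the sender until space frees up):

    class ByteThrottle(object):
        def __init__(self, max_bytes):
            self.max_bytes, self.in_flight = max_bytes, 0

        def take(self, n):
            if self.in_flight + n > self.max_bytes:
                return False            # caller must wait or back off
            self.in_flight += n
            return True

        def put(self, n):
            self.in_flight = max(0, self.in_flight - n)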
Throttle *daemon_throttler = new Throttle(g_ceph_context, "mon_daemon_bytes", g_conf->mon_daemon_bytes); - messenger->set_policy_throttlers(entity_name_t::TYPE_OSD, daemon_throttler, NULL); - messenger->set_policy_throttlers(entity_name_t::TYPE_MDS, daemon_throttler, NULL); + msgr->set_policy_throttlers(entity_name_t::TYPE_OSD, daemon_throttler, + NULL); + msgr->set_policy_throttlers(entity_name_t::TYPE_MDS, daemon_throttler, + NULL); dout(0) << "starting " << g_conf->name << " rank " << rank << " at " << ipaddr @@ -733,15 +715,21 @@ int main(int argc, const char **argv) << " fsid " << monmap.get_fsid() << dendl; - err = messenger->bind(ipaddr); + err = msgr->bind(ipaddr); if (err < 0) { derr << "unable to bind monitor to " << ipaddr << dendl; prefork.exit(1); } + cout << "starting " << g_conf->name << " rank " << rank + << " at " << ipaddr + << " mon_data " << g_conf->mon_data + << " fsid " << monmap.get_fsid() + << std::endl; + // start monitor - mon = new Monitor(g_ceph_context, g_conf->name.get_id(), store, - messenger, &monmap); + mon = new Monitor(g_ceph_context, g_conf->name.get_id(), store, + msgr, &monmap); if (force_sync) { derr << "flagging a forced sync ..." << dendl; @@ -765,7 +753,7 @@ int main(int argc, const char **argv) prefork.daemonize(); } - messenger->start(); + msgr->start(); mon->init(); @@ -778,7 +766,7 @@ int main(int argc, const char **argv) if (g_conf->inject_early_sigterm) kill(getpid(), SIGTERM); - messenger->wait(); + msgr->wait(); store->close(); @@ -789,7 +777,7 @@ int main(int argc, const char **argv) delete mon; delete store; - delete messenger; + delete msgr; delete client_throttler; delete daemon_throttler; g_ceph_context->put(); diff --git a/src/ceph_osd.cc b/src/ceph_osd.cc index a6228d442a627..3369460b6c88e 100644 --- a/src/ceph_osd.cc +++ b/src/ceph_osd.cc @@ -31,7 +31,6 @@ using namespace std; #include "mon/MonMap.h" - #include "msg/Messenger.h" #include "common/Timer.h" @@ -62,10 +61,25 @@ void handle_osd_signal(int signum) void usage() { - derr << "usage: ceph-osd -i osdid [--osd-data=path] [--osd-journal=path] " - << "[--mkfs] [--mkjournal] [--convert-filestore]" << dendl; - derr << " --debug_osd N set debug level (e.g. 10)" << dendl; + cout << "usage: ceph-osd -i \n" + << " --osd-data=path data directory\n" + << " --osd-journal=path\n" + << " journal file or block device\n" + << " --mkfs create a [new] data directory\n" + << " --convert-filestore\n" + << " run any pending upgrade operations\n" + << " --flush-journal flush all data out of journal\n" + << " --mkjournal initialize a new journal\n" + << " --check-wants-journal\n" + << " check whether a journal is desired\n" + << " --check-allows-journal\n" + << " check whether a journal is allowed\n" + << " --check-needs-journal\n" + << " check whether a journal is required\n" + << " --debug_osd N set debug level (e.g. 
10)" + << std::endl; generic_server_usage(); + cout.flush(); } int preload_erasure_code() @@ -75,7 +89,7 @@ int preload_erasure_code() stringstream ss; int r = ErasureCodePluginRegistry::instance().preload(plugins, directory, - ss); + &ss); if (r) derr << ss.str() << dendl; else @@ -100,6 +114,9 @@ int main(int argc, const char **argv) // osd specific args bool mkfs = false; bool mkjournal = false; + bool check_wants_journal = false; + bool check_allows_journal = false; + bool check_needs_journal = false; bool mkkey = false; bool flushjournal = false; bool dump_journal = false; @@ -107,7 +124,6 @@ int main(int argc, const char **argv) bool get_journal_fsid = false; bool get_osd_fsid = false; bool get_cluster_fsid = false; - bool check_need_journal = false; std::string dump_pg_log; std::string val; @@ -121,6 +137,12 @@ int main(int argc, const char **argv) mkfs = true; } else if (ceph_argparse_flag(args, i, "--mkjournal", (char*)NULL)) { mkjournal = true; + } else if (ceph_argparse_flag(args, i, "--check-allows-journal", (char*)NULL)) { + check_allows_journal = true; + } else if (ceph_argparse_flag(args, i, "--check-wants-journal", (char*)NULL)) { + check_wants_journal = true; + } else if (ceph_argparse_flag(args, i, "--check-needs-journal", (char*)NULL)) { + check_needs_journal = true; } else if (ceph_argparse_flag(args, i, "--mkkey", (char*)NULL)) { mkkey = true; } else if (ceph_argparse_flag(args, i, "--flush-journal", (char*)NULL)) { @@ -137,8 +159,6 @@ int main(int argc, const char **argv) get_osd_fsid = true; } else if (ceph_argparse_flag(args, i, "--get-journal-fsid", "--get-journal-uuid", (char*)NULL)) { get_journal_fsid = true; - } else if (ceph_argparse_flag(args, i, "--check-needs-journal", (char*)NULL)) { - check_need_journal = true; } else { ++i; } @@ -260,6 +280,33 @@ int main(int argc, const char **argv) << " for object store " << g_conf->osd_data << dendl; exit(0); } + if (check_wants_journal) { + if (store->wants_journal()) { + cout << "yes" << std::endl; + exit(0); + } else { + cout << "no" << std::endl; + exit(1); + } + } + if (check_allows_journal) { + if (store->allows_journal()) { + cout << "yes" << std::endl; + exit(0); + } else { + cout << "no" << std::endl; + exit(1); + } + } + if (check_needs_journal) { + if (store->needs_journal()) { + cout << "yes" << std::endl; + exit(0); + } else { + cout << "no" << std::endl; + exit(1); + } + } if (flushjournal) { common_init_finish(g_ceph_context); int err = store->mount(); @@ -318,14 +365,6 @@ int main(int argc, const char **argv) exit(r); } - if (check_need_journal) { - if (store->need_journal()) - cout << "yes" << std::endl; - else - cout << "no" << std::endl; - exit(0); - } - string magic; uuid_d cluster_fsid, osd_fsid; int w; @@ -369,22 +408,22 @@ int main(int argc, const char **argv) << TEXT_NORMAL << dendl; } - Messenger *ms_public = Messenger::create(g_ceph_context, + Messenger *ms_public = Messenger::create(g_ceph_context, g_conf->ms_type, entity_name_t::OSD(whoami), "client", getpid()); - Messenger *ms_cluster = Messenger::create(g_ceph_context, + Messenger *ms_cluster = Messenger::create(g_ceph_context, g_conf->ms_type, entity_name_t::OSD(whoami), "cluster", - getpid()); - Messenger *ms_hbclient = Messenger::create(g_ceph_context, + getpid(), CEPH_FEATURES_ALL); + Messenger *ms_hbclient = Messenger::create(g_ceph_context, g_conf->ms_type, entity_name_t::OSD(whoami), "hbclient", getpid()); - Messenger *ms_hb_back_server = Messenger::create(g_ceph_context, + Messenger *ms_hb_back_server = 
Messenger::create(g_ceph_context, g_conf->ms_type, entity_name_t::OSD(whoami), "hb_back_server", getpid()); - Messenger *ms_hb_front_server = Messenger::create(g_ceph_context, + Messenger *ms_hb_front_server = Messenger::create(g_ceph_context, g_conf->ms_type, entity_name_t::OSD(whoami), "hb_front_server", getpid()); - Messenger *ms_objecter = Messenger::create(g_ceph_context, + Messenger *ms_objecter = Messenger::create(g_ceph_context, g_conf->ms_type, entity_name_t::OSD(whoami), "ms_objecter", getpid()); ms_cluster->set_cluster_protocol(CEPH_OSD_PROTOCOL); @@ -413,11 +452,17 @@ int main(int argc, const char **argv) CEPH_FEATURE_MSG_AUTH | CEPH_FEATURE_OSD_ERASURE_CODES; + // All feature bits 0 - 34 should be present from dumpling v0.67 forward uint64_t osd_required = CEPH_FEATURE_UID | CEPH_FEATURE_PGID64 | CEPH_FEATURE_OSDENC | - CEPH_FEATURE_OSD_SNAPMAPPER; + CEPH_FEATURE_OSD_SNAPMAPPER | + CEPH_FEATURE_INDEP_PG_MAP | + CEPH_FEATURE_OSD_PACKED_RECOVERY | + CEPH_FEATURE_RECOVERY_RESERVATION | + CEPH_FEATURE_BACKFILL_RESERVATION | + CEPH_FEATURE_CHUNKY_SCRUB; ms_public->set_default_policy(Messenger::Policy::stateless_server(supported, 0)); ms_public->set_policy_throttlers(entity_name_t::TYPE_CLIENT, @@ -431,7 +476,7 @@ int main(int argc, const char **argv) //try to poison pill any OSD connections on the wrong address ms_public->set_policy(entity_name_t::TYPE_OSD, Messenger::Policy::stateless_server(0,0)); - + ms_cluster->set_default_policy(Messenger::Policy::stateless_server(0, 0)); ms_cluster->set_policy(entity_name_t::TYPE_MON, Messenger::Policy::lossy_client(0,0)); ms_cluster->set_policy(entity_name_t::TYPE_OSD, @@ -456,6 +501,12 @@ int main(int argc, const char **argv) if (r < 0) exit(1); + if (g_conf->osd_heartbeat_use_min_delay_socket) { + ms_hbclient->set_socket_priority(SOCKET_PRIORITY_MIN_DELAY); + ms_hb_back_server->set_socket_priority(SOCKET_PRIORITY_MIN_DELAY); + ms_hb_front_server->set_socket_priority(SOCKET_PRIORITY_MIN_DELAY); + } + // hb back should bind to same ip as cluster_addr (if specified) entity_addr_t hb_back_addr = g_conf->osd_heartbeat_addr; if (hb_back_addr.is_blank_ip()) { @@ -488,16 +539,17 @@ int main(int argc, const char **argv) return -1; osd = new OSD(g_ceph_context, - store, - whoami, - ms_cluster, - ms_public, - ms_hbclient, - ms_hb_front_server, - ms_hb_back_server, - ms_objecter, - &mc, - g_conf->osd_data, g_conf->osd_journal); + store, + whoami, + ms_cluster, + ms_public, + ms_hbclient, + ms_hb_front_server, + ms_hb_back_server, + ms_objecter, + &mc, + g_conf->osd_data, + g_conf->osd_journal); int err = osd->pre_init(); if (err < 0) { @@ -506,9 +558,6 @@ int main(int argc, const char **argv) return 1; } - // Now close the standard file descriptors - global_init_shutdown_stderr(g_ceph_context); - ms_public->start(); ms_hbclient->start(); ms_hb_front_server->start(); @@ -555,6 +604,7 @@ int main(int argc, const char **argv) delete ms_hb_back_server; delete ms_cluster; delete ms_objecter; + client_byte_throttler.reset(); client_msg_throttler.reset(); g_ceph_context->put(); diff --git a/src/ceph_syn.cc b/src/ceph_syn.cc index c3410aa61d413..71db206d785f7 100644 --- a/src/ceph_syn.cc +++ b/src/ceph_syn.cc @@ -65,9 +65,10 @@ int main(int argc, const char **argv, char *envp[]) cout << "ceph-syn: starting " << g_conf->num_client << " syn client(s)" << std::endl; for (int i=0; inum_client; i++) { - messengers[i] = Messenger::create(g_ceph_context, - entity_name_t(entity_name_t::TYPE_CLIENT,-1), "synclient", - i * 1000000 + getpid()); + messengers[i] = 
Messenger::create( + g_ceph_context, g_conf->ms_type, + entity_name_t(entity_name_t::TYPE_CLIENT,-1), "synclient", + i * 1000000 + getpid()); messengers[i]->bind(g_conf->public_addr); mclients[i] = new MonClient(g_ceph_context); mclients[i]->build_initial_monmap(); diff --git a/src/ceph_ver.h.in.cmake b/src/ceph_ver.h.in.cmake new file mode 100644 index 0000000000000..9c269cdf500cf --- /dev/null +++ b/src/ceph_ver.h.in.cmake @@ -0,0 +1,7 @@ +#ifndef CEPH_VERSION_H +#define CEPH_VERSION_H + +#define CEPH_GIT_VER @CEPH_GIT_VER@ +#define CEPH_GIT_NICE_VER "@CEPH_GIT_NICE_VER@" + +#endif diff --git a/src/check_version b/src/check_version deleted file mode 100755 index 8600c556f4c0b..0000000000000 --- a/src/check_version +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/sh - -dname=`dirname $0` - -if [ ! -d $dname/../.git ]; then - echo "not updating .git_version (no $dname/../.git)" - exit 0 -fi - -cur=`cd $dname && git rev-parse HEAD 2>/dev/null; git describe 2>/dev/null` -[ -e $1 ] && old=`cat $1` - -if [ "$cur" != "$old" ]; then - echo regenerating $1 with $cur - echo "$cur" > $1 -else - echo $1 is up to date. -fi - diff --git a/src/civetweb b/src/civetweb index 45da9c5f9052e..8d271315a5412 160000 --- a/src/civetweb +++ b/src/civetweb @@ -1 +1 @@ -Subproject commit 45da9c5f9052e82a9368b92e9bfb48878fff844f +Subproject commit 8d271315a541218caada366f84a2690fdbd474a2 diff --git a/src/client/Client.cc b/src/client/Client.cc index 5cd6e861c30c0..280d0d3dfc378 100644 --- a/src/client/Client.cc +++ b/src/client/Client.cc @@ -22,6 +22,7 @@ #include #include #include +#include #if defined(__linux__) #include @@ -34,6 +35,8 @@ using namespace std; #include "common/config.h" +#include "common/version.h" + // ceph stuff #include "messages/MMonMap.h" @@ -44,10 +47,11 @@ using namespace std; #include "messages/MClientRequestForward.h" #include "messages/MClientReply.h" #include "messages/MClientCaps.h" -#include "messages/MClientCapRelease.h" #include "messages/MClientLease.h" #include "messages/MClientSnap.h" #include "messages/MCommandReply.h" +#include "messages/MOSDMap.h" +#include "messages/MClientQuota.h" #include "messages/MGenericMessage.h" @@ -71,8 +75,8 @@ using namespace std; #define dout_subsys ceph_subsys_client #include "include/lru.h" - #include "include/compat.h" +#include "include/stringify.h" #include "Client.h" #include "Inode.h" @@ -116,9 +120,7 @@ Client::CommandHook::CommandHook(Client *client) : bool Client::CommandHook::call(std::string command, cmdmap_t& cmdmap, std::string format, bufferlist& out) { - Formatter *f = new_formatter(format); - if (!f) - f = new_formatter("json-pretty"); + Formatter *f = Formatter::create(format); f->open_object_section("result"); m_client->client_lock.Lock(); if (command == "mds_requests") @@ -129,6 +131,8 @@ bool Client::CommandHook::call(std::string command, cmdmap_t& cmdmap, m_client->dump_cache(f); else if (command == "kick_stale_sessions") m_client->_kick_stale_sessions(); + else if (command == "status") + m_client->dump_status(f); else assert(0 == "bad command registered"); m_client->client_lock.Unlock(); @@ -145,30 +149,32 @@ dir_result_t::dir_result_t(Inode *in) : inode(in), offset(0), this_offset(2), next_offset(2), release_count(0), ordered_count(0), start_shared_gen(0), buffer(0) { - inode->get(); } // cons/des Client::Client(Messenger *m, MonClient *mc) : Dispatcher(m->cct), - cct(m->cct), logger(NULL), m_command_hook(this), timer(m->cct, client_lock), + callback_handle(NULL), switch_interrupt_cb(NULL), + remount_cb(NULL), ino_invalidate_cb(NULL), 
- ino_invalidate_cb_handle(NULL), dentry_invalidate_cb(NULL), - dentry_invalidate_cb_handle(NULL), getgroups_cb(NULL), - getgroups_cb_handle(NULL), + can_invalidate_dentries(false), + require_remount(false), async_ino_invalidator(m->cct), async_dentry_invalidator(m->cct), interrupt_finisher(m->cct), + remount_finisher(m->cct), objecter_finisher(m->cct), tick_event(NULL), monclient(mc), messenger(m), whoami(m->get_myname().num()), + cap_epoch_barrier(0), + last_tid(0), oldest_tid(0), last_flush_seq(0), initialized(false), authenticated(false), mounted(false), unmounting(false), local_osd(-1), local_osd_epoch(0), @@ -177,11 +183,6 @@ Client::Client(Messenger *m, MonClient *mc) { monclient->set_messenger(m); - last_tid = 0; - last_flush_seq = 0; - - cwd = NULL; - // root = 0; @@ -201,7 +202,7 @@ Client::Client(Messenger *m, MonClient *mc) // osd interfaces mdsmap = new MDSMap; - objecter = new Objecter(cct, messenger, monclient, + objecter = new Objecter(cct, messenger, monclient, NULL, 0, 0); objecter->set_client_incarnation(0); // client always 0, for now. writeback_handler = new ObjecterWriteback(objecter, &objecter_finisher, @@ -236,11 +237,6 @@ Client::~Client() delete logger; } - - - - - void Client::tear_down_cache() { // fd's @@ -249,8 +245,7 @@ void Client::tear_down_cache() ++it) { Fh *fh = it->second; ldout(cct, 1) << "tear_down_cache forcing close of fh " << it->first << " ino " << fh->inode->ino << dendl; - put_inode(fh->inode); - delete fh; + _release_fh(fh); } fd_map.clear(); @@ -263,10 +258,13 @@ void Client::tear_down_cache() assert(lru.lru_get_size() == 0); // close root ino - assert(inode_map.size() <= 1); - if (root && inode_map.size() == 1) { + assert(inode_map.size() <= 1 + root_parents.size()); + if (root && inode_map.size() == 1 + root_parents.size()) { delete root; root = 0; + root_ancestor = 0; + while (!root_parents.empty()) + root_parents.erase(root_parents.begin()); inode_map.clear(); } @@ -320,7 +318,7 @@ void Client::dump_inode(Formatter *f, Inode *in, set& did, bool disconne f->close_section(); } if (it->second->inode) - dump_inode(f, it->second->inode, did, false); + dump_inode(f, it->second->inode.get(), did, false); } } } @@ -350,6 +348,35 @@ void Client::dump_cache(Formatter *f) f->close_section(); } +void Client::dump_status(Formatter *f) +{ + assert(client_lock.is_locked_by_me()); + + ldout(cct, 1) << __func__ << dendl; + + const OSDMap *osdmap = objecter->get_osdmap_read(); + const epoch_t osd_epoch = osdmap->get_epoch(); + objecter->put_osdmap_read(); + + if (f) { + f->open_object_section("metadata"); + { + for (std::map::const_iterator i = metadata.begin(); + i != metadata.end(); ++i) { + f->dump_string(i->first.c_str(), i->second); + } + } + f->close_section(); + + f->dump_int("dentry_count", lru.lru_get_size()); + f->dump_int("dentry_pinned_count", lru.lru_get_num_pinned()); + f->dump_int("inode_count", inode_map.size()); + f->dump_int("mds_epoch", mdsmap->get_epoch()); + f->dump_int("osd_epoch", osd_epoch); + f->dump_int("osd_epoch_barrier", cap_epoch_barrier); + } +} + int Client::init() { client_lock.Lock(); @@ -383,16 +410,16 @@ int Client::init() // logger PerfCountersBuilder plb(cct, "client", l_c_first, l_c_last); - plb.add_time_avg(l_c_reply, "reply"); - plb.add_time_avg(l_c_lat, "lat"); - plb.add_time_avg(l_c_wrlat, "wrlat"); - plb.add_time_avg(l_c_owrlat, "owrlat"); - plb.add_time_avg(l_c_ordlat, "ordlat"); + plb.add_time_avg(l_c_reply, "reply", "Latency of receiving a reply on metadata request"); + plb.add_time_avg(l_c_lat, "lat", "Latency 
of processing a metadata request"); + plb.add_time_avg(l_c_wrlat, "wrlat", "Latency of a file data write operation"); logger = plb.create_perf_counters(); cct->get_perfcounters_collection()->add(logger); client_lock.Unlock(); + cct->_conf->add_observer(this); + AdminSocket* admin_socket = cct->get_admin_socket(); int ret = admin_socket->register_command("mds_requests", "mds_requests", @@ -426,6 +453,14 @@ int Client::init() lderr(cct) << "error registering admin socket command: " << cpp_strerror(-ret) << dendl; } + ret = admin_socket->register_command("status", + "status", + &m_command_hook, + "show overall client status"); + if (ret < 0) { + lderr(cct) << "error registering admin socket command: " + << cpp_strerror(-ret) << dendl; + } populate_metadata(); @@ -439,11 +474,14 @@ void Client::shutdown() { ldout(cct, 1) << "shutdown" << dendl; + cct->_conf->remove_observer(this); + AdminSocket* admin_socket = cct->get_admin_socket(); admin_socket->unregister_command("mds_requests"); admin_socket->unregister_command("mds_sessions"); admin_socket->unregister_command("dump_cache"); admin_socket->unregister_command("kick_stale_sessions"); + admin_socket->unregister_command("status"); if (ino_invalidate_cb) { ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl; @@ -463,6 +501,12 @@ void Client::shutdown() interrupt_finisher.stop(); } + if (remount_cb) { + ldout(cct, 10) << "shutdown stopping remount finisher" << dendl; + remount_finisher.wait_for_empty(); + remount_finisher.stop(); + } + objectcacher->stop(); // outside of client_lock! this does a join. client_lock.Lock(); @@ -490,7 +534,7 @@ void Client::shutdown() // =================== // metadata cache stuff -void Client::trim_cache() +void Client::trim_cache(bool trim_kernel_dcache) { ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << lru.lru_get_max() << dendl; unsigned last = 0; @@ -507,11 +551,17 @@ void Client::trim_cache() trim_dentry(dn); } + if (trim_kernel_dcache && lru.lru_get_size() > lru.lru_get_max()) + _invalidate_kernel_dcache(); + // hose root? 
- if (lru.lru_get_size() == 0 && root && root->get_num_ref() == 0 && inode_map.size() == 1) { + if (lru.lru_get_size() == 0 && root && root->get_num_ref() == 0 && inode_map.size() == 1 + root_parents.size()) { ldout(cct, 15) << "trim_cache trimmed root " << root << dendl; delete root; root = 0; + root_ancestor = 0; + while (!root_parents.empty()) + root_parents.erase(root_parents.begin()); inode_map.clear(); } } @@ -543,7 +593,7 @@ void Client::trim_cache_for_reconnect(MetaSession *s) << " trimmed " << trimmed << " dentries" << dendl; if (s->caps.size() > 0) - _invalidate_kernel_dcache(s); + _invalidate_kernel_dcache(); } void Client::trim_dentry(Dentry *dn) @@ -551,10 +601,13 @@ void Client::trim_dentry(Dentry *dn) ldout(cct, 15) << "trim_dentry unlinking dn " << dn->name << " in dir " << hex << dn->dir->parent_inode->ino << dendl; - dn->dir->release_count++; - if (dn->dir->parent_inode->flags & I_COMPLETE) { - ldout(cct, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *dn->dir->parent_inode << dendl; - dn->dir->parent_inode->flags &= ~(I_COMPLETE | I_DIR_ORDERED); + if (dn->inode) { + dn->dir->release_count++; + if (dn->dir->parent_inode->flags & I_COMPLETE) { + ldout(cct, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " + << *dn->dir->parent_inode << dendl; + dn->dir->parent_inode->flags &= ~(I_COMPLETE | I_DIR_ORDERED); + } } unlink(dn, false, false); // drop dir, drop dentry } @@ -680,12 +733,15 @@ Inode * Client::add_update_inode(InodeStat *st, utime_t from, in = inode_map[st->vino]; ldout(cct, 12) << "add_update_inode had " << *in << " caps " << ccap_string(st->cap.caps) << dendl; } else { - in = new Inode(cct, st->vino, &st->layout); + in = new Inode(this, st->vino, &st->layout); inode_map[st->vino] = in; if (!root) { root = in; + root_ancestor = in; cwd = root; - cwd->get(); + } else if (!mounted) { + root_parents[root_ancestor] = in; + root_ancestor = in; } // immutable bits @@ -728,14 +784,6 @@ Inode * Client::add_update_inode(InodeStat *st, utime_t from, in->nlink = st->nlink; } - if ((in->xattr_version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) && - st->xattrbl.length() && - st->xattr_version > in->xattr_version) { - bufferlist::iterator p = st->xattrbl.begin(); - ::decode(in->xattrs, p); - in->xattr_version = st->xattr_version; - } - in->dirstat = st->dirstat; in->rstat = st->rstat; @@ -744,6 +792,10 @@ Inode * Client::add_update_inode(InodeStat *st, utime_t from, ldout(cct, 20) << " dir hash is " << (int)in->dir_layout.dl_dir_hash << dendl; } + if (st->quota.is_enable() ^ in->quota.is_enable()) + invalidate_quota_tree(in); + in->quota = st->quota; + in->layout = st->layout; update_inode_file_bits(in, st->truncate_seq, st->truncate_size, st->size, @@ -755,6 +807,14 @@ Inode * Client::add_update_inode(InodeStat *st, utime_t from, in->inline_version = st->inline_version; } + if ((in->xattr_version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) && + st->xattrbl.length() && + st->xattr_version > in->xattr_version) { + bufferlist::iterator p = st->xattrbl.begin(); + ::decode(in->xattrs, p); + in->xattr_version = st->xattr_version; + } + // move me if/when version reflects fragtree changes. 
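/*
 * [Illustrative aside, not part of the patch; types are hypothetical.]
 * Several hunks here guard the I_COMPLETE flag: a directory may be served
 * from cache only while every entry is cached, so trimming or mutating a
 * single dentry must clear the flag (as trim_dentry() above now does only
 * when the dentry is linked), and a later hunk lets _lookup() conclude
 * ENOENT locally from a complete dir. A minimal sketch of that invariant:
 */
#include <set>
#include <string>

struct CachedDir {
  std::set<std::string> entries;
  bool complete = true;            // true only while `entries` is exhaustive

  void trim(const std::string& name) {
    if (entries.erase(name))
      complete = false;            // cache can no longer prove ENOENT
  }

  // A negative lookup may be answered locally only from a complete dir;
  // otherwise the client must ask the MDS.
  bool local_enoent(const std::string& name) const {
    return complete && !entries.count(name);
  }
};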
if (in->dirfragtree != st->dirfragtree) { in->dirfragtree = st->dirfragtree; @@ -823,8 +883,8 @@ Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dl } } - if (!dn || dn->inode == 0) { - in->get(); + if (!dn || !dn->inode) { + InodeRef tmp_ref(in); if (old_dentry) { if (old_dentry->dir != dir) { old_dentry->dir->ordered_count++; @@ -842,7 +902,6 @@ Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dl dir->parent_inode->flags &= ~I_DIR_ORDERED; } dn = link(dir, dname, in, dn); - put_inode(in); } update_dentry_lease(dn, dlease, from, session); @@ -990,8 +1049,7 @@ void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, dn->offset = dir_result_t::make_fpos(fg, i + readdir_offset); // add to cached result list - in->get(); - request->readdir_result.push_back(pair(dname, in)); + request->readdir_result.push_back(pair(dname, in)); ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl; } @@ -1132,9 +1190,15 @@ Inode* Client::insert_trace(MetaRequest *request, MetaSession *session) } } - if (in && (reply->head.op == CEPH_MDS_OP_READDIR || - reply->head.op == CEPH_MDS_OP_LSSNAP)) { - insert_readdir_results(request, session, in); + if (in) { + if (reply->head.op == CEPH_MDS_OP_READDIR || + reply->head.op == CEPH_MDS_OP_LSSNAP) + insert_readdir_results(request, session, in); + + if (request->dentry() == NULL && in != request->inode()) { + // pin the target inode if its parent dentry is not pinned + request->set_other_inode(in); + } } request->target = in; @@ -1176,7 +1240,7 @@ mds_rank_t Client::choose_target_mds(MetaRequest *req) } } else if (de) { if (de->inode) { - in = de->inode; + in = de->inode.get(); ldout(cct, 20) << "choose_target_mds starting with req->dentry inode " << *in << dendl; } else { in = de->dir->parent_inode; @@ -1192,12 +1256,12 @@ mds_rank_t Client::choose_target_mds(MetaRequest *req) ldout(cct, 10) << "choose_target_mds " << *in << " is snapped, using nonsnap parent" << dendl; while (in->snapid != CEPH_NOSNAP) { if (in->snapid == CEPH_SNAPDIR) - in = in->snapdir_parent; + in = in->snapdir_parent.get(); else if (!in->dn_set.empty()) /* In most cases there will only be one dentry, so getting it * will be the correct action. 
If there are multiple hard links, * I think the MDS should be able to redirect as needed*/ - in = in->get_first_parent()->dir->parent_inode; + in = in->get_first_parent()->dir->parent_inode; else { ldout(cct, 10) << "got unlinked inode, can't look at parent" << dendl; break; @@ -1284,7 +1348,7 @@ void Client::dump_mds_requests(Formatter *f) int Client::verify_reply_trace(int r, MetaRequest *request, MClientReply *reply, - Inode **ptarget, bool *pcreated, + InodeRef *ptarget, bool *pcreated, int uid, int gid) { // check whether this request actually did the create, and set created flag @@ -1307,17 +1371,17 @@ int Client::verify_reply_trace(int r, *pcreated = got_created_ino; if (request->target) { - *ptarget = request->target; - ldout(cct, 20) << "make_request target is " << *request->target << dendl; + ptarget->swap(request->target); + ldout(cct, 20) << "make_request target is " << *ptarget->get() << dendl; } else { if (got_created_ino && (p = inode_map.find(vinodeno_t(created_ino, CEPH_NOSNAP))) != inode_map.end()) { (*ptarget) = p->second; - ldout(cct, 20) << "make_request created, target is " << **ptarget << dendl; + ldout(cct, 20) << "make_request created, target is " << *ptarget->get() << dendl; } else { // we got a traceless reply, and need to look up what we just // created. for now, do this by name. someday, do this by the // ino... which we know! FIXME. - Inode *target = 0; // ptarget may be NULL + InodeRef target; Dentry *d = request->dentry(); if (d) { if (d->dir) { @@ -1339,15 +1403,14 @@ int Client::verify_reply_trace(int r, target = in; } if (r >= 0) { - if (ptarget) - *ptarget = target; - // verify ino returned in reply and trace_dist are the same if (got_created_ino && created_ino.val != target->ino.val) { ldout(cct, 5) << "create got ino " << created_ino << " but then failed on lookup; EINTR?" << dendl; r = -EINTR; } + if (ptarget) + ptarget->swap(target); } } } @@ -1378,7 +1441,7 @@ int Client::verify_reply_trace(int r, */ int Client::make_request(MetaRequest *request, int uid, int gid, - Inode **ptarget, bool *pcreated, + InodeRef *ptarget, bool *pcreated, int use_mds, bufferlist *pdirbl) { @@ -1393,6 +1456,9 @@ int Client::make_request(MetaRequest *request, // make note mds_requests[tid] = request->get(); + if (oldest_tid == 0 && request->get_op() != CEPH_MDS_OP_SETFILELOCK)\ + oldest_tid = tid; + if (uid < 0) { uid = geteuid(); gid = getegid(); @@ -1400,16 +1466,21 @@ int Client::make_request(MetaRequest *request, request->set_caller_uid(uid); request->set_caller_gid(gid); - if (!mds_requests.empty()) - request->set_oldest_client_tid(mds_requests.begin()->first); - else - request->set_oldest_client_tid(tid); // this one is the oldest. + if (cct->_conf->client_inject_fixed_oldest_tid) { + ldout(cct, 20) << __func__ << " injecting fixed oldest_client_tid(1)" << dendl; + request->set_oldest_client_tid(1); + } else { + request->set_oldest_client_tid(oldest_tid); + } // hack target mds? if (use_mds >= 0) request->resend_mds = use_mds; while (1) { + if (request->aborted) + break; + // set up wait cond Cond caller_cond; request->caller_cond = &caller_cond; @@ -1468,10 +1539,21 @@ int Client::make_request(MetaRequest *request, break; } + if (!request->reply) { + assert(request->aborted); + assert(!request->got_unsafe); + request->item.remove_myself(); + unregister_request(request); + put_request(request); // ours + return -ETIMEDOUT; + } + // got it! 
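/*
 * [Illustrative aside, not part of the patch; simplified types.]
 * make_request() above now checks request->aborted and bails out with
 * -ETIMEDOUT when a request never got a reply; the tick() hunk further
 * below is what marks timed-out mount-phase requests aborted and signals
 * the waiter. The shape of that abortable wait, reduced to standard C++:
 */
#include <cerrno>
#include <condition_variable>
#include <mutex>

struct PendingRequest {
  bool done = false;               // reply arrived
  bool aborted = false;            // ticker gave up on us
  std::condition_variable cond;
};

int wait_for_reply(std::mutex& client_lock, PendingRequest& req) {
  std::unique_lock<std::mutex> l(client_lock);
  req.cond.wait(l, [&] { return req.done || req.aborted; });
  return (req.aborted && !req.done) ? -ETIMEDOUT : 0;
}

// Ticker side, holding client_lock:
//   req.aborted = true; req.cond.notify_all();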
MClientReply *reply = request->reply; request->reply = NULL; r = reply->get_result(); + if (r >= 0) + request->success = true; // kick dispatcher (we've got it!) assert(request->dispatch_cond); @@ -1498,16 +1580,41 @@ int Client::make_request(MetaRequest *request, return r; } +void Client::unregister_request(MetaRequest *req) +{ + mds_requests.erase(req->tid); + if (req->tid == oldest_tid) { + map::iterator p = mds_requests.upper_bound(oldest_tid); + while (true) { + if (p == mds_requests.end()) { + oldest_tid = 0; + break; + } + if (p->second->get_op() != CEPH_MDS_OP_SETFILELOCK) { + oldest_tid = p->first; + break; + } + ++p; + } + } + put_request(req); +} + void Client::put_request(MetaRequest *request) { if (request->_put()) { - if (request->inode()) - put_inode(request->take_inode()); - if (request->old_inode()) - put_inode(request->take_old_inode()); - if (request->other_inode()) - put_inode(request->take_other_inode()); + int op = -1; + if (request->success) + op = request->get_op(); + InodeRef other_in; + request->take_other_inode(&other_in); delete request; + + if (other_in) { + if (other_in->dir && + (op == CEPH_MDS_OP_RMDIR || op == CEPH_MDS_OP_RENAME)) + _try_to_trim_inode(other_in.get()); + } } } @@ -1651,6 +1758,10 @@ void Client::populate_metadata() // Ceph entity id (the '0' in "client.0") metadata["entity_id"] = cct->_conf->name.get_id(); + + // Ceph version + metadata["ceph_version"] = pretty_version_to_str(); + metadata["ceph_sha1"] = git_version_to_str(); } /** @@ -1752,6 +1863,10 @@ void Client::handle_client_session(MClientSession *m) session->con->send_message(new MClientSession(CEPH_SESSION_FLUSHMSG_ACK, m->get_seq())); break; + case CEPH_SESSION_FORCE_RO: + force_session_readonly(session); + break; + default: assert(0); } @@ -1772,7 +1887,8 @@ void Client::_kick_stale_sessions() } } -void Client::send_request(MetaRequest *request, MetaSession *session) +void Client::send_request(MetaRequest *request, MetaSession *session, + bool drop_cap_releases) { // make the request mds_rank_t mds = session->mds_num; @@ -1786,7 +1902,10 @@ void Client::send_request(MetaRequest *request, MetaSession *session) r->set_replayed_op(); } else { encode_cap_releases(request, mds); - r->releases.swap(request->cap_releases); + if (drop_cap_releases) // we haven't send cap reconnect yet, drop cap releases + request->cap_releases.clear(); + else + r->releases.swap(request->cap_releases); } r->set_mdsmap_epoch(mdsmap->get_epoch()); @@ -1836,7 +1955,7 @@ MClientRequest* Client::build_client_request(MetaRequest *request) req->set_filepath(request->get_filepath()); req->set_filepath2(request->get_filepath2()); req->set_data(request->data); - req->set_retry_attempt(request->retry_attempt); + req->set_retry_attempt(request->retry_attempt++); req->head.num_fwd = request->num_fwd; return req; } @@ -1881,6 +2000,17 @@ void Client::handle_client_request_forward(MClientRequestForward *fwd) fwd->put(); } +bool Client::is_dir_operation(MetaRequest *req) +{ + int op = req->get_op(); + if (op == CEPH_MDS_OP_MKNOD || op == CEPH_MDS_OP_LINK || + op == CEPH_MDS_OP_UNLINK || op == CEPH_MDS_OP_RENAME || + op == CEPH_MDS_OP_MKDIR || op == CEPH_MDS_OP_RMDIR || + op == CEPH_MDS_OP_SYMLINK || op == CEPH_MDS_OP_CREATE) + return true; + return false; +} + void Client::handle_client_reply(MClientReply *reply) { mds_rank_t mds_num = mds_rank_t(reply->get_source().num()); @@ -1946,6 +2076,11 @@ void Client::handle_client_reply(MClientReply *reply) if (!is_safe) { request->got_unsafe = true; 
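/*
 * [Illustrative aside, not part of the patch; simplified types.]
 * unregister_request() above maintains oldest_tid as a watermark over the
 * outstanding-request map, skipping SETFILELOCK ops, which can stay
 * outstanding indefinitely and must not pin the oldest tid reported to
 * the MDS. The same bookkeeping in miniature:
 */
#include <cstdint>
#include <map>

enum { OP_SETFILELOCK = 1, OP_OTHER = 2 };
struct Req { int op; };

void finish_request(std::map<uint64_t, Req*>& requests,
                    uint64_t tid, uint64_t& oldest_tid) {
  requests.erase(tid);
  if (tid != oldest_tid)
    return;                        // watermark unaffected
  auto p = requests.upper_bound(tid);
  while (p != requests.end() && p->second->op == OP_SETFILELOCK)
    ++p;                           // lock ops must not pin the watermark
  oldest_tid = (p == requests.end()) ? 0 : p->first;
}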
session->unsafe_requests.push_back(&request->unsafe_item); + if (is_dir_operation(request)) { + Inode *dir = request->inode(); + assert(dir); + dir->unsafe_dir_ops.push_back(&request->unsafe_dir_item); + } } // Only signal the caller once (on the first reply): @@ -1970,19 +2105,98 @@ // we're done, clean up if (request->got_unsafe) { request->unsafe_item.remove_myself(); + request->unsafe_dir_item.remove_myself(); + signal_cond_list(request->waitfor_safe); } request->item.remove_myself(); - mds_requests.erase(tid); - put_request(request); + unregister_request(request); } if (unmounting) mount_cond.Signal(); } +void Client::_handle_full_flag(int64_t pool) +{ + ldout(cct, 1) << __func__ << ": FULL: cancelling outstanding operations " + << "on " << pool << dendl; + // Cancel all outstanding ops in this pool with -ENOSPC: it is necessary + // to do this rather than blocking, because otherwise when we fill up we + // potentially lock caps forever on files with dirty pages, and we need + // to be able to release those caps to the MDS so that it can delete files + // and free up space. + epoch_t cancelled_epoch = objecter->op_cancel_writes(-ENOSPC, pool); + + // For all inodes with layouts in this pool and a pending flush write op + // (i.e. one of the ones we will cancel), we've got to purge_set their data + // from ObjectCacher so that it doesn't re-issue the write in response to + // the ENOSPC error. + // Fortunately since we're cancelling everything in a given pool, we don't + // need to know which ops belong to which ObjectSet, we can just blow all + // the un-flushed cached data away and mark any dirty inodes' async_err + // field with -ENOSPC as long as we're sure all the ops we cancelled were + // affecting this pool, and all the objectsets we're purging were also + // in this pool. + for (unordered_map<vinodeno_t, Inode*>::iterator i = inode_map.begin(); + i != inode_map.end(); ++i) + { + Inode *inode = i->second; + if (inode->oset.dirty_or_tx + && (pool == -1 || inode->layout.fl_pg_pool == pool)) { + ldout(cct, 4) << __func__ << ": FULL: inode 0x" << std::hex << i->first << std::dec + << " has dirty objects, purging and setting ENOSPC" << dendl; + objectcacher->purge_set(&inode->oset); + inode->async_err = -ENOSPC; + } + } + + if (cancelled_epoch != (epoch_t)-1) { + set_cap_epoch_barrier(cancelled_epoch); + } +} + +void Client::handle_osd_map(MOSDMap *m) +{ + if (objecter->osdmap_full_flag()) { + _handle_full_flag(-1); + } else { + // Accumulate local list of full pools so that I can drop + // the objecter lock before re-entering objecter in + // cancel_writes + std::vector<int64_t> full_pools; + + const OSDMap *osd_map = objecter->get_osdmap_read(); + const map<int64_t, pg_pool_t>& pools = osd_map->get_pools(); + for (map<int64_t, pg_pool_t>::const_iterator i = pools.begin(); + i != pools.end(); ++i) { + if (i->second.has_flag(pg_pool_t::FLAG_FULL)) { + full_pools.push_back(i->first); + } + } + + objecter->put_osdmap_read(); + + for (std::vector<int64_t>::iterator i = full_pools.begin(); + i != full_pools.end(); ++i) { + _handle_full_flag(*i); + } + + // Subscribe to subsequent maps to watch for the full flag going + // away. For the global full flag objecter does this for us, but + // it pays no attention to the per-pool full flag so in this branch + // we do it ourselves.
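/*
 * [Illustrative aside, not part of the patch; names are hypothetical.]
 * handle_osd_map() above uses a collect-then-act pattern: gather the full
 * pools while holding the osdmap read lock, drop it, then re-enter the
 * objecter per pool. Reduced to standard C++:
 */
#include <cstdint>
#include <map>
#include <mutex>
#include <vector>

std::mutex osdmap_lock;
std::map<int64_t, bool> pool_is_full;        // stand-in for the osdmap

void cancel_pool_writes(int64_t pool) {      // may itself take osdmap_lock
  // ... cancel ops with -ENOSPC, purge dirty caches ...
}

void on_map_update() {
  std::vector<int64_t> full;
  {
    std::lock_guard<std::mutex> l(osdmap_lock);  // hold only while copying
    for (const auto& p : pool_is_full)
      if (p.second)
        full.push_back(p.first);
  }
  for (int64_t pool : full)                  // safe to re-enter, lock dropped
    cancel_pool_writes(pool);
}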
+ if (!full_pools.empty()) { + objecter->maybe_request_map(); + } + } + + m->put(); +} + // ------------------------ // incoming messages + bool Client::ms_dispatch(Message *m) { Mutex::Locker l(client_lock); @@ -2002,7 +2216,7 @@ bool Client::ms_dispatch(Message *m) break; case CEPH_MSG_OSD_MAP: - m->put(); + handle_osd_map(static_cast(m)); break; // requests @@ -2029,6 +2243,10 @@ bool Client::ms_dispatch(Message *m) return false; } break; + case CEPH_MSG_CLIENT_QUOTA: + handle_quota(static_cast(m)); + break; + default: return false; } @@ -2121,6 +2339,7 @@ void Client::handle_mds_map(MMDSMap* m) if (newstate >= MDSMap::STATE_ACTIVE) { if (oldstate < MDSMap::STATE_ACTIVE) { + // kick new requests kick_requests(p->second); kick_flushing_caps(p->second); signal_context_list(p->second->waiting_for_open); @@ -2148,11 +2367,20 @@ void Client::send_reconnect(MetaSession *session) // trim unused caps to reduce MDS's cache rejoin time trim_cache_for_reconnect(session); + session->readonly = false; + if (session->release) { session->release->put(); session->release = NULL; } + // reset my cap seq number + session->seq = 0; + //connect to the mds' offload targets + connect_mds_targets(mds); + //make sure unsafe requests get saved + resend_unsafe_requests(session); + MClientReconnect *m = new MClientReconnect; // i have an open session. @@ -2191,15 +2419,6 @@ void Client::send_reconnect(MetaSession *session) } } } - - // reset my cap seq number - session->seq = 0; - - //connect to the mds' offload targets - connect_mds_targets(mds); - //make sure unsafe requests get saved - resend_unsafe_requests(session); - session->con->send_message(m); mount_cond.Signal(); @@ -2212,6 +2431,10 @@ void Client::kick_requests(MetaSession *session) for (map::iterator p = mds_requests.begin(); p != mds_requests.end(); ++p) { + if (p->second->got_unsafe) + continue; + if (p->second->retry_attempt > 0) + continue; // new requests only if (p->second->mds == session->mds_num) { send_request(p->second, session); } @@ -2224,6 +2447,20 @@ void Client::resend_unsafe_requests(MetaSession *session) !iter.end(); ++iter) send_request(*iter, session); + + // also re-send old requests when MDS enters reconnect stage. So that MDS can + // process completed requests in clientreplay stage. 
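/*
 * [Illustrative aside, not part of the patch; simplified types.]
 * On reconnect the patch partitions outstanding requests: kick_requests()
 * re-sends only fresh ones (never forwarded, no unsafe reply yet), while
 * resend_unsafe_requests() replays unsafe and already-attempted ones, with
 * cap releases dropped, so the MDS can match them during clientreplay.
 * In miniature:
 */
struct MReq { bool got_unsafe = false; int retry_attempt = 0; };

enum class Resend { kFresh, kReplay };

Resend classify(const MReq& r) {
  if (r.got_unsafe)
    return Resend::kReplay;        // replayed via the unsafe list
  if (r.retry_attempt > 0)
    return Resend::kReplay;        // old request: replay, drop cap releases
  return Resend::kFresh;           // new request: send normally
}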
+ for (map::iterator p = mds_requests.begin(); + p != mds_requests.end(); + ++p) { + MetaRequest *req = p->second; + if (req->got_unsafe) + continue; + if (req->retry_attempt == 0) + continue; // old requests only + if (req->mds == session->mds_num) + send_request(req, session, true); + } } void Client::kick_requests_closed(MetaSession *session) @@ -2242,8 +2479,9 @@ void Client::kick_requests_closed(MetaSession *session) if (req->got_unsafe) { lderr(cct) << "kick_requests_closed removing unsafe request " << req->get_tid() << dendl; req->unsafe_item.remove_myself(); - mds_requests.erase(req->get_tid()); - put_request(req); + req->unsafe_dir_item.remove_myself(); + signal_cond_list(req->waitfor_safe); + unregister_request(req); } } } @@ -2321,13 +2559,24 @@ void Client::put_inode(Inode *in, int n) ldout(cct, 10) << "put_inode deleting " << *in << dendl; bool unclean = objectcacher->release_set(&in->oset); assert(!unclean); - if (in->snapdir_parent) - put_inode(in->snapdir_parent); + put_qtree(in); inode_map.erase(in->vino()); in->cap_item.remove_myself(); in->snaprealm_item.remove_myself(); - if (in == root) + in->snapdir_parent.reset(); + if (in == root) { root = 0; + root_ancestor = 0; + while (!root_parents.empty()) + root_parents.erase(root_parents.begin()); + } + + if (!in->oset.objects.empty()) { + ldout(cct, 0) << __func__ << ": leftover objects on inode 0x" + << std::hex << in->ino << std::dec << dendl; + assert(in->oset.objects.empty()); + } + delete in->fcntl_locks; delete in->flock_locks; delete in; @@ -2377,7 +2626,6 @@ Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn) if (in) { // link to inode dn->inode = in; - in->get(); if (in->is_dir()) { if (in->dir) dn->get(); // dir -> dn pin @@ -2404,12 +2652,14 @@ Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn) void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry) { - Inode *in = dn->inode; + InodeRef in; + in.swap(dn->inode); ldout(cct, 15) << "unlink dir " << dn->dir->parent_inode << " '" << dn->name << "' dn " << dn << " inode " << dn->inode << dendl; // unlink from inode if (in) { + invalidate_quota_tree(in.get()); if (in->is_dir()) { if (in->dir) dn->put(); // dir -> dn pin @@ -2420,7 +2670,6 @@ void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry) assert(in->dn_set.count(dn)); in->dn_set.erase(dn); ldout(cct, 20) << "unlink inode " << in << " parents now " << in->dn_set << dendl; - put_inode(in); } if (keepdentry) { @@ -2441,6 +2690,28 @@ void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry) } } +/** + * For asynchronous flushes, check for errors from the IO and + * update the inode if necessary + */ +class C_Client_FlushComplete : public Context { +private: + Client *client; + InodeRef inode; +public: + C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) { } + void finish(int r) { + assert(client->client_lock.is_locked_by_me()); + if (r != 0) { + client_t const whoami = client->whoami; // For the benefit of ldout prefix + ldout(client->cct, 1) << "I/O error from flush on inode " << inode + << " 0x" << std::hex << inode->ino << std::dec + << ": " << r << "(" << cpp_strerror(r) << ")" << dendl; + inode->async_err = r; + } + } +}; + /**** * caps @@ -2506,26 +2777,54 @@ void Client::put_cap_ref(Inode *in, int cap) int Client::get_caps(Inode *in, int need, int want, int *phave, loff_t endoff) { + int r = check_pool_perm(in, need); + if (r < 0) + return r; + while (1) { if (!in->is_any_caps()) return -ESTALE; - if (endoff > 0 && - (endoff >= 
(loff_t)in->max_size || - endoff > (loff_t)(in->size << 1)) && - endoff > (loff_t)in->wanted_max_size) { - ldout(cct, 10) << "wanted_max_size " << in->wanted_max_size << " -> " << endoff << dendl; - in->wanted_max_size = endoff; - check_caps(in, false); + int implemented; + int have = in->caps_issued(&implemented); + + bool waitfor_caps = false; + bool waitfor_commit = false; + + if (have & need & CEPH_CAP_FILE_WR) { + if (endoff > 0 && + (endoff >= (loff_t)in->max_size || + endoff > (loff_t)(in->size << 1)) && + endoff > (loff_t)in->wanted_max_size) { + ldout(cct, 10) << "wanted_max_size " << in->wanted_max_size << " -> " << endoff << dendl; + in->wanted_max_size = endoff; + check_caps(in, false); + } + + if (endoff >= 0 && endoff > (loff_t)in->max_size) { + ldout(cct, 10) << "waiting on max_size, endoff " << endoff << " max_size " << in->max_size << " on " << *in << dendl; + waitfor_caps = true; + } + if (!in->cap_snaps.empty()) { + if (in->cap_snaps.rbegin()->second->writing) { + ldout(cct, 10) << "waiting on cap_snap write to complete" << dendl; + waitfor_caps = true; + } + for (map::iterator p = in->cap_snaps.begin(); + p != in->cap_snaps.end(); + ++p) + if (p->second->dirty_data) { + waitfor_commit = true; + break; + } + if (waitfor_commit) { + _flush(in, new C_Client_FlushComplete(this, in)); + ldout(cct, 10) << "waiting for WRBUFFER to get dropped" << dendl; + } + } } - if (endoff >= 0 && endoff > (loff_t)in->max_size) { - ldout(cct, 10) << "waiting on max_size, endoff " << endoff << " max_size " << in->max_size << " on " << *in << dendl; - } else if (!in->cap_snaps.empty() && in->cap_snaps.rbegin()->second->writing) { - ldout(cct, 10) << "waiting on cap_snap write to complete" << dendl; - } else { - int implemented; - int have = in->caps_issued(&implemented); + if (!waitfor_caps && !waitfor_commit) { if ((have & need) == need) { int butnot = want & ~(have & need); int revoking = implemented & ~have; @@ -2540,9 +2839,17 @@ int Client::get_caps(Inode *in, int need, int want, int *phave, loff_t endoff) } } ldout(cct, 10) << "waiting for caps need " << ccap_string(need) << " want " << ccap_string(want) << dendl; + waitfor_caps = true; } - - wait_on_list(in->waitfor_caps); + + if ((need & CEPH_CAP_FILE_WR) && in->auth_cap && + in->auth_cap->session->readonly) + return -EROFS; + + if (waitfor_caps) + wait_on_list(in->waitfor_caps); + else if (waitfor_commit) + wait_on_list(in->waitfor_commit); } } @@ -2626,7 +2933,8 @@ void Client::send_cap(Inode *in, MetaSession *session, Cap *cap, cap->implemented, want, flush, - cap->mseq); + cap->mseq, + cap_epoch_barrier); m->head.issue_seq = cap->issue_seq; m->set_tid(flush_tid); @@ -2674,6 +2982,13 @@ void Client::check_caps(Inode *in, bool is_delayed) unsigned used = get_caps_used(in); unsigned cap_used; + if (in->is_dir() && (in->flags & I_COMPLETE)) { + // we do this here because we don't want to drop to Fs (and then + // drop the Fs if we do a create!) if that alone makes us send lookups + // to the MDS. 
Doing it in in->caps_wanted() has knock-on effects elsewhere + wanted |= CEPH_CAP_FILE_EXCL; + } + int retain = wanted | used | CEPH_CAP_PIN; if (!unmounting) { if (wanted) @@ -2774,11 +3089,11 @@ void Client::check_caps(Inode *in, bool is_delayed) } -void Client::queue_cap_snap(Inode *in, snapid_t seq) +void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc) { int used = get_caps_used(in); int dirty = in->caps_dirty(); - ldout(cct, 10) << "queue_cap_snap " << *in << " seq " << seq << " used " << ccap_string(used) << dendl; + ldout(cct, 10) << "queue_cap_snap " << *in << " snapc " << old_snapc << " used " << ccap_string(used) << dendl; if (in->cap_snaps.size() && in->cap_snaps.rbegin()->second->writing) { @@ -2787,12 +3102,11 @@ void Client::queue_cap_snap(Inode *in, snapid_t seq) } else if (in->caps_dirty() || (used & CEPH_CAP_FILE_WR) || (dirty & CEPH_CAP_ANY_WR)) { - in->get(); CapSnap *capsnap = new CapSnap(in); - in->cap_snaps[seq] = capsnap; - capsnap->context = in->snaprealm->get_snap_context(); + in->cap_snaps[old_snapc.seq] = capsnap; + capsnap->context = old_snapc; capsnap->issued = in->caps_issued(); - capsnap->dirty = in->caps_dirty(); // a bit conservative? + capsnap->dirty = in->caps_dirty(); capsnap->dirty_data = (used & CEPH_CAP_FILE_BUFFER); @@ -2822,6 +3136,13 @@ void Client::finish_cap_snap(Inode *in, CapSnap *capsnap, int used) capsnap->ctime = in->ctime; capsnap->time_warp_seq = in->time_warp_seq; + capsnap->dirty |= in->caps_dirty(); + + if (capsnap->dirty & CEPH_CAP_FILE_WR) { + capsnap->inline_data = in->inline_data; + capsnap->inline_version = in->inline_version; + } + if (used & CEPH_CAP_FILE_BUFFER) { ldout(cct, 10) << "finish_cap_snap " << *in << " cap_snap " << capsnap << " used " << used << " WRBUFFER, delaying" << dendl; @@ -2876,7 +3197,8 @@ void Client::flush_snaps(Inode *in, bool all_again, CapSnap *again) in->auth_cap->session->flushing_capsnaps.push_back(&capsnap->flushing_item); capsnap->flush_tid = ++in->last_flush_tid; - MClientCaps *m = new MClientCaps(CEPH_CAP_OP_FLUSHSNAP, in->ino, in->snaprealm->ino, 0, mseq); + MClientCaps *m = new MClientCaps(CEPH_CAP_OP_FLUSHSNAP, in->ino, in->snaprealm->ino, 0, mseq, + cap_epoch_barrier); m->set_client_tid(capsnap->flush_tid); m->head.snap_follows = p->first; @@ -2897,6 +3219,11 @@ void Client::flush_snaps(Inode *in, bool all_again, CapSnap *again) capsnap->atime.encode_timeval(&m->head.atime); m->head.time_warp_seq = capsnap->time_warp_seq; + if (capsnap->dirty & CEPH_CAP_FILE_WR) { + m->inline_version = in->inline_version; + m->inline_data = in->inline_data; + } + session->con->send_message(m); } } @@ -2950,13 +3277,12 @@ void Client::wake_inode_waiters(MetaSession *s) class C_Client_CacheInvalidate : public Context { private: Client *client; - Inode *inode; + InodeRef inode; int64_t offset, length; bool keep_caps; public: C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len, bool keep) : client(c), inode(in), offset(off), length(len), keep_caps(keep) { - inode->get(); } void finish(int r) { // _async_invalidate takes the lock when it needs to, call this back from outside of lock. @@ -2965,15 +3291,15 @@ class C_Client_CacheInvalidate : public Context { } }; -void Client::_async_invalidate(Inode *in, int64_t off, int64_t len, bool keep_caps) +void Client::_async_invalidate(InodeRef& in, int64_t off, int64_t len, bool keep_caps) { ldout(cct, 10) << "_async_invalidate " << off << "~" << len << (keep_caps ? 
" keep_caps" : "") << dendl; - ino_invalidate_cb(ino_invalidate_cb_handle, in->vino(), off, len); + ino_invalidate_cb(callback_handle, in->vino(), off, len); client_lock.Lock(); if (!keep_caps) - check_caps(in, false); - put_inode(in); + check_caps(in.get(), false); + in.reset(); // put inode inside client_lock client_lock.Unlock(); ldout(cct, 10) << "_async_invalidate " << off << "~" << len << (keep_caps ? " keep_caps" : "") << " done" << dendl; } @@ -3020,36 +3346,25 @@ void Client::_release(Inode *in) } } - -class C_Client_PutInode : public Context { - Client *client; - Inode *in; -public: - C_Client_PutInode(Client *c, Inode *i) : client(c), in(i) { - in->get(); - } - void finish(int) { - // I am used via ObjectCacher, which is responsible for taking - // the client lock before calling me back. - assert(client->client_lock.is_locked_by_me()); - client->put_inode(in); - } -}; - bool Client::_flush(Inode *in, Context *onfinish) { ldout(cct, 10) << "_flush " << *in << dendl; if (!in->oset.dirty_or_tx) { ldout(cct, 10) << " nothing to flush" << dendl; - if (onfinish) - onfinish->complete(0); + onfinish->complete(0); return true; } - if (!onfinish) { - onfinish = new C_Client_PutInode(this, in); + if (objecter->osdmap_pool_full(in->layout.fl_pg_pool)) { + ldout(cct, 1) << __func__ << ": FULL, purging for ENOSPC" << dendl; + objectcacher->purge_set(&in->oset); + if (onfinish) { + onfinish->complete(-ENOSPC); + } + return true; } + return objectcacher->flush_set(&in->oset, onfinish); } @@ -3209,14 +3524,12 @@ void Client::remove_cap(Cap *cap, bool queue_release) ldout(cct, 10) << "remove_cap mds." << mds << " on " << *in << dendl; if (queue_release) { - if (!session->release) - session->release = new MClientCapRelease; - ceph_mds_cap_item i; - i.ino = in->ino; - i.cap_id = cap->cap_id; - i.seq = cap->issue_seq; - i.migrate_seq = cap->mseq; - session->release->caps.push_back(i); + session->enqueue_cap_release( + in->ino, + cap->cap_id, + cap->issue_seq, + cap->mseq, + cap_epoch_barrier); } if (in->auth_cap == cap) { @@ -3277,20 +3590,38 @@ void Client::remove_session_caps(MetaSession *s) sync_cond.Signal(); } -void Client::_invalidate_kernel_dcache(MetaSession *s) -{ - if (!dentry_invalidate_cb) - return; +class C_Client_Remount : public Context { +private: + Client *client; +public: + C_Client_Remount(Client *c) : client(c) {} + void finish(int r) { + assert (r == 0); + r = client->remount_cb(client->callback_handle); + if (r != 0) { + client_t whoami = client->get_nodeid(); + lderr(client->cct) << "tried to remount (to trim kernel dentries) and got error " + << r << dendl; + if (client->require_remount && !client->unmounting) { + assert(0 == "failed to remount for kernel dentry trimming"); + } + } + } +}; - for (xlist::iterator p = s->caps.begin(); !p.end(); ++p) { - Inode *in = (*p)->inode; - if (in->dn_set.empty()) - continue; - for (set::iterator q = in->dn_set.begin(); - q != in->dn_set.end(); - ++q) { - _schedule_invalidate_dentry_callback(*q, false); +void Client::_invalidate_kernel_dcache() +{ + if (can_invalidate_dentries && dentry_invalidate_cb && root->dir) { + for (ceph::unordered_map::iterator p = root->dir->dentries.begin(); + p != root->dir->dentries.end(); + ++p) { + if (p->second->inode) + _schedule_invalidate_dentry_callback(p->second, false); } + } else if (remount_cb) { + // Hacky: + // when remounting a file system, linux kernel trims all unused dentries in the fs + remount_finisher.queue(new C_Client_Remount(this)); } } @@ -3319,11 +3650,17 @@ void 
Client::trim_caps(MetaSession *s, int max) ldout(cct, 20) << " trying to trim dentries for " << *in << dendl; bool all = true; set::iterator q = in->dn_set.begin(); - in->get(); + InodeRef tmp_ref(in); while (q != in->dn_set.end()) { Dentry *dn = *q++; if (dn->lru_is_expireable()) { - _schedule_invalidate_dentry_callback(dn, false); + if (can_invalidate_dentries && + dn->dir->parent_inode->ino == MDS_INO_ROOT) { + // Only issue one of these per DN for inodes in root: handle + // others more efficiently by calling for root-child DNs at + // the end of this function. + _schedule_invalidate_dentry_callback(dn, true); + } trim_dentry(dn); } else { ldout(cct, 20) << " not expirable: " << dn->name << dendl; @@ -3334,8 +3671,6 @@ void Client::trim_caps(MetaSession *s, int max) ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl; trimmed++; } - - put_inode(in); } ++p; @@ -3345,6 +3680,19 @@ void Client::trim_caps(MetaSession *s, int max) } } s->s_cap_iterator = NULL; + + if (s->caps.size() > max) + _invalidate_kernel_dcache(); +} + +void Client::force_session_readonly(MetaSession *s) +{ + s->readonly = true; + for (xlist::iterator p = s->caps.begin(); !p.end(); ++p) { + Inode *in = (*p)->inode; + if (in->caps_wanted() & CEPH_CAP_FILE_WR) + signal_cond_list(in->waitfor_caps); + } } void Client::mark_caps_dirty(Inode *in, int caps) @@ -3363,8 +3711,9 @@ int Client::mark_caps_flushing(Inode *in) int flushing = in->dirty_caps; assert(flushing); - if (flushing && !in->flushing_caps) { + if (!in->flushing_caps) { ldout(cct, 10) << "mark_caps_flushing " << ccap_string(flushing) << " " << *in << dendl; + in->flushing_cap_seq = ++last_flush_seq; num_flushing_caps++; } else { ldout(cct, 10) << "mark_caps_flushing (more) " << ccap_string(flushing) << " " << *in << dendl; @@ -3373,7 +3722,6 @@ int Client::mark_caps_flushing(Inode *in) in->flushing_caps |= flushing; in->dirty_caps = 0; - in->flushing_cap_seq = ++last_flush_seq; session->flushing_caps.push_back(&in->flushing_cap_item); @@ -3410,6 +3758,23 @@ void Client::flush_caps(Inode *in, MetaSession *session) (cap->issued | cap->implemented), in->flushing_caps); } +void Client::wait_sync_caps(Inode *in, uint16_t flush_tid[]) +{ +retry: + for (int i = 0; i < CEPH_CAP_BITS; ++i) { + if (!(in->flushing_caps & (1 << i))) + continue; + // handle uint16_t wrapping + if ((int16_t)(in->flushing_cap_tid[i] - flush_tid[i]) <= 0) { + ldout(cct, 10) << "wait_sync_caps on " << *in << " flushing " + << ccap_string(1 << i) << " want " << flush_tid[i] + << " last " << in->flushing_cap_tid[i] << dendl; + wait_on_list(in->waitfor_caps); + goto retry; + } + } +} + void Client::wait_sync_caps(uint64_t want) { retry: @@ -3437,10 +3802,10 @@ void Client::kick_flushing_caps(MetaSession *session) for (xlist::iterator p = session->flushing_capsnaps.begin(); !p.end(); ++p) { CapSnap *capsnap = *p; - Inode *in = capsnap->in; + InodeRef& in = capsnap->in; ldout(cct, 20) << " reflushing capsnap " << capsnap << " on " << *in << " to mds." 
<< mds << dendl; - flush_snaps(in, false, capsnap); + flush_snaps(in.get(), false, capsnap); } for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) { Inode *in = *p; @@ -3539,6 +3904,10 @@ void Client::put_snap_realm(SnapRealm *realm) << " " << realm->nref << " -> " << (realm->nref - 1) << dendl; if (--realm->nref == 0) { snap_realms.erase(realm->ino); + if (realm->pparent) { + realm->pparent->pchildren.erase(realm); + put_snap_realm(realm->pparent); + } delete realm; } } @@ -3560,12 +3929,20 @@ bool Client::adjust_realm_parent(SnapRealm *realm, inodeno_t parent) return false; } +static bool has_new_snaps(const SnapContext& old_snapc, + const SnapContext& new_snapc) +{ + return !new_snapc.snaps.empty() && new_snapc.snaps[0] > old_snapc.seq; +} + inodeno_t Client::update_snap_trace(bufferlist& bl, bool flush) { inodeno_t first_realm = 0; ldout(cct, 10) << "update_snap_trace len " << bl.length() << dendl; + map<SnapRealm*, SnapContext> dirty_realms; + bufferlist::iterator p = bl.begin(); while (!p.end()) { SnapRealmInfo info; @@ -3574,6 +3951,8 @@ first_realm = info.ino(); SnapRealm *realm = get_snap_realm(info.ino()); + bool invalidate = false; + if (info.seq() > realm->seq) { ldout(cct, 10) << "update_snap_trace " << *realm << " seq " << info.seq() << " > " << realm->seq << dendl; @@ -3586,28 +3965,19 @@ while (!q.empty()) { SnapRealm *realm = q.front(); q.pop_front(); - ldout(cct, 10) << " flushing caps on " << *realm << dendl; - - xlist<Inode*>::iterator p = realm->inodes_with_caps.begin(); - while (!p.end()) { - Inode *in = *p; - ++p; - queue_cap_snap(in, realm->get_snap_context().seq); - } for (set<SnapRealm*>::iterator p = realm->pchildren.begin(); - p != realm->pchildren.end(); + p != realm->pchildren.end(); ++p) q.push_back(*p); + + if (dirty_realms.count(realm) == 0) { + realm->nref++; + dirty_realms[realm] = realm->get_snap_context(); + } } } - } - - // _always_ verify parent - bool invalidate = adjust_realm_parent(realm, info.parent()); - - if (info.seq() > realm->seq) { // update realm->seq = info.seq(); realm->created = info.created(); @@ -3616,6 +3986,11 @@ realm->my_snaps = info.my_snaps; invalidate = true; } + + // _always_ verify parent + if (adjust_realm_parent(realm, info.parent())) + invalidate = true; + if (invalidate) { invalidate_snaprealm_and_children(realm); ldout(cct, 15) << "update_snap_trace " << *realm << " self|parent updated" << dendl; @@ -3628,7 +4003,26 @@ put_snap_realm(realm); } - return first_realm; + for (map<SnapRealm*, SnapContext>::iterator q = dirty_realms.begin(); + q != dirty_realms.end(); + ++q) { + SnapRealm *realm = q->first; + // if there are new snaps ? + if (has_new_snaps(q->second, realm->get_snap_context())) { + ldout(cct, 10) << " flushing caps on " << *realm << dendl; + xlist<Inode*>::iterator r = realm->inodes_with_caps.begin(); + while (!r.end()) { + Inode *in = *r; + ++r; + queue_cap_snap(in, q->second); + } + } else { + ldout(cct, 10) << " no new snap on " << *realm << dendl; + } + put_snap_realm(realm); + } + + return first_realm; } void Client::handle_snap(MClientSnap *m) @@ -3645,6 +4039,7 @@ list<Inode*> to_move; SnapRealm *realm = 0; + SnapContext old_snapc; if (m->head.op == CEPH_SNAP_OP_SPLIT) { assert(m->head.split); @@ -3656,6 +4051,7 @@ // flush, then move, ino's. realm = get_snap_realm(info.ino()); ldout(cct, 10) << " splitting off " << *realm << dendl; + old_snapc = realm->get_snap_context(); for (vector<inodeno_t>::iterator p = m->split_inos.begin(); p != m->split_inos.end(); ++p) { @@ -3671,8 +4067,6 @@ } ldout(cct, 10) << " moving " << *in << " from " << *in->snaprealm << dendl; - // queue for snap writeback - queue_cap_snap(in, in->snaprealm->get_snap_context().seq); in->snaprealm_item.remove_myself(); put_snap_realm(in->snaprealm); @@ -3696,11 +4090,15 @@ update_snap_trace(m->bl, m->head.op != CEPH_SNAP_OP_DESTROY); if (realm) { + bool queue_snap = has_new_snaps(old_snapc, realm->get_snap_context()); for (list<Inode*>::iterator p = to_move.begin(); p != to_move.end(); ++p) { Inode *in = *p; in->snaprealm = realm; realm->inodes_with_caps.push_back(&in->snaprealm_item); realm->nref++; + // queue for snap writeback + if (queue_snap) + queue_cap_snap(in, old_snapc); } put_snap_realm(realm); } @@ -3708,6 +4106,35 @@ m->put(); } +void Client::handle_quota(MClientQuota *m) +{ + mds_rank_t mds = mds_rank_t(m->get_source().num()); + MetaSession *session = _get_mds_session(mds, m->get_connection().get()); + if (!session) { + m->put(); + return; + } + + got_mds_push(session); + + ldout(cct, 10) << "handle_quota " << *m << " from mds."
<< mds << dendl; + + vinodeno_t vino(m->ino, CEPH_NOSNAP); + if (inode_map.count(vino)) { + Inode *in = NULL; + in = inode_map[vino]; + + if (in) { + if (in->quota.is_enable() ^ m->quota.is_enable()) + invalidate_quota_tree(in); + in->quota = m->quota; + in->rstat = m->rstat; + } + } + + m->put(); +} + void Client::handle_caps(MClientCaps *m) { mds_rank_t mds = mds_rank_t(m->get_source().num()); @@ -3716,6 +4143,17 @@ void Client::handle_caps(MClientCaps *m) m->put(); return; } + + if (m->osd_epoch_barrier && !objecter->have_map(m->osd_epoch_barrier)) { + // Pause RADOS operations until we see the required epoch + objecter->set_epoch_barrier(m->osd_epoch_barrier); + } + + if (m->osd_epoch_barrier > cap_epoch_barrier) { + // Record the barrier so that we will transmit it to MDS when releasing + set_cap_epoch_barrier(m->osd_epoch_barrier); + } + got_mds_push(session); m->clear_payload(); // for if/when we send back to MDS @@ -3727,14 +4165,12 @@ void Client::handle_caps(MClientCaps *m) if (!in) { if (m->get_op() == CEPH_CAP_OP_IMPORT) { ldout(cct, 5) << "handle_caps don't have vino " << vino << " on IMPORT, immediately releasing" << dendl; - if (!session->release) - session->release = new MClientCapRelease; - ceph_mds_cap_item i; - i.ino = m->get_ino(); - i.cap_id = m->get_cap_id(); - i.seq = m->get_seq(); - i.migrate_seq = m->get_mseq(); - session->release->caps.push_back(i); + session->enqueue_cap_release( + m->get_ino(), + m->get_cap_id(), + m->get_seq(), + m->get_mseq(), + cap_epoch_barrier); } else { ldout(cct, 5) << "handle_caps don't have vino " << vino << ", dropping" << dendl; } @@ -3897,6 +4333,7 @@ void Client::handle_cap_flush_ack(MetaSession *session, Inode *in, Cap *cap, MCl num_flushing_caps--; sync_cond.Signal(); } + signal_cond_list(in->waitfor_caps); if (!in->caps_dirty()) put_inode(in); } @@ -3919,10 +4356,9 @@ void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, MClientCa } else { ldout(cct, 5) << "handle_cap_flushedsnap mds." << mds << " flushed snap follows " << follows << " on " << *in << dendl; + in->cap_snaps.erase(follows); capsnap->flushing_item.remove_myself(); delete capsnap; - in->cap_snaps.erase(follows); - put_inode(in); } } else { ldout(cct, 5) << "handle_cap_flushedsnap DUP(?) mds." 
<< mds << " flushed snap follows " << follows @@ -3959,7 +4395,7 @@ void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& { ldout(cct, 10) << "_async_dentry_invalidate '" << name << "' ino " << ino << " in dir " << dirino << dendl; - dentry_invalidate_cb(dentry_invalidate_cb_handle, dirino, ino, name); + dentry_invalidate_cb(callback_handle, dirino, ino, name); } void Client::_schedule_invalidate_dentry_callback(Dentry *dn, bool del) @@ -3968,8 +4404,24 @@ void Client::_schedule_invalidate_dentry_callback(Dentry *dn, bool del) async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn, del)); } -void Client::_invalidate_inode_parents(Inode *in) +void Client::_try_to_trim_inode(Inode *in) { + int ref = in->get_num_ref(); + + if (in->dir && !in->dir->dentry_list.empty()) { + for (xlist::iterator p = in->dir->dentry_list.begin(); + !p.end(); ) { + Dentry *dn = *p; + ++p; + if (dn->lru_is_expireable()) + unlink(dn, false, false); // close dir, drop dentry + } + --ref; + } + // make sure inode was not freed when closing dir + if (ref == 0) + return; + set::iterator q = in->dn_set.begin(); while (q != in->dn_set.end()) { Dentry *dn = *q++; @@ -4047,9 +4499,8 @@ void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, MClient cap->issued = new_caps; cap->implemented |= new_caps; - - if (((used & ~new_caps) & CEPH_CAP_FILE_BUFFER) && - !_flush(in)) { + if (((used & ~new_caps) & CEPH_CAP_FILE_BUFFER) + && !_flush(in, new C_Client_FlushComplete(this, in))) { // waitin' for flush } else if ((old_caps & ~new_caps) & CEPH_CAP_FILE_CACHE) { _release(in); @@ -4087,7 +4538,7 @@ void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, MClient // may drop inode's last ref if (deleted_inode) - _invalidate_inode_parents(in); + _try_to_trim_inode(in); m->put(); } @@ -4097,7 +4548,7 @@ int Client::check_permissions(Inode *in, int flags, int uid, int gid) gid_t *sgids = NULL; int sgid_count = 0; if (getgroups_cb) { - sgid_count = getgroups_cb(getgroups_cb_handle, uid, &sgids); + sgid_count = getgroups_cb(callback_handle, uid, &sgids); if (sgid_count < 0) { ldout(cct, 3) << "getgroups failed!" << dendl; return sgid_count; @@ -4352,28 +4803,36 @@ int Client::mount(const std::string &mount_root) return r; } - mounted = true; - tick(); // start tick ldout(cct, 2) << "mounted: have mdsmap " << mdsmap->get_epoch() << dendl; // hack: get+pin root inode. // fuse assumes it's always there. - MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR); filepath fp(CEPH_INO_ROOT); if (!mount_root.empty()) fp = filepath(mount_root.c_str()); - req->set_filepath(fp); - req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL; - int res = make_request(req, -1, -1); - ldout(cct, 10) << "root getattr result=" << res << dendl; - if (res < 0) - return res; + while (true) { + MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR); + req->set_filepath(fp); + req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL; + int res = make_request(req, -1, -1); + ldout(cct, 10) << "root getattr result=" << res << dendl; + if (res < 0) + return res; + + if (fp.depth()) + fp.pop_dentry(); + else + break; + } + assert(root_ancestor->is_root()); assert(root); _ll_get(root); + mounted = true; + // trace? 
if (!cct->_conf->client_trace.empty()) { traceout.open(cct->_conf->client_trace.c_str()); @@ -4417,15 +4876,13 @@ void Client::unmount() timer.cancel_event(tick_event); tick_event = 0; - if (cwd) - put_inode(cwd); - cwd = NULL; + cwd.reset(); // clean up any unclosed files while (!fd_map.empty()) { Fh *fh = fd_map.begin()->second; fd_map.erase(fd_map.begin()); - ldout(cct, 0) << " destroying lost open file " << fh << " on " << *fh->inode << dendl; + ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *fh->inode << dendl; _release_fh(fh); } @@ -4450,10 +4907,9 @@ void Client::unmount() assert(in); } if (!in->caps.empty()) { - in->get(); + InodeRef tmp_ref(in); _release(in); - _flush(in); - put_inode(in); + _flush(in, new C_Client_FlushComplete(this, in)); } } } @@ -4552,6 +5008,22 @@ void Client::tick() utime_t now = ceph_clock_now(cct); + if (!mounted && !mds_requests.empty()) { + MetaRequest *req = mds_requests.begin()->second; + if (req->op_stamp + cct->_conf->client_mount_timeout < now) { + req->aborted = true; + if (req->caller_cond) { + req->kick = true; + req->caller_cond->Signal(); + } + signal_cond_list(waiting_for_mdsmap); + for (map::iterator p = mds_sessions.begin(); + p != mds_sessions.end(); + ++p) + signal_context_list(p->second->waiting_for_open); + } + } + if (mdsmap->get_epoch()) { // renew caps? utime_t el = now - last_cap_renew; @@ -4573,6 +5045,7 @@ void Client::tick() check_caps(in, true); } + trim_cache(true); } void Client::renew_caps() @@ -4601,7 +5074,7 @@ void Client::renew_caps(MetaSession *session) // =============================================================== // high level (POSIXy) interface -int Client::_do_lookup(Inode *dir, const string& name, Inode **target) +int Client::_do_lookup(Inode *dir, const string& name, InodeRef *target) { int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP; MetaRequest *req = new MetaRequest(op); @@ -4618,7 +5091,7 @@ int Client::_do_lookup(Inode *dir, const string& name, Inode **target) return r; } -int Client::_lookup(Inode *dir, const string& dname, Inode **target) +int Client::_lookup(Inode *dir, const string& dname, InodeRef *target) { int r = 0; Dentry *dn = NULL; @@ -4678,9 +5151,14 @@ int Client::_lookup(Inode *dir, const string& dname, Inode **target) << " vs lease_gen " << dn->lease_gen << dendl; } // dir lease? 
- if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED) && - dn->cap_shared_gen == dir->shared_gen) { - goto hit_dn; + if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED)) { + if (dn->cap_shared_gen == dir->shared_gen) + goto hit_dn; + if (!dn->inode && (dir->flags & I_COMPLETE)) { + ldout(cct, 10) << "_lookup concluded ENOENT locally for " + << *dir << " dn '" << dname << "'" << dendl; + return -ENOENT; + } } } else { ldout(cct, 20) << " no cap on " << dn->inode->vino() << dendl; @@ -4745,10 +5223,10 @@ int Client::get_or_create(Inode *dir, const char* name, return 0; } -int Client::path_walk(const filepath& origpath, Inode **final, bool followsym) +int Client::path_walk(const filepath& origpath, InodeRef *end, bool followsym) { filepath path = origpath; - Inode *cur; + InodeRef cur; if (origpath.absolute()) cur = root; else @@ -4764,8 +5242,8 @@ int Client::path_walk(const filepath& origpath, Inode **final, bool followsym) const string &dname = path[i]; ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl; ldout(cct, 20) << " (path is " << path << ")" << dendl; - Inode *next; - int r = _lookup(cur, dname, &next); + InodeRef next; + int r = _lookup(cur.get(), dname, &next); if (r < 0) return r; // only follow trailing symlink if followsym. always follow @@ -4805,13 +5283,13 @@ int Client::path_walk(const filepath& origpath, Inode **final, bool followsym) continue; } } - cur = next; + cur.swap(next); i++; } if (!cur) return -ENOENT; - if (final) - *final = cur; + if (end) + end->swap(cur); return 0; } @@ -4830,18 +5308,15 @@ int Client::link(const char *relexisting, const char *relpath) string name = path.last_dentry(); path.pop_dentry(); - Inode *in, *dir; + InodeRef in, dir; int r; r = path_walk(existing, &in); if (r < 0) goto out; - in->get(); r = path_walk(path, &dir); if (r < 0) - goto out_unlock; - r = _link(in, dir, name.c_str()); - out_unlock: - put_inode(in); + goto out; + r = _link(in.get(), dir.get(), name.c_str()); out: return r; } @@ -4855,11 +5330,11 @@ int Client::unlink(const char *relpath) filepath path(relpath); string name = path.last_dentry(); path.pop_dentry(); - Inode *dir; + InodeRef dir; int r = path_walk(path, &dir); if (r < 0) return r; - return _unlink(dir, name.c_str()); + return _unlink(dir.get(), name.c_str()); } int Client::rename(const char *relfrom, const char *relto) @@ -4876,21 +5351,16 @@ int Client::rename(const char *relfrom, const char *relto) string toname = to.last_dentry(); to.pop_dentry(); - Inode *fromdir, *todir; + InodeRef fromdir, todir; int r; r = path_walk(from, &fromdir); if (r < 0) goto out; - fromdir->get(); r = path_walk(to, &todir); if (r < 0) - goto out_unlock; - todir->get(); - r = _rename(fromdir, fromname.c_str(), todir, toname.c_str()); - put_inode(todir); - out_unlock: - put_inode(fromdir); + goto out; + r = _rename(fromdir.get(), fromname.c_str(), todir.get(), toname.c_str()); out: return r; } @@ -4908,12 +5378,12 @@ int Client::mkdir(const char *relpath, mode_t mode) filepath path(relpath); string name = path.last_dentry(); path.pop_dentry(); - Inode *dir; + InodeRef dir; int r = path_walk(path, &dir); if (r < 0) { return r; } - return _mkdir(dir, name.c_str(), mode); + return _mkdir(dir.get(), name.c_str(), mode); } int Client::mkdirs(const char *relpath, mode_t mode) @@ -4928,12 +5398,12 @@ int Client::mkdirs(const char *relpath, mode_t mode) filepath path(relpath); unsigned int i; int r=0; - Inode *cur = cwd; - Inode *next; + InodeRef cur, next; + cur = cwd; for (i=0; iino).get_path() << dendl; } @@ -4967,11 +5437,11 @@ 
int Client::rmdir(const char *relpath) filepath path(relpath); string name = path.last_dentry(); path.pop_dentry(); - Inode *dir; + InodeRef dir; int r = path_walk(path, &dir); if (r < 0) return r; - return _rmdir(dir, name.c_str()); + return _rmdir(dir.get(), name.c_str()); } int Client::mknod(const char *relpath, mode_t mode, dev_t rdev) @@ -4984,11 +5454,11 @@ int Client::mknod(const char *relpath, mode_t mode, dev_t rdev) filepath path(relpath); string name = path.last_dentry(); path.pop_dentry(); - Inode *in; + InodeRef in; int r = path_walk(path, &in); if (r < 0) return r; - return _mknod(in, name.c_str(), mode, rdev); + return _mknod(in.get(), name.c_str(), mode, rdev); } // symlinks @@ -5003,11 +5473,11 @@ int Client::symlink(const char *target, const char *relpath) filepath path(relpath); string name = path.last_dentry(); path.pop_dentry(); - Inode *dir; + InodeRef dir; int r = path_walk(path, &dir); if (r < 0) return r; - return _symlink(dir, name.c_str(), target); + return _symlink(dir.get(), name.c_str(), target); } int Client::readlink(const char *relpath, char *buf, loff_t size) @@ -5017,12 +5487,12 @@ int Client::readlink(const char *relpath, char *buf, loff_t size) tout(cct) << relpath << std::endl; filepath path(relpath); - Inode *in; + InodeRef in; int r = path_walk(path, &in, false); if (r < 0) return r; - return _readlink(in, buf, size); + return _readlink(in.get(), buf, size); } int Client::_readlink(Inode *in, char *buf, size_t size) @@ -5062,7 +5532,7 @@ int Client::_getattr(Inode *in, int mask, int uid, int gid, bool force) } int Client::_setattr(Inode *in, struct stat *attr, int mask, int uid, int gid, - Inode **inp) + InodeRef *inp) { int issued = in->caps_issued(); @@ -5072,6 +5542,11 @@ int Client::_setattr(Inode *in, struct stat *attr, int mask, int uid, int gid, if (in->snapid != CEPH_NOSNAP) { return -EROFS; } + if ((mask & CEPH_SETATTR_SIZE) && + (unsigned long)attr->st_size > in->size && + is_quota_bytes_exceeded(in, (unsigned long)attr->st_size - in->size)) { + return -EDQUOT; + } // make the change locally? 
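/*
 * [Illustrative aside, not part of the patch; hypothetical types. The
 * real is_quota_bytes_exceeded() semantics are an assumption here.]
 * The _setattr() hunk above rejects a truncate-up with -EDQUOT when the
 * grown bytes would exceed a quota. Conceptually, a byte-quota check walks
 * toward the quota root and lets any enabled quota veto the growth:
 */
#include <cerrno>
#include <cstdint>

struct QuotaNode {
  uint64_t max_bytes = 0;          // 0 means no byte quota at this level
  uint64_t used_bytes = 0;
  QuotaNode *parent = nullptr;
};

int check_bytes_growth(const QuotaNode *n, uint64_t grow) {
  for (; n; n = n->parent)
    if (n->max_bytes && n->used_bytes + grow > n->max_bytes)
      return -EDQUOT;              // some ancestor quota vetoes the growth
  return 0;
}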
if (!mask) { @@ -5186,11 +5661,11 @@ int Client::setattr(const char *relpath, struct stat *attr, int mask) tout(cct) << mask << std::endl; filepath path(relpath); - Inode *in; + InodeRef in; int r = path_walk(path, &in); if (r < 0) return r; - return _setattr(in, attr, mask); + return _setattr(in, attr, mask); } int Client::fsetattr(int fd, struct stat *attr, int mask) @@ -5203,7 +5678,11 @@ int Client::fsetattr(int fd, struct stat *attr, int mask) Fh *f = get_filehandle(fd); if (!f) return -EBADF; - return _setattr(f->inode, attr, mask); +#if defined(__linux__) && defined(O_PATH) + if (f->flags & O_PATH) + return -EBADF; +#endif + return _setattr(f->inode, attr, mask); } int Client::stat(const char *relpath, struct stat *stbuf, @@ -5214,7 +5693,7 @@ int Client::stat(const char *relpath, struct stat *stbuf, tout(cct) << "stat" << std::endl; tout(cct) << relpath << std::endl; filepath path(relpath); - Inode *in; + InodeRef in; int r = path_walk(path, &in); if (r < 0) return r; @@ -5236,7 +5715,7 @@ int Client::lstat(const char *relpath, struct stat *stbuf, tout(cct) << "lstat" << std::endl; tout(cct) << relpath << std::endl; filepath path(relpath); - Inode *in; + InodeRef in; // don't follow symlinks int r = path_walk(path, &in, false); if (r < 0) @@ -5305,7 +5784,7 @@ int Client::chmod(const char *relpath, mode_t mode) tout(cct) << relpath << std::endl; tout(cct) << mode << std::endl; filepath path(relpath); - Inode *in; + InodeRef in; int r = path_walk(path, &in); if (r < 0) return r; @@ -5323,6 +5802,10 @@ int Client::fchmod(int fd, mode_t mode) Fh *f = get_filehandle(fd); if (!f) return -EBADF; +#if defined(__linux__) && defined(O_PATH) + if (f->flags & O_PATH) + return -EBADF; +#endif struct stat attr; attr.st_mode = mode; return _setattr(f->inode, &attr, CEPH_SETATTR_MODE); @@ -5335,7 +5818,7 @@ int Client::lchmod(const char *relpath, mode_t mode) tout(cct) << relpath << std::endl; tout(cct) << mode << std::endl; filepath path(relpath); - Inode *in; + InodeRef in; // don't follow symlinks int r = path_walk(path, &in, false); if (r < 0) @@ -5353,7 +5836,7 @@ int Client::chown(const char *relpath, int uid, int gid) tout(cct) << uid << std::endl; tout(cct) << gid << std::endl; filepath path(relpath); - Inode *in; + InodeRef in; int r = path_walk(path, &in); if (r < 0) return r; @@ -5376,6 +5859,10 @@ int Client::fchown(int fd, int uid, int gid) Fh *f = get_filehandle(fd); if (!f) return -EBADF; +#if defined(__linux__) && defined(O_PATH) + if (f->flags & O_PATH) + return -EBADF; +#endif struct stat attr; attr.st_uid = uid; attr.st_gid = gid; @@ -5393,7 +5880,7 @@ int Client::lchown(const char *relpath, int uid, int gid) tout(cct) << uid << std::endl; tout(cct) << gid << std::endl; filepath path(relpath); - Inode *in; + InodeRef in; // don't follow symlinks int r = path_walk(path, &in, false); if (r < 0) @@ -5415,7 +5902,7 @@ int Client::utime(const char *relpath, struct utimbuf *buf) tout(cct) << buf->modtime << std::endl; tout(cct) << buf->actime << std::endl; filepath path(relpath); - Inode *in; + InodeRef in; int r = path_walk(path, &in); if (r < 0) return r; @@ -5435,7 +5922,7 @@ int Client::lutime(const char *relpath, struct utimbuf *buf) tout(cct) << buf->modtime << std::endl; tout(cct) << buf->actime << std::endl; filepath path(relpath); - Inode *in; + InodeRef in; // don't follow symlinks int r = path_walk(path, &in, false); if (r < 0) @@ -5448,17 +5935,31 @@ int Client::lutime(const char *relpath, struct utimbuf *buf) return _setattr(in, &attr, 
CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME); } +int Client::flock(int fd, int operation, uint64_t owner) +{ + Mutex::Locker lock(client_lock); + tout(cct) << "flock" << std::endl; + tout(cct) << fd << std::endl; + tout(cct) << operation << std::endl; + tout(cct) << owner << std::endl; + Fh *f = get_filehandle(fd); + if (!f) + return -EBADF; + + return _flock(f, operation, owner, NULL); +} + int Client::opendir(const char *relpath, dir_result_t **dirpp) { Mutex::Locker lock(client_lock); tout(cct) << "opendir" << std::endl; tout(cct) << relpath << std::endl; filepath path(relpath); - Inode *in; + InodeRef in; int r = path_walk(path, &in); if (r < 0) return r; - r = _opendir(in, dirpp); + r = _opendir(in.get(), dirpp); tout(cct) << (unsigned long)*dirpp << std::endl; return r; } @@ -5496,8 +5997,7 @@ void Client::_closedir(dir_result_t *dirp) ldout(cct, 10) << "_closedir(" << dirp << ")" << dendl; if (dirp->inode) { ldout(cct, 10) << "_closedir detaching inode " << dirp->inode << dendl; - put_inode(dirp->inode); - dirp->inode = 0; + dirp->inode.reset(); } _readdir_drop_dirp_buffer(dirp); delete dirp; @@ -5598,8 +6098,6 @@ void Client::_readdir_drop_dirp_buffer(dir_result_t *dirp) { ldout(cct, 10) << "_readdir_drop_dirp_buffer " << dirp << dendl; if (dirp->buffer) { - for (unsigned i = 0; i < dirp->buffer->size(); i++) - put_inode((*dirp->buffer)[i].second); delete dirp->buffer; dirp->buffer = NULL; } @@ -5621,13 +6119,13 @@ int Client::_readdir_get_frag(dir_result_t *dirp) if (dirp->inode && dirp->inode->snapid == CEPH_SNAPDIR) op = CEPH_MDS_OP_LSSNAP; - Inode *diri = dirp->inode; + InodeRef& diri = dirp->inode; MetaRequest *req = new MetaRequest(op); filepath path; diri->make_nosnap_relative_path(path); req->set_filepath(path); - req->set_inode(diri); + req->set_inode(diri.get()); req->head.args.readdir.frag = fg; if (dirp->last_name.length()) { req->path2.set_path(dirp->last_name.c_str()); @@ -5652,7 +6150,7 @@ int Client::_readdir_get_frag(dir_result_t *dirp) _readdir_drop_dirp_buffer(dirp); - dirp->buffer = new vector<pair<string,Inode*> >; + dirp->buffer = new vector<pair<string,InodeRef> >; dirp->buffer->swap(req->readdir_result); if (fg != req->readdir_reply_frag) { @@ -5711,8 +6209,12 @@ int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p) ++pd; } - string prev_name; - while (!pd.end()) { + string dn_name; + while (true) { + if (!dirp->inode->is_complete_and_ordered()) + return -EAGAIN; + if (pd.end()) + break; Dentry *dn = *pd; if (dn->inode == NULL) { ldout(cct, 15) << " skipping null '" << dn->name << "'" << dendl; @@ -5727,7 +6229,7 @@ int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p) struct stat st; struct dirent de; - int stmask = fill_stat(dn->inode, &st); + int stmask = fill_stat(dn->inode, &st); fill_dirent(&de, dn->name.c_str(), st.st_mode, st.st_ino, dirp->offset + 1); uint64_t next_off = dn->offset + 1; @@ -5735,6 +6237,8 @@ int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p) if (pd.end()) next_off = dir_result_t::END; + dn_name = dn->name; // fill in name while we have lock + client_lock.Unlock(); int r = cb(p, &de, &st, stmask, next_off); // _next_ offset client_lock.Lock(); @@ -5742,13 +6246,12 @@ << " = " << r << dendl; if (r < 0) { - dirp->next_offset = dn->offset; - dirp->at_cache_name = prev_name; + dirp->next_offset = next_off - 1; return r; } - prev_name = dn->name; - dirp->offset = next_off; + dirp->next_offset = dirp->offset = next_off; +
dirp->at_cache_name = dn_name; // we successfully returned this one; update! if (r > 0) return r; } @@ -5777,7 +6280,7 @@ int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p) frag_t fg = dirp->frag(); uint32_t off = dirp->fragpos(); - Inode *diri = dirp->inode; + InodeRef& diri = dirp->inode; if (dirp->at_end()) return 0; @@ -5805,7 +6308,7 @@ int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p) if (dirp->offset == 1) { ldout(cct, 15) << " including .." << dendl; if (!diri->dn_set.empty()) { - Inode* in = diri->get_first_parent()->inode; + InodeRef& in = diri->get_first_parent()->inode; fill_dirent(&de, "..", S_IFDIR, in->ino, 2); fill_stat(in, &st); } else { @@ -5866,9 +6369,9 @@ int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p) dirp->offset = dir_result_t::make_fpos(fg, off); while (off >= dirp->this_offset && off - dirp->this_offset < dirp->buffer->size()) { - pair<string,Inode*>& ent = (*dirp->buffer)[off - dirp->this_offset]; + pair<string,InodeRef>& ent = (*dirp->buffer)[off - dirp->this_offset]; - int stmask = fill_stat(ent.second, &st); + int stmask = fill_stat(ent.second, &st); fill_dirent(&de, ent.first.c_str(), st.st_mode, st.st_ino, dirp->offset + 1); client_lock.Unlock(); @@ -6111,21 +6614,40 @@ int Client::open(const char *relpath, int flags, mode_t mode, int stripe_unit, Fh *fh = NULL; +#if defined(__linux__) && defined(O_PATH) + /* When O_PATH is specified, flags other than O_DIRECTORY + * and O_NOFOLLOW are ignored. Please refer to the do_entry_open() + * function in the kernel (fs/open.c). */ + if (flags & O_PATH) + flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH; +#endif + filepath path(relpath); - Inode *in; + InodeRef in; bool created = false; - int r = path_walk(path, &in); + /* O_CREAT with O_EXCL enforces O_NOFOLLOW. 
*/ + bool followsym = !((flags & O_NOFOLLOW) || ((flags & O_CREAT) && (flags & O_EXCL))); + int r = path_walk(path, &in, followsym); + if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL)) return -EEXIST; + +#if defined(__linux__) && defined(O_PATH) + if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW) && !(flags & O_PATH)) +#else + if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW)) +#endif + return -ELOOP; + if (r == -ENOENT && (flags & O_CREAT)) { filepath dirpath = path; string dname = dirpath.last_dentry(); dirpath.pop_dentry(); - Inode *dir; + InodeRef dir; r = path_walk(dirpath, &dir); if (r < 0) return r; - r = _create(dir, dname.c_str(), flags, mode, &in, &fh, stripe_unit, + r = _create(dir.get(), dname.c_str(), flags, mode, &in, &fh, stripe_unit, stripe_count, object_size, data_pool, &created); } if (r < 0) @@ -6135,17 +6657,16 @@ int Client::open(const char *relpath, int flags, mode_t mode, int stripe_unit, // posix says we can only check permissions of existing files uid_t uid = geteuid(); gid_t gid = getegid(); - r = check_permissions(in, flags, uid, gid); + r = check_permissions(in.get(), flags, uid, gid); if (r < 0) goto out; } if (!fh) - r = _open(in, flags, mode, &fh); + r = _open(in.get(), flags, mode, &fh); if (r >= 0) { // allocate an integer file descriptor assert(fh); - assert(in); r = get_fd(); assert(fd_map.count(r) == 0); fd_map[r] = fh; @@ -6235,11 +6756,12 @@ int Client::lookup_parent(Inode *ino, Inode **parent) req->set_filepath(path); req->set_inode(ino); - int r = make_request(req, -1, -1, NULL, NULL, rand() % mdsmap->get_num_in_mds()); + InodeRef target; + int r = make_request(req, -1, -1, &target, NULL, rand() % mdsmap->get_num_in_mds()); // Give caller a reference to the parent ino if they provided a pointer. if (parent != NULL) { if (r == 0) { - *parent = req->target; + *parent = target.get(); _ll_get(*parent); ldout(cct, 3) << "lookup_parent found parent " << (*parent)->ino << dendl; } else { @@ -6283,7 +6805,6 @@ Fh *Client::_create_fh(Inode *in, int flags, int cmode) // inode assert(in); f->inode = in; - f->inode->get(); ldout(cct, 10) << "_create_fh " << in->ino << " mode " << cmode << dendl; @@ -6317,12 +6838,12 @@ int Client::_release_fh(Fh *f) { //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl; //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl; - Inode *in = f->inode; + Inode *in = f->inode.get(); ldout(cct, 5) << "_release_fh " << f << " mode " << f->mode << " on " << *in << dendl; if (in->snapid == CEPH_NOSNAP) { if (in->put_open_ref(f->mode)) { - _flush(in); + _flush(in, new C_Client_FlushComplete(this, in)); // release clean pages too, if we dont want RDCACHE if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0 && !(in->caps_wanted() & CEPH_CAP_FILE_CACHE) && @@ -6338,10 +6859,25 @@ int Client::_release_fh(Fh *f) _release_filelocks(f); - put_inode(in); - delete f; + // Finally, read any async err (i.e. 
from flushes) from the inode + int err = in->async_err; + if (err != 0) { + ldout(cct, 1) << "_release_fh " << f << " on inode " << *in << " caught async_err = " + << cpp_strerror(err) << dendl; + } else { + ldout(cct, 10) << "_release_fh " << f << " on inode " << *in << " no async_err state" << dendl; + } - return 0; + _put_fh(f); + + return err; +} + +void Client::_put_fh(Fh *f) +{ + int left = f->put(); + if (!left) + delete f; } int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp, int uid, int gid) @@ -6399,10 +6935,10 @@ int Client::close(int fd) Fh *fh = get_filehandle(fd); if (!fh) return -EBADF; - _release_fh(fh); + int err = _release_fh(fh); fd_map.erase(fd); ldout(cct, 3) << "close exit(" << fd << ")" << dendl; - return 0; + return err; } @@ -6420,12 +6956,16 @@ loff_t Client::lseek(int fd, loff_t offset, int whence) Fh *f = get_filehandle(fd); if (!f) return -EBADF; +#if defined(__linux__) && defined(O_PATH) + if (f->flags & O_PATH) + return -EBADF; +#endif return _lseek(f, offset, whence); } loff_t Client::_lseek(Fh *f, loff_t offset, int whence) { - Inode *in = f->inode; + Inode *in = f->inode.get(); int r; switch (whence) { @@ -6510,7 +7050,7 @@ int Client::uninline_data(Inode *in, Context *onfinish) inline_version_bl); bufferlist inline_data = in->inline_data; uninline_ops.write(0, inline_data, in->truncate_size, in->truncate_seq); - uninline_ops.setxattr("inline_version", inline_version_bl); + uninline_ops.setxattr("inline_version", stringify(in->inline_version)); objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), @@ -6539,6 +7079,10 @@ int Client::read(int fd, char *buf, loff_t size, loff_t offset) Fh *f = get_filehandle(fd); if (!f) return -EBADF; +#if defined(__linux__) && defined(O_PATH) + if (f->flags & O_PATH) + return -EBADF; +#endif bufferlist bl; int r = _read(f, offset, size, &bl); ldout(cct, 3) << "read(" << fd << ", " << (void*)buf << ", " << size << ", " << offset << ") = " << r << dendl; @@ -6549,10 +7093,17 @@ int Client::read(int fd, char *buf, loff_t size, loff_t offset) return r; } +int Client::preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset) +{ + if (iovcnt < 0) + return -EINVAL; + return _preadv_pwritev(fd, iov, iovcnt, offset, false); +} + int Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl) { const md_config_t *conf = cct->_conf; - Inode *in = f->inode; + Inode *in = f->inode.get(); //bool lazy = f->mode == CEPH_FILE_MODE_LAZY; @@ -6675,27 +7226,35 @@ int Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl) return r < 0 ? r : bl->length(); } +Client::C_Readahead::C_Readahead(Client *c, Fh *f) : + client(c), f(f) { + f->get(); +} + void Client::C_Readahead::finish(int r) { lgeneric_subdout(client->cct, client, 20) << "client." << client->get_nodeid() << " " << "C_Readahead on " << f->inode << dendl; - client->put_cap_ref(f->inode, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE); + client->put_cap_ref(f->inode.get(), CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE); f->readahead.dec_pending(); + client->_put_fh(f); } int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl) { const md_config_t *conf = cct->_conf; - Inode *in = f->inode; + Inode *in = f->inode.get(); ldout(cct, 10) << "_read_async " << *in << " " << off << "~" << len << dendl;
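The new `preadv` entry point above funnels into `_preadv_pwritev`, which sums the iovec lengths, performs one contiguous `_read`, and scatters the result back into the caller's iovecs, stopping early on a short read. A self-contained sketch of that scatter step (the helper name and layout are illustrative, not the Ceph internals):

```cpp
#include <algorithm>
#include <cassert>
#include <cstring>
#include <string>
#include <sys/uio.h>

// Copy up to `got` bytes from a contiguous buffer into iovecs; returns the
// number of bytes placed, which is less than `got` only if the iovecs are
// too small in total.
static size_t scatter_to_iov(const char *buf, size_t got,
                             const struct iovec *iov, int iovcnt) {
  size_t off = 0;
  for (int i = 0; i < iovcnt && off < got; ++i) {
    size_t n = std::min(got - off, iov[i].iov_len);  // short-read safe
    memcpy(iov[i].iov_base, buf + off, n);
    off += n;
  }
  return off;
}

int main() {
  std::string data = "hello world";   // pretend this came back from _read()
  char a[5], b[16];
  struct iovec iov[2] = {{a, sizeof(a)}, {b, sizeof(b)}};
  size_t n = scatter_to_iov(data.data(), data.size(), iov, 2);
  assert(n == data.size() && memcmp(a, "hello", 5) == 0);
}
```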
// trim read based on file size? if (off >= in->size) return 0; + if (len == 0) + return 0; if (off + len > in->size) { len = in->size - off; } - ldout(cct, 10) << " max_byes=" << conf->client_readahead_max_bytes + ldout(cct, 10) << " max_bytes=" << conf->client_readahead_max_bytes << " max_periods=" << conf->client_readahead_max_periods << dendl; // read (and possibly block) @@ -6748,7 +7307,7 @@ int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl) int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl, bool *checkeof) { - Inode *in = f->inode; + Inode *in = f->inode.get(); uint64_t pos = off; int left = len; int read = 0; @@ -6819,11 +7378,9 @@ int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl, */ class C_Client_SyncCommit : public Context { Client *cl; - Inode *in; + InodeRef in; public: - C_Client_SyncCommit(Client *c, Inode *i) : cl(c), in(i) { - in->get(); - } + C_Client_SyncCommit(Client *c, Inode *i) : cl(c), in(i) {} void finish(int) { // Called back by Filter, then Client is responsible for taking its own lock assert(!cl->client_lock.is_locked_by_me()); @@ -6831,14 +7388,14 @@ class C_Client_SyncCommit : public Context { } }; -void Client::sync_write_commit(Inode *in) +void Client::sync_write_commit(InodeRef& in) { Mutex::Locker l(client_lock); assert(unsafe_sync_write > 0); unsafe_sync_write--; - put_cap_ref(in, CEPH_CAP_FILE_BUFFER); + put_cap_ref(in.get(), CEPH_CAP_FILE_BUFFER); ldout(cct, 15) << "sync_write_commit unsafe_sync_write = " << unsafe_sync_write << dendl; if (unsafe_sync_write == 0 && unmounting) { @@ -6846,7 +7403,7 @@ void Client::sync_write_commit(Inode *in) mount_cond.Signal(); } - put_inode(in); + in.reset(); // put inode inside client_lock } int Client::write(int fd, const char *buf, loff_t size, loff_t offset) @@ -6860,27 +7417,78 @@ int Client::write(int fd, const char *buf, loff_t size, loff_t offset) Fh *fh = get_filehandle(fd); if (!fh) return -EBADF; - int r = _write(fh, offset, size, buf); +#if defined(__linux__) && defined(O_PATH) + if (fh->flags & O_PATH) + return -EBADF; +#endif + int r = _write(fh, offset, size, buf, NULL, 0); ldout(cct, 3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl; return r; } +int Client::pwritev(int fd, const struct iovec *iov, int iovcnt, int64_t offset) +{ + if (iovcnt < 0) + return -EINVAL; + return _preadv_pwritev(fd, iov, iovcnt, offset, true); +} + +int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write) +{ + Mutex::Locker lock(client_lock); + tout(cct) << fd << std::endl; + tout(cct) << offset << std::endl; + + Fh *fh = get_filehandle(fd); + if (!fh) + return -EBADF; +#if defined(__linux__) && defined(O_PATH) + if (fh->flags & O_PATH) + return -EBADF; +#endif + loff_t totallen = 0; + for (unsigned i = 0; i < iovcnt; i++) { + totallen += iov[i].iov_len; + } + if (write) { + int w = _write(fh, offset, totallen, NULL, iov, iovcnt); + ldout(cct, 3) << "pwritev(" << fd << ", \"...\", " << totallen << ", " << offset << ") = " << w << dendl; + return w; + } else { + bufferlist bl; + int r = _read(fh, offset, totallen, &bl); + ldout(cct, 3) << "preadv(" << fd << ", " << offset << ") = " << r << dendl; + int bufoff = 0; + for (unsigned j = 0, resid = r; j < iovcnt && resid > 0; j++) { + /* + * This piece of code handles the case where the bufferlist does not + * have enough data to fill in the iovs + */ + if (resid < iov[j].iov_len) { + bl.copy(bufoff, resid, (char *)iov[j].iov_base); + break; + } else { 
bl.copy(bufoff, iov[j].iov_len, (char *)iov[j].iov_base); + } + resid -= iov[j].iov_len; + bufoff += iov[j].iov_len; + } + return r; + } +} -int Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf) +int Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf, + const struct iovec *iov, int iovcnt) { if ((uint64_t)(offset+size) > mdsmap->get_max_filesize()) //too large! return -EFBIG; - { - const OSDMap *osdmap = objecter->get_osdmap_read(); - bool full = osdmap->test_flag(CEPH_OSDMAP_FULL); - objecter->put_osdmap_read(); - if (full) - return -ENOSPC; - } - //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl; - Inode *in = f->inode; + Inode *in = f->inode.get(); + + if (objecter->osdmap_pool_full(in->layout.fl_pg_pool)) { + return -ENOSPC; + } assert(in->snapid == CEPH_NOSNAP); @@ -6888,6 +7496,11 @@ int Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf) if ((f->mode & CEPH_FILE_MODE_WR) == 0) return -EBADF; + // check quota + uint64_t endoff = offset + size; + if (endoff > in->size && is_quota_bytes_exceeded(in, endoff - in->size)) + return -EDQUOT; + // use/adjust fd pos? if (offset < 0) { lock_fh_pos(f); @@ -6917,14 +7530,25 @@ int Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf) } // copy into fresh buffer (since our write may be resub, async) - bufferptr bp; - if (size > 0) bp = buffer::copy(buf, size); bufferlist bl; - bl.push_back( bp ); + bufferptr *bparr = NULL; + if (buf) { + bufferptr bp; + if (size > 0) bp = buffer::copy(buf, size); + bl.push_back( bp ); + } else if (iov){ + //iov case + bparr = new bufferptr[iovcnt]; + for (int i = 0; i < iovcnt; i++) { + if (iov[i].iov_len > 0) { + bparr[i] = buffer::copy((char*)iov[i].iov_base, iov[i].iov_len); + } + bl.push_back( bparr[i] ); + } + } utime_t lat; uint64_t totalwritten; - uint64_t endoff = offset + size; int have; int r = get_caps(in, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, &have, endoff); if (r < 0) @@ -6978,9 +7602,7 @@ int Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf) // async, caching, non-blocking. r = objectcacher->file_write(&in->oset, &in->layout, in->snaprealm->get_snap_context(), - offset, size, bl, ceph_clock_now(cct), 0, - client_lock); - + offset, size, bl, ceph_clock_now(cct), 0); put_cap_ref(in, CEPH_CAP_FILE_BUFFER); if (r < 0) @@ -7034,9 +7656,13 @@ int Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf) in->size = totalwritten + offset; mark_caps_dirty(in, CEPH_CAP_FILE_WR); - if ((in->size << 1) >= in->max_size && - (in->reported_size << 1) < in->max_size) - check_caps(in, false); + if (is_quota_bytes_approaching(in)) { + check_caps(in, true); + } else { + if ((in->size << 1) >= in->max_size && + (in->reported_size << 1) < in->max_size) + check_caps(in, false); + } ldout(cct, 7) << "wrote to " << totalwritten+offset << ", extending file size" << dendl; } else { @@ -7067,13 +7693,22 @@ int Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf) } put_cap_ref(in, CEPH_CAP_FILE_WR); + delete[] bparr; return r; } int Client::_flush(Fh *f) { - // no-op, for now. hrm. 
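`_flush()` stops being a no-op in the hunk that follows: background write-back failures are latched on the inode as `async_err`, and later `flush`/`fsync`/`close` calls report them, which is why `close()` above now returns `_release_fh()`'s result instead of 0. A standalone model of the latch; keeping only the first error is this sketch's choice, the patch simply stores whatever the flush completion saw:

```cpp
#include <atomic>
#include <cstdio>

struct FileState {
  std::atomic<int> async_err{0};       // set by background flush completions

  void on_writeback_done(int r) {      // called from the flush callback
    int expected = 0;
    if (r < 0)
      async_err.compare_exchange_strong(expected, r);  // latch first failure
  }
  int flush() { return async_err.load(); }  // surfaced at flush/close time
};

int main() {
  FileState f;
  f.on_writeback_done(-5);             // e.g. EIO from a background flush
  printf("flush -> %d\n", f.flush());  // -5, like _flush returning async_err
}
```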
- return 0; + Inode *in = f->inode.get(); + int err = in->async_err; + if (err != 0) { + ldout(cct, 1) << __func__ << ": " << f << " on inode " << *in << " caught async_err = " + << cpp_strerror(err) << dendl; + } else { + ldout(cct, 10) << __func__ << ": " << f << " on inode " << *in << " no async_err state" << dendl; + } + + return err; } int Client::truncate(const char *relpath, loff_t length) @@ -7093,6 +7728,10 @@ int Client::ftruncate(int fd, loff_t length) Fh *f = get_filehandle(fd); if (!f) return -EBADF; +#if defined(__linux__) && defined(O_PATH) + if (f->flags & O_PATH) + return -EBADF; +#endif struct stat attr; attr.st_size = length; return _setattr(f->inode, &attr, CEPH_SETATTR_SIZE); @@ -7108,42 +7747,41 @@ int Client::fsync(int fd, bool syncdataonly) Fh *f = get_filehandle(fd); if (!f) return -EBADF; +#if defined(__linux__) && defined(O_PATH) + if (f->flags & O_PATH) + return -EBADF; +#endif int r = _fsync(f, syncdataonly); ldout(cct, 3) << "fsync(" << fd << ", " << syncdataonly << ") = " << r << dendl; return r; } -int Client::_fsync(Fh *f, bool syncdataonly) +int Client::_fsync(Inode *in, bool syncdataonly) { int r = 0; - - Inode *in = f->inode; - ceph_tid_t wait_on_flush = 0; + uint16_t wait_on_flush[CEPH_CAP_BITS]; bool flushed_metadata = false; Mutex lock("Client::_fsync::lock"); Cond cond; bool done = false; C_SafeCond *object_cacher_completion = NULL; + InodeRef tmp_ref; - ldout(cct, 3) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl; + ldout(cct, 3) << "_fsync on " << *in << " " << (syncdataonly ? "(dataonly)":"(data+metadata)") << dendl; if (cct->_conf->client_oc) { object_cacher_completion = new C_SafeCond(&lock, &cond, &done, &r); - in->get(); // take a reference; C_SafeCond doesn't and _flush won't either + tmp_ref = in; // take a reference; C_SafeCond doesn't and _flush won't either _flush(in, object_cacher_completion); ldout(cct, 15) << "using return-valued form of _fsync" << dendl; } if (!syncdataonly && (in->dirty_caps & ~CEPH_CAP_ANY_FILE_WR)) { - for (map<mds_rank_t, Cap*>::iterator iter = in->caps.begin(); iter != in->caps.end(); ++iter) { - if (iter->second->implemented & ~CEPH_CAP_ANY_FILE_WR) { - MetaSession *session = mds_sessions[iter->first]; - assert(session); - flush_caps(in, session); - } + check_caps(in, true); + if (in->flushing_caps) { + flushed_metadata = true; + memcpy(wait_on_flush, in->flushing_cap_tid, sizeof(wait_on_flush)); } - wait_on_flush = in->last_flush_tid; - flushed_metadata = true; } else ldout(cct, 10) << "no metadata needs to commit" << dendl; if (object_cacher_completion) { // wait on a real reply instead of guessing @@ -7154,7 +7792,6 @@ int Client::_fsync(Fh *f, bool syncdataonly) cond.Wait(lock); lock.Unlock(); client_lock.Lock(); - put_inode(in); ldout(cct, 15) << "got " << r << " from flush writeback" << dendl; } else { // FIXME: this can starve @@ -7165,19 +7802,46 @@ int Client::_fsync(Fh *f, bool syncdataonly) } } + if (!in->unsafe_dir_ops.empty()) { + MetaRequest *req = in->unsafe_dir_ops.back(); + uint64_t last_tid = req->get_tid(); + ldout(cct, 15) << "waiting on unsafe requests, last tid " << last_tid << dendl; + + do { + req->get(); + wait_on_list(req->waitfor_safe); + put_request(req); + if (in->unsafe_dir_ops.empty()) + break; + req = in->unsafe_dir_ops.front(); + } while (req->tid < last_tid); + } + if (!r) { - if (flushed_metadata) wait_sync_caps(wait_on_flush); - // this could wait longer than strictly necessary, - // but on a sync the user can put up with it + if (flushed_metadata)
wait_sync_caps(in, wait_on_flush); ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl; } else { ldout(cct, 1) << "ino " << in->ino << " failed to commit to disk! " << cpp_strerror(-r) << dendl; } + + if (in->async_err) { + ldout(cct, 1) << "ino " << in->ino << " marked with error from background flush! " + << cpp_strerror(in->async_err) << dendl; + r = in->async_err; + } + return r; } +int Client::_fsync(Fh *f, bool syncdataonly) +{ + ldout(cct, 3) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl; + return _fsync(f->inode.get(), syncdataonly); +} + int Client::fstat(int fd, struct stat *stbuf) { Mutex::Locker lock(client_lock); @@ -7204,15 +7868,12 @@ int Client::chdir(const char *relpath) tout(cct) << "chdir" << std::endl; tout(cct) << relpath << std::endl; filepath path(relpath); - Inode *in; + InodeRef in; int r = path_walk(path, &in); if (r < 0) return r; - if (cwd != in) { - in->get(); - put_inode(cwd); - cwd = in; - } + if (cwd != in) + cwd.swap(in); ldout(cct, 3) << "chdir(" << relpath << ") cwd now " << cwd->ino << dendl; return 0; } @@ -7222,7 +7883,7 @@ void Client::getcwd(string& dir) filepath path; ldout(cct, 10) << "getcwd " << *cwd << dendl; - Inode *in = cwd; + Inode *in = cwd.get(); while (in != root) { assert(in->dn_set.size() < 2); // dirs can't be hard-linked Dentry *dn = in->get_first_parent(); @@ -7239,7 +7900,7 @@ void Client::getcwd(string& dir) // start over path = filepath(); - in = cwd; + in = cwd.get(); continue; } path.push_front_dentry(dn->name); @@ -7380,8 +8041,10 @@ int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep, if (!in->flock_locks) in->flock_locks = new ceph_lock_state_t(cct); lock_state = in->flock_locks; - } else + } else { assert(0); + return -EINVAL; + } _update_lock_state(fl, owner, lock_state); if (fh) { @@ -7411,8 +8074,10 @@ int Client::_interrupt_filelock(MetaRequest *req) lock_type = CEPH_LOCK_FLOCK_INTR; else if (req->head.args.filelock_change.rule == CEPH_LOCK_FCNTL) lock_type = CEPH_LOCK_FCNTL_INTR; - else + else { assert(0); + return -EINVAL; + } MetaRequest *intr_req = new MetaRequest(CEPH_MDS_OP_SETFILELOCK); filepath path; @@ -7460,7 +8125,7 @@ void Client::_release_filelocks(Fh *fh) if (!fh->fcntl_locks && !fh->flock_locks) return; - Inode *in = fh->inode; + Inode *in = fh->inode.get(); ldout(cct, 10) << "_release_filelocks " << fh << " ino " << in->ino << dendl; list<pair<int, ceph_filelock> > to_release; @@ -7531,7 +8196,7 @@ void Client::_update_lock_state(struct flock *fl, uint64_t owner, int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner) { - Inode *in = fh->inode; + Inode *in = fh->inode.get(); ldout(cct, 10) << "_getlk " << fh << " ino " << in->ino << dendl; int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner); return ret; @@ -7539,7 +8204,7 @@ int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner) int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep, void *fuse_req) { - Inode *in = fh->inode; + Inode *in = fh->inode.get(); ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl; int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner, fuse_req); ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl; @@ -7548,7 +8213,7 @@ int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep, void *fu int Client::_flock(Fh *fh, int cmd, uint64_t owner, void *fuse_req) { - Inode *in = fh->inode; + Inode *in = fh->inode.get();
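`_flock()`, whose body begins here, maps BSD `flock()` semantics onto the same generic file-lock machinery as the POSIX `fcntl` paths above it: `LOCK_NB` decides whether the request may block, and the remaining bits select a shared, exclusive, or unlock operation. A standalone sketch of that decoding (a hypothetical helper, not the Ceph code itself):

```cpp
#include <fcntl.h>      // F_RDLCK/F_WRLCK/F_UNLCK
#include <sys/file.h>   // LOCK_SH/LOCK_EX/LOCK_UN/LOCK_NB
#include <cstdio>

// Translate a flock() operation into an fcntl-style lock type plus a
// "may block" flag; returns false for malformed input (the -EINVAL path).
static bool decode_flock(int operation, short *type, bool *wait) {
  *wait = !(operation & LOCK_NB);
  switch (operation & ~LOCK_NB) {
  case LOCK_SH: *type = F_RDLCK; return true;
  case LOCK_EX: *type = F_WRLCK; return true;
  case LOCK_UN: *type = F_UNLCK; return true;
  default:      return false;
  }
}

int main() {
  short type; bool wait;
  if (decode_flock(LOCK_EX | LOCK_NB, &type, &wait))
    printf("type=%d wait=%d\n", type, (int)wait);  // exclusive, non-blocking
}
```

The `owner` value threaded through these calls identifies the lock holder, so conflicting and compatible requests can be told apart across file handles sharing one client instance.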
ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl; int sleep = !(cmd & LOCK_NB); @@ -7587,43 +8252,64 @@ int Client::ll_statfs(Inode *in, struct statvfs *stbuf) return statfs(0, stbuf); } -void Client::ll_register_ino_invalidate_cb(client_ino_callback_t cb, void *handle) +void Client::ll_register_callbacks(struct client_callback_args *args) { - Mutex::Locker l(client_lock); - ldout(cct, 10) << "ll_register_ino_invalidate_cb cb " << (void*)cb << " p " << (void*)handle << dendl; - if (cb == NULL) + if (!args) return; - ino_invalidate_cb = cb; - ino_invalidate_cb_handle = handle; - async_ino_invalidator.start(); -} - -void Client::ll_register_dentry_invalidate_cb(client_dentry_callback_t cb, void *handle) -{ Mutex::Locker l(client_lock); - ldout(cct, 10) << "ll_register_dentry_invalidate_cb cb " << (void*)cb << " p " << (void*)handle << dendl; - if (cb == NULL) - return; - dentry_invalidate_cb = cb; - dentry_invalidate_cb_handle = handle; - async_dentry_invalidator.start(); + ldout(cct, 10) << "ll_register_callbacks cb " << args->handle + << " invalidate_ino_cb " << args->ino_cb + << " invalidate_dentry_cb " << args->dentry_cb + << " getgroups_cb" << args->getgroups_cb + << " switch_interrupt_cb " << args->switch_intr_cb + << " remount_cb " << args->remount_cb + << dendl; + callback_handle = args->handle; + if (args->ino_cb) { + ino_invalidate_cb = args->ino_cb; + async_ino_invalidator.start(); + } + if (args->dentry_cb) { + dentry_invalidate_cb = args->dentry_cb; + async_dentry_invalidator.start(); + } + if (args->switch_intr_cb) { + switch_interrupt_cb = args->switch_intr_cb; + interrupt_finisher.start(); + } + if (args->remount_cb) { + remount_cb = args->remount_cb; + remount_finisher.start(); + } + getgroups_cb = args->getgroups_cb; } -void Client::ll_register_switch_interrupt_cb(client_switch_interrupt_callback_t cb) +int Client::test_dentry_handling(bool can_invalidate) { - Mutex::Locker l(client_lock); - ldout(cct, 10) << "ll_register_switch_interrupt_cb cb " << (void*)cb << dendl; - if (cb == NULL) - return; - switch_interrupt_cb = cb; - interrupt_finisher.start(); -} + int r = 0; -void Client::ll_register_getgroups_cb(client_getgroups_callback_t cb, void *handle) -{ - Mutex::Locker l(client_lock); - getgroups_cb = cb; - getgroups_cb_handle = handle; + can_invalidate_dentries = can_invalidate; + + if (can_invalidate_dentries) { + assert(dentry_invalidate_cb); + ldout(cct, 1) << "using dentry_invalidate_cb" << dendl; + } else if (remount_cb) { + ldout(cct, 1) << "using remount_cb" << dendl; + int s = remount_cb(callback_handle); + if (s) { + lderr(cct) << "Failed to invoke remount, needed to ensure kernel dcache consistency" + << dendl; + } + if (cct->_conf->client_die_on_failed_remount) { + require_remount = true; + r = s; + } + } else { + lderr(cct) << "no method to invalidate kernel dentry cache; expect issues!" 
<< dendl; + if (cct->_conf->client_die_on_failed_remount) + assert(0); + } + return r; } int Client::_sync_fs() @@ -7681,7 +8367,7 @@ int Client::lazyio_synchronize(int fd, loff_t offset, size_t count) Fh *f = get_filehandle(fd); if (!f) return -EBADF; - Inode *in = f->inode; + Inode *in = f->inode.get(); _fsync(f, true); _release(in); @@ -7696,22 +8382,22 @@ int Client::mksnap(const char *relpath, const char *name) { Mutex::Locker l(client_lock); filepath path(relpath); - Inode *in; + InodeRef in; int r = path_walk(path, &in); if (r < 0) return r; - Inode *snapdir = open_snapdir(in); + Inode *snapdir = open_snapdir(in.get()); return _mkdir(snapdir, name, 0); } int Client::rmsnap(const char *relpath, const char *name) { Mutex::Locker l(client_lock); filepath path(relpath); - Inode *in; + InodeRef in; int r = path_walk(path, &in); if (r < 0) return r; - Inode *snapdir = open_snapdir(in); + Inode *snapdir = open_snapdir(in.get()); return _rmdir(snapdir, name); } @@ -7733,7 +8419,7 @@ int Client::get_caps_issued(const char *path) { Mutex::Locker lock(client_lock); filepath p(path); - Inode *in; + InodeRef in; int r = path_walk(p, &in, true); if (r < 0) return r; @@ -7748,7 +8434,7 @@ Inode *Client::open_snapdir(Inode *diri) Inode *in; vinodeno_t vino(diri->ino, CEPH_SNAPDIR); if (!inode_map.count(vino)) { - in = new Inode(cct, vino, &diri->layout); + in = new Inode(this, vino, &diri->layout); in->ino = diri->ino; in->snapid = CEPH_SNAPDIR; @@ -7762,7 +8448,6 @@ Inode *Client::open_snapdir(Inode *diri) in->dirfragtree.clear(); inode_map[vino] = in; in->snapdir_parent = diri; - diri->get(); ldout(cct, 10) << "open_snapdir created snapshot inode " << *in << dendl; } else { in = inode_map[vino]; @@ -7780,7 +8465,7 @@ int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr, tout(cct) << name << std::endl; string dname(name); - Inode *in; + InodeRef in; int r = 0; r = _lookup(parent, dname, &in); @@ -7791,40 +8476,38 @@ int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr, assert(in); fill_stat(in, attr); - _ll_get(in); + _ll_get(in.get()); out: ldout(cct, 3) << "ll_lookup " << parent << " " << name << " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl; tout(cct) << attr->st_ino << std::endl; - *out = in; + *out = in.get(); return r; } -int Client::ll_walk(const char* name, Inode **i, struct stat *attr) +int Client::ll_walk(const char* name, Inode **out, struct stat *attr) { Mutex::Locker lock(client_lock); filepath fp(name, 0); - Inode *destination = NULL; + InodeRef in; int rc; ldout(cct, 3) << "ll_walk" << name << dendl; tout(cct) << "ll_walk" << std::endl; tout(cct) << name << std::endl; - rc = path_walk(fp, &destination, false); - if (rc < 0) - { - attr->st_ino = 0; - *i = NULL; - return rc; - } - else - { - fill_stat(destination, attr); - *i = destination; - return 0; - } + rc = path_walk(fp, &in, false); + if (rc < 0) { + attr->st_ino = 0; + *out = NULL; + return rc; + } else { + assert(in); + fill_stat(in, attr); + *out = in.get(); + return 0; + } } @@ -7968,12 +8651,13 @@ int Client::ll_setattr(Inode *in, struct stat *attr, int mask, int uid, tout(cct) << attr->st_atime << std::endl; tout(cct) << mask << std::endl; - Inode *target = in; + InodeRef target(in); int res = _setattr(in, attr, mask, uid, gid, &target); if (res == 0) { - assert(in == target); + assert(in == target.get()); fill_stat(in, attr); } + ldout(cct, 3) << "ll_setattr " << vino << " = " << res << dendl; return res; } @@ -7985,81 +8669,117 @@ int Client::ll_setattr(Inode 
*in, struct stat *attr, int mask, int uid, int Client::getxattr(const char *path, const char *name, void *value, size_t size) { Mutex::Locker lock(client_lock); - Inode *ceph_inode; - int r = Client::path_walk(path, &ceph_inode, true); + InodeRef in; + int r = Client::path_walk(path, &in, true); if (r < 0) return r; - return Client::_getxattr(ceph_inode, name, value, size, getuid(), getgid()); + return Client::_getxattr(in.get(), name, value, size, getuid(), getgid()); } int Client::lgetxattr(const char *path, const char *name, void *value, size_t size) { Mutex::Locker lock(client_lock); - Inode *ceph_inode; - int r = Client::path_walk(path, &ceph_inode, false); + InodeRef in; + int r = Client::path_walk(path, &in, false); if (r < 0) return r; - return Client::_getxattr(ceph_inode, name, value, size, getuid(), getgid()); + return Client::_getxattr(in.get(), name, value, size, getuid(), getgid()); +} + +int Client::fgetxattr(int fd, const char *name, void *value, size_t size) +{ + Mutex::Locker lock(client_lock); + Fh *f = get_filehandle(fd); + if (!f) + return -EBADF; + return Client::_getxattr(f->inode.get(), name, value, size, getuid(), getgid()); } int Client::listxattr(const char *path, char *list, size_t size) { Mutex::Locker lock(client_lock); - Inode *ceph_inode; - int r = Client::path_walk(path, &ceph_inode, true); + InodeRef in; + int r = Client::path_walk(path, &in, true); if (r < 0) return r; - return Client::_listxattr(ceph_inode, list, size, getuid(), getgid()); + return Client::_listxattr(in.get(), list, size, getuid(), getgid()); } int Client::llistxattr(const char *path, char *list, size_t size) { Mutex::Locker lock(client_lock); - Inode *ceph_inode; - int r = Client::path_walk(path, &ceph_inode, false); + InodeRef in; + int r = Client::path_walk(path, &in, false); if (r < 0) return r; - return Client::_listxattr(ceph_inode, list, size, getuid(), getgid()); + return Client::_listxattr(in.get(), list, size, getuid(), getgid()); +} + +int Client::flistxattr(int fd, char *list, size_t size) +{ + Mutex::Locker lock(client_lock); + Fh *f = get_filehandle(fd); + if (!f) + return -EBADF; + return Client::_listxattr(f->inode.get(), list, size, getuid(), getgid()); } int Client::removexattr(const char *path, const char *name) { Mutex::Locker lock(client_lock); - Inode *ceph_inode; - int r = Client::path_walk(path, &ceph_inode, true); + InodeRef in; + int r = Client::path_walk(path, &in, true); if (r < 0) return r; - return Client::_removexattr(ceph_inode, name, getuid(), getgid()); + return Client::_removexattr(in.get(), name, getuid(), getgid()); } int Client::lremovexattr(const char *path, const char *name) { Mutex::Locker lock(client_lock); - Inode *ceph_inode; - int r = Client::path_walk(path, &ceph_inode, false); + InodeRef in; + int r = Client::path_walk(path, &in, false); if (r < 0) return r; - return Client::_removexattr(ceph_inode, name, getuid(), getgid()); + return Client::_removexattr(in.get(), name, getuid(), getgid()); +} + +int Client::fremovexattr(int fd, const char *name) +{ + Mutex::Locker lock(client_lock); + Fh *f = get_filehandle(fd); + if (!f) + return -EBADF; + return Client::_removexattr(f->inode.get(), name, getuid(), getgid()); } int Client::setxattr(const char *path, const char *name, const void *value, size_t size, int flags) { Mutex::Locker lock(client_lock); - Inode *ceph_inode; - int r = Client::path_walk(path, &ceph_inode, true); + InodeRef in; + int r = Client::path_walk(path, &in, true); if (r < 0) return r; - return Client::_setxattr(ceph_inode, 
name, value, size, flags, getuid(), getgid()); + return Client::_setxattr(in.get(), name, value, size, flags, getuid(), getgid()); } int Client::lsetxattr(const char *path, const char *name, const void *value, size_t size, int flags) { Mutex::Locker lock(client_lock); - Inode *ceph_inode; - int r = Client::path_walk(path, &ceph_inode, false); + InodeRef in; + int r = Client::path_walk(path, &in, false); if (r < 0) return r; - return Client::_setxattr(ceph_inode, name, value, size, flags, getuid(), getgid()); + return Client::_setxattr(in.get(), name, value, size, flags, getuid(), getgid()); +} + +int Client::fsetxattr(int fd, const char *name, const void *value, size_t size, int flags) +{ + Mutex::Locker lock(client_lock); + Fh *f = get_filehandle(fd); + if (!f) + return -EBADF; + return Client::_setxattr(f->inode.get(), name, value, size, flags, getuid(), getgid()); } int Client::_getxattr(Inode *in, const char *name, void *value, size_t size, @@ -8092,7 +8812,7 @@ int Client::_getxattr(Inode *in, const char *name, void *value, size_t size, r = -ENODATA; if (in->xattrs.count(n)) { r = in->xattrs[n].length(); - if (size != 0) { + if (r > 0 && size != 0) { if (size >= (unsigned)r) memcpy(value, in->xattrs[n].c_str(), r); else @@ -8281,6 +9001,26 @@ int Client::ll_removexattr(Inode *in, const char *name, int uid, int gid) return _removexattr(in, name, uid, gid); } +bool Client::_vxattrcb_quota_exists(Inode *in) +{ + return in->quota.is_enable(); +} +size_t Client::_vxattrcb_quota(Inode *in, char *val, size_t size) +{ + return snprintf(val, size, + "max_bytes=%lld max_files=%lld", + (long long int)in->quota.max_bytes, + (long long int)in->quota.max_files); +} +size_t Client::_vxattrcb_quota_max_bytes(Inode *in, char *val, size_t size) +{ + return snprintf(val, size, "%lld", (long long int)in->quota.max_bytes); +} +size_t Client::_vxattrcb_quota_max_files(Inode *in, char *val, size_t size) +{ + return snprintf(val, size, "%lld", (long long int)in->quota.max_files); +} + bool Client::_vxattrcb_layout_exists(Inode *in) { char *p = (char *)&in->layout; @@ -8382,6 +9122,14 @@ size_t Client::_vxattrcb_dir_rctime(Inode *in, char *val, size_t size) hidden: true, \ exists_cb: &Client::_vxattrcb_layout_exists, \ } +#define XATTR_QUOTA_FIELD(_type, _name) \ +{ \ + name: CEPH_XATTR_NAME(_type, _name), \ + getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \ + readonly: false, \ + hidden: true, \ + exists_cb: &Client::_vxattrcb_quota_exists, \ +} const Client::VXattr Client::_dir_vxattrs[] = { { @@ -8403,6 +9151,15 @@ const Client::VXattr Client::_dir_vxattrs[] = { XATTR_NAME_CEPH(dir, rsubdirs), XATTR_NAME_CEPH(dir, rbytes), XATTR_NAME_CEPH(dir, rctime), + { + name: "ceph.quota", + getxattr_cb: &Client::_vxattrcb_quota, + readonly: false, + hidden: true, + exists_cb: &Client::_vxattrcb_quota_exists, + }, + XATTR_QUOTA_FIELD(quota, max_bytes), + XATTR_QUOTA_FIELD(quota, max_files), { name: "" } /* Required table terminator */ }; @@ -8478,7 +9235,7 @@ int Client::ll_readlink(Inode *in, char *buf, size_t buflen, int uid, int gid) } int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev, - int uid, int gid, Inode **inp) + int uid, int gid, InodeRef *inp) { ldout(cct, 3) << "_mknod(" << dir->ino << " " << name << ", 0" << oct << mode << dec << ", " << rdev << ", uid " << uid << ", gid " @@ -8490,6 +9247,9 @@ int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev, if (dir->snapid != CEPH_NOSNAP) { return -EROFS; } + if (is_quota_files_exceeded(dir)) { + return -EDQUOT; 
+ } MetaRequest *req = new MetaRequest(CEPH_MDS_OP_MKNOD); @@ -8536,21 +9296,21 @@ int Client::ll_mknod(Inode *parent, const char *name, mode_t mode, tout(cct) << mode << std::endl; tout(cct) << rdev << std::endl; - Inode *in = NULL; + InodeRef in; int r = _mknod(parent, name, mode, rdev, uid, gid, &in); if (r == 0) { fill_stat(in, attr); - _ll_get(in); + _ll_get(in.get()); } tout(cct) << attr->st_ino << std::endl; ldout(cct, 3) << "ll_mknod " << vparent << " " << name << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl; - *out = in; + *out = in.get(); return r; } int Client::_create(Inode *dir, const char *name, int flags, mode_t mode, - Inode **inp, Fh **fhp, int stripe_unit, int stripe_count, + InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count, int object_size, const char *data_pool, bool *created, int uid, int gid) { @@ -8562,6 +9322,9 @@ int Client::_create(Inode *dir, const char *name, int flags, mode_t mode, if (dir->snapid != CEPH_NOSNAP) { return -EROFS; } + if (is_quota_files_exceeded(dir)) { + return -EDQUOT; + } int cmode = ceph_flags_to_mode(flags); if (cmode < 0) @@ -8612,7 +9375,7 @@ int Client::_create(Inode *dir, const char *name, int flags, mode_t mode, /* If the caller passed a value in fhp, do the open */ if(fhp) { (*inp)->get_open_ref(cmode); - *fhp = _create_fh(*inp, flags, cmode); + *fhp = _create_fh(inp->get(), flags, cmode); } reply_error: @@ -8632,7 +9395,7 @@ int Client::_create(Inode *dir, const char *name, int flags, mode_t mode, int Client::_mkdir(Inode *dir, const char *name, mode_t mode, int uid, int gid, - Inode **inp) + InodeRef *inp) { ldout(cct, 3) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct << mode << dec << ", uid " << uid << ", gid " << gid << ")" @@ -8644,6 +9407,9 @@ int Client::_mkdir(Inode *dir, const char *name, mode_t mode, int uid, int gid, if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) { return -EROFS; } + if (is_quota_files_exceeded(dir)) { + return -EDQUOT; + } MetaRequest *req = new MetaRequest(dir->snapid == CEPH_SNAPDIR ? 
CEPH_MDS_OP_MKSNAP : CEPH_MDS_OP_MKDIR); @@ -8689,21 +9455,21 @@ int Client::ll_mkdir(Inode *parent, const char *name, mode_t mode, tout(cct) << name << std::endl; tout(cct) << mode << std::endl; - Inode *in = NULL; + InodeRef in; int r = _mkdir(parent, name, mode, uid, gid, &in); if (r == 0) { fill_stat(in, attr); - _ll_get(in); + _ll_get(in.get()); } tout(cct) << attr->st_ino << std::endl; ldout(cct, 3) << "ll_mkdir " << vparent << " " << name << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl; - *out = in; + *out = in.get(); return r; } int Client::_symlink(Inode *dir, const char *name, const char *target, int uid, - int gid, Inode **inp) + int gid, InodeRef *inp) { ldout(cct, 3) << "_symlink(" << dir->ino << " " << name << ", " << target << ", uid " << uid << ", gid " << gid << ")" << dendl; @@ -8714,6 +9480,9 @@ int Client::_symlink(Inode *dir, const char *name, const char *target, int uid, if (dir->snapid != CEPH_NOSNAP) { return -EROFS; } + if (is_quota_files_exceeded(dir)) { + return -EDQUOT; + } MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SYMLINK); @@ -8758,16 +9527,16 @@ int Client::ll_symlink(Inode *parent, const char *name, const char *value, tout(cct) << name << std::endl; tout(cct) << value << std::endl; - Inode *in = NULL; + InodeRef in; int r = _symlink(parent, name, value, uid, gid, &in); if (r == 0) { fill_stat(in, attr); - _ll_get(in); + _ll_get(in.get()); } tout(cct) << attr->st_ino << std::endl; ldout(cct, 3) << "ll_symlink " << vparent << " " << name << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl; - *out = in; + *out = in.get(); return r; } @@ -8786,6 +9555,8 @@ int Client::_unlink(Inode *dir, const char *name, int uid, int gid) path.push_dentry(name); req->set_filepath(path); + InodeRef otherin; + Dentry *de; int res = get_or_create(dir, name, &de); if (res < 0) @@ -8794,11 +9565,10 @@ int Client::_unlink(Inode *dir, const char *name, int uid, int gid) req->dentry_drop = CEPH_CAP_FILE_SHARED; req->dentry_unless = CEPH_CAP_FILE_EXCL; - Inode *otherin; res = _lookup(dir, name, &otherin); if (res < 0) goto fail; - req->set_other_inode(otherin); + req->set_other_inode(otherin.get()); req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; req->set_inode(dir); @@ -8845,18 +9615,24 @@ int Client::_rmdir(Inode *dir, const char *name, int uid, int gid) req->dentry_drop = CEPH_CAP_FILE_SHARED; req->dentry_unless = CEPH_CAP_FILE_EXCL; - req->inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; + req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; + + InodeRef in; Dentry *de; int res = get_or_create(dir, name, &de); if (res < 0) goto fail; - req->set_dentry(de); - Inode *in; res = _lookup(dir, name, &in); if (res < 0) goto fail; - req->set_inode(in); + if (req->get_op() == CEPH_MDS_OP_RMDIR) { + req->set_inode(dir); + req->set_dentry(de); + req->set_other_inode(in.get()); + } else { + unlink(de, true, true); + } res = make_request(req, uid, gid); @@ -8888,12 +9664,26 @@ int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const ch ldout(cct, 3) << "_rename(" << fromdir->ino << " " << fromname << " to " << todir->ino << " " << toname << " uid " << uid << " gid " << gid << ")" << dendl; - if (fromdir->snapid != CEPH_NOSNAP || - todir->snapid != CEPH_NOSNAP) { - return -EROFS; + if (fromdir->snapid != todir->snapid) + return -EXDEV; + + int op = CEPH_MDS_OP_RENAME; + if (fromdir->snapid != CEPH_NOSNAP) { + if (fromdir == todir && fromdir->snapid == CEPH_SNAPDIR) + op = CEPH_MDS_OP_RENAMESNAP; 
+ else + return -EROFS; + } + if (cct->_conf->client_quota && + fromdir != todir && + (fromdir->quota.is_enable() || + todir->quota.is_enable() || + get_quota_root(fromdir) != get_quota_root(todir))) { + return -EXDEV; } - MetaRequest *req = new MetaRequest(CEPH_MDS_OP_RENAME); + InodeRef target; + MetaRequest *req = new MetaRequest(op); filepath from; fromdir->make_nosnap_relative_path(from); @@ -8908,39 +9698,44 @@ int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const ch int res = get_or_create(fromdir, fromname, &oldde); if (res < 0) goto fail; - req->set_old_dentry(oldde); - req->old_dentry_drop = CEPH_CAP_FILE_SHARED; - req->old_dentry_unless = CEPH_CAP_FILE_EXCL; - Dentry *de; res = get_or_create(todir, toname, &de); if (res < 0) goto fail; - req->set_dentry(de); - req->dentry_drop = CEPH_CAP_FILE_SHARED; - req->dentry_unless = CEPH_CAP_FILE_EXCL; - Inode *oldin; - res = _lookup(fromdir, fromname, &oldin); - if (res < 0) - goto fail; - req->set_old_inode(oldin); - req->old_inode_drop = CEPH_CAP_LINK_SHARED; + if (op == CEPH_MDS_OP_RENAME) { + req->set_old_dentry(oldde); + req->old_dentry_drop = CEPH_CAP_FILE_SHARED; + req->old_dentry_unless = CEPH_CAP_FILE_EXCL; - Inode *otherin; - res = _lookup(todir, toname, &otherin); - if (res != 0 && res != -ENOENT) { - goto fail; - } else if (res == 0) { - req->set_other_inode(otherin); - req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; - } + req->set_dentry(de); + req->dentry_drop = CEPH_CAP_FILE_SHARED; + req->dentry_unless = CEPH_CAP_FILE_EXCL; - req->set_inode(todir); + InodeRef oldin, otherin; + res = _lookup(fromdir, fromname, &oldin); + if (res < 0) + goto fail; + req->set_old_inode(oldin.get()); + req->old_inode_drop = CEPH_CAP_LINK_SHARED; - Inode *target; - res = make_request(req, uid, gid, &target); + res = _lookup(todir, toname, &otherin); + if (res != 0 && res != -ENOENT) { + goto fail; + } else if (res == 0) { + req->set_other_inode(otherin.get()); + req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; + } + req->set_inode(todir); + } else { + // renamesnap reply contains no tracedn, so we need to invalidate + // dentry manually + unlink(oldde, true, true); + unlink(de, true, true); + } + + res = make_request(req, uid, gid, &target); ldout(cct, 10) << "rename result is " << res << dendl; // renamed item from our cache @@ -8973,7 +9768,7 @@ int Client::ll_rename(Inode *parent, const char *name, Inode *newparent, return _rename(parent, name, newparent, newname, uid, gid); } -int Client::_link(Inode *in, Inode *dir, const char *newname, int uid, int gid, Inode **inp) +int Client::_link(Inode *in, Inode *dir, const char *newname, int uid, int gid, InodeRef *inp) { ldout(cct, 3) << "_link(" << in->ino << " to " << dir->ino << " " << newname << " uid " << uid << " gid " << gid << ")" << dendl; @@ -8984,6 +9779,9 @@ int Client::_link(Inode *in, Inode *dir, const char *newname, int uid, int gid, if (in->snapid != CEPH_NOSNAP || dir->snapid != CEPH_NOSNAP) { return -EROFS; } + if (is_quota_files_exceeded(dir)) { + return -EDQUOT; + } MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LINK); @@ -9029,10 +9827,12 @@ int Client::ll_link(Inode *parent, Inode *newparent, const char *newname, tout(cct) << vnewparent << std::endl; tout(cct) << newname << std::endl; - int r = _link(parent, newparent, newname, uid, gid, &parent); + InodeRef target; + int r = _link(parent, newparent, newname, uid, gid, &target); if (r == 0) { - fill_stat(parent, attr); - _ll_get(parent); + assert(target); + 
fill_stat(target, attr); + _ll_get(target.get()); } return r; } @@ -9082,6 +9882,11 @@ int Client::ll_file_layout(Inode *in, ceph_file_layout *layout) return 0; } +int Client::ll_file_layout(Fh *fh, ceph_file_layout *layout) +{ + return ll_file_layout(fh->inode.get(), layout); +} + /* Currently we cannot take advantage of redundancy in reads, since we would have to go through all possible placement groups (a potentially quite large number determined by a hash), and use CRUSH @@ -9163,6 +9968,16 @@ int Client::ll_releasedir(dir_result_t *dirp) return 0; } +int Client::ll_fsyncdir(dir_result_t *dirp) +{ + Mutex::Locker lock(client_lock); + ldout(cct, 3) << "ll_fsyncdir " << dirp << dendl; + tout(cct) << "ll_fsyncdir" << std::endl; + tout(cct) << (unsigned long)dirp << std::endl; + + return _fsync(dirp->inode.get(), false); +} + int Client::ll_open(Inode *in, int flags, Fh **fhp, int uid, int gid) { assert(!(flags & O_CREAT)); @@ -9212,7 +10027,7 @@ int Client::ll_create(Inode *parent, const char *name, mode_t mode, tout(cct) << flags << std::endl; bool created = false; - Inode *in = NULL; + InodeRef in; int r = _lookup(parent, name, &in); if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL)) @@ -9223,9 +10038,6 @@ int Client::ll_create(Inode *parent, const char *name, mode_t mode, 0, 0, 0, NULL, &created, uid, gid); if (r < 0) goto out; - - if ((!in) && fhp) - in = (*fhp)->inode; } if (r < 0) @@ -9236,15 +10048,16 @@ int Client::ll_create(Inode *parent, const char *name, mode_t mode, ldout(cct, 20) << "ll_create created = " << created << dendl; if (!created) { - r = check_permissions(in, flags, uid, gid); + r = check_permissions(in.get(), flags, uid, gid); if (r < 0) { if (fhp && *fhp) { - _release_fh(*fhp); + int release_r = _release_fh(*fhp); + assert(release_r == 0); // during create, no async data ops should have happened } goto out; } if (fhp && (*fhp == NULL)) { - r = _open(in, flags, mode, fhp); + r = _open(in.get(), flags, mode, fhp); if (r < 0) goto out; } @@ -9264,8 +10077,8 @@ int Client::ll_create(Inode *parent, const char *name, mode_t mode, // passing an Inode in outp requires an additional ref if (outp) { if (in) - _ll_get(in); - *outp = in; + _ll_get(in.get()); + *outp = in.get(); } return r; @@ -9440,7 +10253,7 @@ int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data) tout(cct) << off << std::endl; tout(cct) << len << std::endl; - int r = _write(fh, off, len, data); + int r = _write(fh, off, len, data, NULL, 0); ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r << dendl; return r; @@ -9479,15 +10292,12 @@ int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length) if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE)) return -EOPNOTSUPP; - { - const OSDMap *osdmap = objecter->get_osdmap_read(); - bool full = osdmap->test_flag(CEPH_OSDMAP_FULL); - objecter->put_osdmap_read(); - if (full && !(mode & FALLOC_FL_PUNCH_HOLE)) - return -ENOSPC; - } + Inode *in = fh->inode.get(); - Inode *in = fh->inode; + if (objecter->osdmap_pool_full(in->layout.fl_pg_pool) + && !(mode & FALLOC_FL_PUNCH_HOLE)) { + return -ENOSPC; + } if (in->snapid != CEPH_NOSNAP) return -EROFS; @@ -9495,6 +10305,13 @@ int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length) if ((fh->mode & CEPH_FILE_MODE_WR) == 0) return -EBADF; + uint64_t size = offset + length; + if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) && + size > in->size && + is_quota_bytes_exceeded(in, size - in->size)) { + return -EDQUOT; + } + int have; 
int r = get_caps(in, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, &have, -1); if (r < 0) @@ -9570,9 +10387,13 @@ int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length) in->mtime = ceph_clock_now(cct); mark_caps_dirty(in, CEPH_CAP_FILE_WR); - if ((in->size << 1) >= in->max_size && - (in->reported_size << 1) < in->max_size) - check_caps(in, false); + if (is_quota_bytes_approaching(in)) { + check_caps(in, true); + } else { + if ((in->size << 1) >= in->max_size && + (in->reported_size << 1) < in->max_size) + check_caps(in, false); + } } } @@ -9626,6 +10447,10 @@ int Client::fallocate(int fd, int mode, loff_t offset, loff_t length) Fh *fh = get_filehandle(fd); if (!fh) return -EBADF; +#if defined(__linux__) && defined(O_PATH) + if (fh->flags & O_PATH) + return -EBADF; +#endif return _fallocate(fh, mode, offset, length); } @@ -9637,8 +10462,7 @@ int Client::ll_release(Fh *fh) tout(cct) << "ll_release (fh)" << std::endl; tout(cct) << (unsigned long)fh << std::endl; - _release_fh(fh); - return 0; + return _release_fh(fh); } int Client::ll_getlk(Fh *fh, struct flock *fl, uint64_t owner) @@ -9705,7 +10529,7 @@ int Client::describe_layout(const char *relpath, ceph_file_layout *lp) Mutex::Locker lock(client_lock); filepath path(relpath); - Inode *in; + InodeRef in; int r = path_walk(path, &in); if (r < 0) return r; @@ -9723,7 +10547,7 @@ int Client::fdescribe_layout(int fd, ceph_file_layout *lp) Fh *f = get_filehandle(fd); if (!f) return -EBADF; - Inode *in = f->inode; + Inode *in = f->inode.get(); *lp = in->layout; @@ -9774,7 +10598,7 @@ int Client::get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& o Fh *f = get_filehandle(fd); if (!f) return -EBADF; - Inode *in = f->inode; + Inode *in = f->inode.get(); vector<ObjectExtent> extents; Striper::file_to_extents(cct, in->ino, &in->layout, off, 1, in->truncate_size, extents); @@ -9828,7 +10652,7 @@ int Client::get_file_stripe_address(int fd, loff_t offset, vector<entity_addr_t>& address) Fh *f = get_filehandle(fd); if (!f) return -EBADF; - Inode *in = f->inode; + Inode *in = f->inode.get();
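Every accessor in this region ultimately asks the question posed by the comment that follows: which RADOS object backs a given file offset? Under the `ceph_file_layout` striping scheme that is a short computation; the sketch below mirrors the arithmetic `Striper::file_to_extents()` performs in the real tree (standalone, with illustrative names):

```cpp
#include <cstdint>
#include <cstdio>

struct Layout {
  uint64_t stripe_unit;   // bytes per stripe unit
  uint64_t stripe_count;  // objects striped across
  uint64_t object_size;   // bytes per object
};

// Index of the object holding byte `off`: the file is cut into stripe
// units dealt round-robin across stripe_count objects, and a new object
// set starts once each object in the current set reaches object_size.
uint64_t offset_to_object(const Layout &l, uint64_t off) {
  uint64_t su_per_object = l.object_size / l.stripe_unit;
  uint64_t blockno       = off / l.stripe_unit;       // global stripe unit no.
  uint64_t stripepos     = blockno % l.stripe_count;  // column within the set
  uint64_t stripeno      = blockno / l.stripe_count;  // row across the set
  uint64_t objectsetno   = stripeno / su_per_object;
  return objectsetno * l.stripe_count + stripepos;
}

int main() {
  Layout l{64 * 1024, 4, 4 * 1024 * 1024};  // 64 KiB units, 4-wide, 4 MiB objects
  printf("%llu\n", (unsigned long long)offset_to_object(l, 20ull << 20));  // 4
}
```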
vector extents; @@ -9874,7 +10698,7 @@ int Client::enumerate_layout(int fd, vector& result, Fh *f = get_filehandle(fd); if (!f) return -EBADF; - Inode *in = f->inode; + Inode *in = f->inode.get(); // map to a list of extents Striper::file_to_extents(cct, in->ino, &in->layout, offset, length, in->truncate_size, result); @@ -9977,6 +10801,250 @@ bool Client::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool return true; } +void Client::put_qtree(Inode *in) +{ + QuotaTree *qtree = in->qtree; + if (qtree) { + qtree->invalidate(); + in->qtree = NULL; + } +} + +void Client::invalidate_quota_tree(Inode *in) +{ + QuotaTree *qtree = in->qtree; + if (qtree) { + ldout(cct, 10) << "invalidate quota tree node " << *in << dendl; + if (qtree->parent_ref()) { + assert(in->is_dir()); + ldout(cct, 15) << "invalidate quota tree ancestor " << *in << dendl; + Inode *ancestor = qtree->ancestor()->in(); + if (ancestor) + put_qtree(ancestor); + } + put_qtree(in); + } +} + +Inode *Client::get_quota_root(Inode *in) +{ + if (!cct->_conf->client_quota) + return NULL; + + QuotaTree *ancestor = NULL; + QuotaTree *parent = NULL; + + vector inode_list; + while (in) { + if (in->qtree && in->qtree->ancestor()->in()) { + ancestor = in->qtree->ancestor(); + parent = in->qtree; + break; + } + + inode_list.push_back(in); + + if (!in->dn_set.empty()) + in = in->get_first_parent()->dir->parent_inode; + else if (root_parents.count(in)) + in = root_parents[in].get(); + else + in = NULL; + } + + if (!in) { + assert(!parent && !ancestor); + assert(root_ancestor->qtree == NULL); + root_ancestor->qtree = ancestor = new QuotaTree(root_ancestor); + ancestor->set_ancestor(ancestor); + parent = ancestor; + } + assert(parent && ancestor); + + for (vector::reverse_iterator iter = inode_list.rbegin(); + iter != inode_list.rend(); ++iter) { + Inode *cur = *iter; + + if (!cur->qtree) + cur->qtree = new QuotaTree(cur); + + cur->qtree->set_parent(parent); + if (parent->in()->quota.is_enable()) + ancestor = parent; + cur->qtree->set_ancestor(ancestor); + + ldout(cct, 20) << "link quota tree " << cur->ino + << " to parent (" << parent->in()->ino << ")" + << " ancestor (" << ancestor->in()->ino << ")" << dendl; + + parent = cur->qtree; + if (cur->quota.is_enable()) + ancestor = cur->qtree; + } + + return ancestor->in(); +} + +bool Client::is_quota_files_exceeded(Inode *in) +{ + if (!cct->_conf->client_quota) + return false; + + while (in != root_ancestor) { + quota_info_t *quota = &in->quota; + nest_info_t *rstat = &in->rstat; + + if (quota->max_files && rstat->rsize() >= quota->max_files) + return true; + + in = get_quota_root(in); + } + return false; +} + +bool Client::is_quota_bytes_exceeded(Inode *in, int64_t new_bytes) +{ + if (!cct->_conf->client_quota) + return false; + + while (in != root_ancestor) { + quota_info_t *quota = &in->quota; + nest_info_t *rstat = &in->rstat; + + if (quota->max_bytes && (rstat->rbytes + new_bytes) > quota->max_bytes) + return true; + + in = get_quota_root(in); + } + return false; +} + +bool Client::is_quota_bytes_approaching(Inode *in) +{ + if (!cct->_conf->client_quota) + return false; + + while (in != root_ancestor) { + quota_info_t *quota = &in->quota; + nest_info_t *rstat = &in->rstat; + + if (quota->max_bytes) { + if (rstat->rbytes >= quota->max_bytes) + return true; + + assert(in->size >= in->reported_size); + uint64_t space = quota->max_bytes - rstat->rbytes; + uint64_t size = in->size - in->reported_size; + if ((space >> 4) < size) + return true; + } + + in = get_quota_root(in); + } + 
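// Every quota root above this inode still has headroom: the
+  // (space >> 4) < size test above only fires once the bytes written
+  // since the last size report exceed 1/16th of the remaining quota.
+ 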
return false; +} + +enum { + POOL_CHECKED = 1, + POOL_CHECKING = 2, + POOL_READ = 4, + POOL_WRITE = 8, +}; + +int Client::check_pool_perm(Inode *in, int need) +{ + if (!cct->_conf->client_check_pool_perm) + return 0; + + // Check (and cache, per pool) whether our OSD caps allow the requested + // access, by probing the file's first object with a stat and an + // exclusive create and interpreting the error codes. + int64_t pool = in->layout.fl_pg_pool; + int have = 0; + while (true) { + std::map<int64_t, int>::iterator it = pool_perms.find(pool); + if (it == pool_perms.end()) + break; + if (it->second == POOL_CHECKING) { + // avoid concurrent checks of the same pool + wait_on_list(waiting_for_pool_perm); + } else { + have = it->second; + assert(have & POOL_CHECKED); + break; + } + } + + if (!have) { + pool_perms[pool] = POOL_CHECKING; + + char oid_buf[32]; + snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (unsigned long long)in->ino); + object_t oid = oid_buf; + + C_SaferCond rd_cond; + ObjectOperation rd_op; + rd_op.stat(NULL, (utime_t*)NULL, NULL); + + objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), rd_op, + in->snaprealm->get_snap_context(), ceph_clock_now(cct), 0, + &rd_cond, NULL); + + C_SaferCond wr_cond; + ObjectOperation wr_op; + wr_op.create(true); + + objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), wr_op, + in->snaprealm->get_snap_context(), ceph_clock_now(cct), 0, + &wr_cond, NULL); + + client_lock.Unlock(); + int rd_ret = rd_cond.wait(); + int wr_ret = wr_cond.wait(); + client_lock.Lock(); + + bool errored = false; + + if (rd_ret == 0 || rd_ret == -ENOENT) + have |= POOL_READ; + else if (rd_ret != -EPERM) { + ldout(cct, 10) << "check_pool_perm on pool " << pool + << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl; + errored = true; + } + + if (wr_ret == 0 || wr_ret == -EEXIST) + have |= POOL_WRITE; + else if (wr_ret != -EPERM) { + ldout(cct, 10) << "check_pool_perm on pool " << pool + << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl; + errored = true; + } + + if (errored) { + // Indeterminate: erase CHECKING state so that subsequent calls re-check. + // Raise EIO because the actual error code might be misleading to the + // userspace filesystem user. + pool_perms.erase(pool); + signal_cond_list(waiting_for_pool_perm); + return -EIO; + } + + pool_perms[pool] = have | POOL_CHECKED; + signal_cond_list(waiting_for_pool_perm); + } + + if ((need & CEPH_CAP_FILE_RD) && !(have & POOL_READ)) { + ldout(cct, 10) << "check_pool_perm on pool " << pool + << " need " << ccap_string(need) << ", but no read perm" << dendl; + return -EPERM; + } + if ((need & CEPH_CAP_FILE_WR) && !(have & POOL_WRITE)) { + ldout(cct, 10) << "check_pool_perm on pool " << pool + << " need " << ccap_string(need) << ", but no write perm" << dendl; + return -EPERM; + } + + return 0; +} + void Client::set_filer_flags(int flags) { Mutex::Locker l(client_lock); @@ -9992,3 +11060,45 @@ void Client::clear_filer_flags(int flags) objecter->clear_global_op_flag(flags); } +/** + * This is included in cap release messages, to cause + * the MDS to wait until this OSD map epoch. It is necessary + * in corner cases where we cancel RADOS ops, so that + * nobody else tries to do IO to the same objects in + * the same epoch as the cancelled ops. 
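+ * (For example, when a pool goes full this client may cancel its
+ * in-flight writes with -ENOSPC. If the MDS handed the file's caps to
+ * another client before seeing this barrier, that client could write
+ * to the same objects in the same epoch as the cancelled ops.)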
+ */ +void Client::set_cap_epoch_barrier(epoch_t e) +{ + ldout(cct, 5) << __func__ << " epoch = " << e << dendl; + cap_epoch_barrier = e; +} + +const char** Client::get_tracked_conf_keys() const +{ + static const char* keys[] = { + "client_cache_size", + "client_cache_mid", + NULL + }; + return keys; +} + +void Client::handle_conf_change(const struct md_config_t *conf, + const std::set &changed) +{ + if (changed.count("client_cache_size") || + changed.count("client_cache_mid")) { + lru.lru_set_max(cct->_conf->client_cache_size); + lru.lru_set_midpoint(cct->_conf->client_cache_mid); + } +} + +void intrusive_ptr_add_ref(Inode *in) +{ + in->get(); +} + +void intrusive_ptr_release(Inode *in) +{ + in->client->put_inode(in); +} diff --git a/src/client/Client.h b/src/client/Client.h index ada58e248081d..78a2da7ee04a8 100644 --- a/src/client/Client.h +++ b/src/client/Client.h @@ -47,12 +47,13 @@ using std::fstream; #include "common/Mutex.h" #include "common/Timer.h" #include "common/Finisher.h" - #include "common/compiler_extensions.h" #include "common/cmdparse.h" #include "osdc/ObjectCacher.h" +#include "InodeRef.h" + class MDSMap; class MonClient; @@ -80,8 +81,6 @@ enum { l_c_first = 20000, l_c_reply, l_c_lat, - l_c_owrlat, - l_c_ordlat, l_c_wrlat, l_c_last, }; @@ -119,7 +118,6 @@ struct DirEntry { DirEntry(const string &n, struct stat& s, int stm) : d_name(n), st(s), stmask(stm) {} }; -struct Inode; struct Cap; class Dir; class Dentry; @@ -136,10 +134,20 @@ typedef void (*client_ino_callback_t)(void *handle, vinodeno_t ino, int64_t off, typedef void (*client_dentry_callback_t)(void *handle, vinodeno_t dirino, vinodeno_t ino, string& name); +typedef int (*client_remount_callback_t)(void *handle); typedef int (*client_getgroups_callback_t)(void *handle, uid_t uid, gid_t **sgids); typedef void(*client_switch_interrupt_callback_t)(void *req, void *data); +struct client_callback_args { + void *handle; + client_ino_callback_t ino_cb; + client_dentry_callback_t dentry_cb; + client_switch_interrupt_callback_t switch_intr_cb; + client_remount_callback_t remount_cb; + client_getgroups_callback_t getgroups_cb; +}; + // ======================================================== // client interface @@ -159,7 +167,7 @@ struct dir_result_t { } - Inode *inode; + InodeRef inode; int64_t offset; // high bits: frag_t, low bits: an offset @@ -172,7 +180,7 @@ struct dir_result_t { int start_shared_gen; // dir shared_gen at start of readdir frag_t buffer_frag; - vector > *buffer; + vector > *buffer; string at_cache_name; // last entry we successfully returned @@ -206,9 +214,9 @@ struct dir_result_t { } }; -class Client : public Dispatcher { +class Client : public Dispatcher, public md_config_obs_t { public: - CephContext *cct; + using Dispatcher::cct; PerfCounters *logger; @@ -226,20 +234,19 @@ class Client : public Dispatcher { SafeTimer timer; + void *callback_handle; client_switch_interrupt_callback_t switch_interrupt_cb; - + client_remount_callback_t remount_cb; client_ino_callback_t ino_invalidate_cb; - void *ino_invalidate_cb_handle; - client_dentry_callback_t dentry_invalidate_cb; - void *dentry_invalidate_cb_handle; - client_getgroups_callback_t getgroups_cb; - void *getgroups_cb_handle; + bool can_invalidate_dentries; + bool require_remount; Finisher async_ino_invalidator; Finisher async_dentry_invalidator; Finisher interrupt_finisher; + Finisher remount_finisher; Finisher objecter_finisher; Context *tick_event; @@ -255,6 +262,9 @@ class Client : public Dispatcher { Messenger *messenger; client_t whoami; + 
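// OSD epoch that outgoing cap releases must wait for; see the comment
+  // on Client::set_cap_epoch_barrier() in Client.cc.
+ 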
void set_cap_epoch_barrier(epoch_t e); + epoch_t cap_epoch_barrier; + // mds sessions map mds_sessions; // mds -> push seq list waiting_for_mdsmap; @@ -280,7 +290,9 @@ class Client : public Dispatcher { void resend_unsafe_requests(MetaSession *s); // mds requests - ceph_tid_t last_tid, last_flush_seq; + ceph_tid_t last_tid; + ceph_tid_t oldest_tid; // oldest incomplete mds request, excluding setfilelock requests + ceph_tid_t last_flush_seq; map mds_requests; void dump_mds_requests(Formatter *f); @@ -288,12 +300,13 @@ class Client : public Dispatcher { int make_request(MetaRequest *req, int uid, int gid, //MClientRequest *req, int uid, int gid, - Inode **ptarget = 0, bool *pcreated = 0, + InodeRef *ptarget = 0, bool *pcreated = 0, int use_mds=-1, bufferlist *pdirbl=0); void put_request(MetaRequest *request); + void unregister_request(MetaRequest *request); int verify_reply_trace(int r, MetaRequest *request, MClientReply *reply, - Inode **ptarget, bool *pcreated, int uid, int gid); + InodeRef *ptarget, bool *pcreated, int uid, int gid); void encode_cap_releases(MetaRequest *request, mds_rank_t mds); int encode_inode_release(Inode *in, MetaRequest *req, mds_rank_t mds, int drop, @@ -302,12 +315,14 @@ class Client : public Dispatcher { mds_rank_t mds, int drop, int unless); mds_rank_t choose_target_mds(MetaRequest *req); void connect_mds_targets(mds_rank_t mds); - void send_request(MetaRequest *request, MetaSession *session); + void send_request(MetaRequest *request, MetaSession *session, + bool drop_cap_releases=false); MClientRequest *build_client_request(MetaRequest *request); void kick_requests(MetaSession *session); void kick_requests_closed(MetaSession *session); void handle_client_request_forward(MClientRequestForward *reply); void handle_client_reply(MClientReply *reply); + bool is_dir_operation(MetaRequest *request); bool initialized; bool authenticated; @@ -321,7 +336,7 @@ class Client : public Dispatcher { public: entity_name_t get_myname() { return messenger->get_myname(); } - void sync_write_commit(Inode *in); + void sync_write_commit(InodeRef& in); protected: Filer *filer; @@ -332,6 +347,8 @@ class Client : public Dispatcher { // cache ceph::unordered_map inode_map; Inode* root; + map root_parents; + Inode* root_ancestor; LRU lru; // lru list of Dentry's in our local metadata cache. // all inodes with caps sit on either cap_list or delayed_caps. 
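The pervasive Inode* to InodeRef conversions in this header are what the lifetime fixes in this patch hang on: InodeRef is the boost::intrusive_ptr alias declared in the new InodeRef.h further down, tied into the inode's existing refcount by the two hook functions at the end of Client.cc above. A minimal sketch of the mechanism (example() is illustrative and not part of the patch):

    #include <boost/intrusive_ptr.hpp>

    class Inode;
    void intrusive_ptr_add_ref(Inode *in);  // defined in Client.cc: calls in->get()
    void intrusive_ptr_release(Inode *in);  // defined in Client.cc: calls in->client->put_inode(in)
    typedef boost::intrusive_ptr<Inode> InodeRef;

    // Copying an InodeRef pins the inode; destroying the copy unpins it.
    // That is what lets a helper such as path_walk() return an inode that
    // stays valid even if client_lock is dropped and the cache is trimmed
    // before the caller is done with it.
    void example(InodeRef in)
    {
      InodeRef copy = in;  // intrusive_ptr_add_ref() -> Inode::get()
    }                      // copy destroyed -> intrusive_ptr_release()

Reusing the inode's own counter, rather than switching to shared_ptr, keeps raw Inode* code and InodeRef code interoperable during the incremental conversion.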
@@ -399,13 +416,15 @@ class Client : public Dispatcher { void put_inode(Inode *in, int n=1); void close_dir(Dir *dir); - friend class C_Client_PutInode; // calls put_inode() + friend class C_Client_FlushComplete; // calls put_inode() friend class C_Client_CacheInvalidate; // calls ino_invalidate_cb friend class C_Client_DentryInvalidate; // calls dentry_invalidate_cb friend class C_Block_Sync; // Calls block map and protected helpers friend class C_C_Tick; // Asserts on client_lock friend class C_Client_SyncCommit; // Asserts on client_lock friend class C_Client_RequestInterrupt; + friend class C_Client_Remount; + friend void intrusive_ptr_release(Inode *in); //int get_cache_size() { return lru.lru_get_size(); } //void set_cache_size(int m) { lru.lru_set_max(m); } @@ -419,20 +438,28 @@ class Client : public Dispatcher { void unlink(Dentry *dn, bool keepdir, bool keepdentry); // path traversal for high-level interface - Inode *cwd; - int path_walk(const filepath& fp, Inode **end, bool followsym=true); + InodeRef cwd; + int path_walk(const filepath& fp, InodeRef *end, bool followsym=true); int fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat=0, nest_info_t *rstat=0); + int fill_stat(InodeRef& in, struct stat *st, frag_info_t *dirstat=0, nest_info_t *rstat=0) { + return fill_stat(in.get(), st, dirstat, rstat); + } void touch_dn(Dentry *dn); // trim cache. - void trim_cache(); + void trim_cache(bool trim_kernel_dcache=false); void trim_cache_for_reconnect(MetaSession *s); void trim_dentry(Dentry *dn); void trim_caps(MetaSession *s, int max); - void _invalidate_kernel_dcache(MetaSession *s); + void _invalidate_kernel_dcache(); void dump_inode(Formatter *f, Inode *in, set& did, bool disconnected); void dump_cache(Formatter *f); // debug + + // force read-only + void force_session_readonly(MetaSession *s); + + void dump_status(Formatter *f); // debug // trace generation ofstream traceout; @@ -452,6 +479,25 @@ class Client : public Dispatcher { int authenticate(); + void put_qtree(Inode *in); + void invalidate_quota_tree(Inode *in); + Inode* get_quota_root(Inode *in); + bool is_quota_files_exceeded(Inode *in); + bool is_quota_bytes_exceeded(Inode *in, int64_t new_bytes); + bool is_quota_bytes_approaching(Inode *in); + + std::map pool_perms; + list waiting_for_pool_perm; + int check_pool_perm(Inode *in, int need); + + /** + * Call this when an OSDMap is seen with a full flag (global or per pool) + * set. + * + * @param pool the pool ID affected, or -1 if all. 
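+   * Handling entails failing this client's outstanding writes to the
+   * affected pool(s) with -ENOSPC and raising the cap epoch barrier to
+   * fence the cancelled ops.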
+ */ + void _handle_full_flag(int64_t pool); + public: void set_filer_flags(int flags); void clear_filer_flags(int flags); @@ -472,6 +518,7 @@ class Client : public Dispatcher { // messaging void handle_mds_map(class MMDSMap *m); + void handle_osd_map(class MOSDMap *m); void handle_lease(MClientLease *m); @@ -498,6 +545,7 @@ class Client : public Dispatcher { void maybe_update_snaprealm(SnapRealm *realm, snapid_t snap_created, snapid_t snap_highwater, vector& snaps); + void handle_quota(struct MClientQuota *m); void handle_snap(struct MClientSnap *m); void handle_caps(class MClientCaps *m); void handle_cap_import(MetaSession *session, Inode *in, class MClientCaps *m); @@ -513,19 +561,20 @@ class Client : public Dispatcher { void get_cap_ref(Inode *in, int cap); void put_cap_ref(Inode *in, int cap); void flush_snaps(Inode *in, bool all_again=false, CapSnap *again=0); + void wait_sync_caps(Inode *in, uint16_t flush_tid[]); void wait_sync_caps(uint64_t want); - void queue_cap_snap(Inode *in, snapid_t seq=0); + void queue_cap_snap(Inode *in, SnapContext &old_snapc); void finish_cap_snap(Inode *in, CapSnap *capsnap, int used); void _flushed_cap_snap(Inode *in, snapid_t seq); void _schedule_invalidate_dentry_callback(Dentry *dn, bool del); void _async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name); - void _invalidate_inode_parents(Inode *in); + void _try_to_trim_inode(Inode *in); void _schedule_invalidate_callback(Inode *in, int64_t off, int64_t len, bool keep_caps); void _invalidate_inode_cache(Inode *in); void _invalidate_inode_cache(Inode *in, int64_t off, int64_t len); - void _async_invalidate(Inode *in, int64_t off, int64_t len, bool keep_caps); + void _async_invalidate(InodeRef& in, int64_t off, int64_t len, bool keep_caps); void _release(Inode *in); /** @@ -539,7 +588,7 @@ class Client : public Dispatcher { * * @returns true if the data was already flushed, false otherwise. */ - bool _flush(Inode *in, Context *c=NULL); + bool _flush(Inode *in, Context *c); void _flush_range(Inode *in, int64_t off, uint64_t size); void _flushed(Inode *in); void flush_set_callback(ObjectCacher::ObjectSet *oset); @@ -594,14 +643,13 @@ class Client : public Dispatcher { Fh *_create_fh(Inode *in, int flags, int cmode); int _release_fh(Fh *fh); + void _put_fh(Fh *fh); struct C_Readahead : public Context { Client *client; Fh *f; - C_Readahead(Client *c, Fh *f) - : client(c), - f(f) { } + C_Readahead(Client *c, Fh *f); void finish(int r); }; @@ -610,32 +658,42 @@ class Client : public Dispatcher { // internal interface // call these with client_lock held! 
- int _do_lookup(Inode *dir, const string& name, Inode **target); - int _lookup(Inode *dir, const string& dname, Inode **target); + int _do_lookup(Inode *dir, const string& name, InodeRef *target); + int _lookup(Inode *dir, const string& dname, InodeRef *target); - int _link(Inode *in, Inode *dir, const char *name, int uid=-1, int gid=-1, Inode **inp = 0); + int _link(Inode *in, Inode *dir, const char *name, int uid=-1, int gid=-1, InodeRef *inp = 0); int _unlink(Inode *dir, const char *name, int uid=-1, int gid=-1); int _rename(Inode *olddir, const char *oname, Inode *ndir, const char *nname, int uid=-1, int gid=-1); - int _mkdir(Inode *dir, const char *name, mode_t mode, int uid=-1, int gid=-1, Inode **inp = 0); + int _mkdir(Inode *dir, const char *name, mode_t mode, int uid=-1, int gid=-1, InodeRef *inp = 0); int _rmdir(Inode *dir, const char *name, int uid=-1, int gid=-1); - int _symlink(Inode *dir, const char *name, const char *target, int uid=-1, int gid=-1, Inode **inp = 0); - int _mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev, int uid=-1, int gid=-1, Inode **inp = 0); - int _setattr(Inode *in, struct stat *attr, int mask, int uid=-1, int gid=-1, Inode **inp = 0); + int _symlink(Inode *dir, const char *name, const char *target, int uid=-1, int gid=-1, InodeRef *inp = 0); + int _mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev, int uid=-1, int gid=-1, InodeRef *inp = 0); + int _setattr(Inode *in, struct stat *attr, int mask, int uid=-1, int gid=-1, InodeRef *inp = 0); + int _setattr(InodeRef &in, struct stat *attr, int mask, int uid=-1, int gid=-1, InodeRef *inp = 0) { + return _setattr(in.get(), attr, mask, uid, gid, inp); + } int _getattr(Inode *in, int mask, int uid=-1, int gid=-1, bool force=false); + int _getattr(InodeRef &in, int mask, int uid=-1, int gid=-1, bool force=false) { + return _getattr(in.get(), mask, uid, gid, force); + } int _readlink(Inode *in, char *buf, size_t size); int _getxattr(Inode *in, const char *name, void *value, size_t len, int uid=-1, int gid=-1); int _listxattr(Inode *in, char *names, size_t len, int uid=-1, int gid=-1); int _setxattr(Inode *in, const char *name, const void *value, size_t len, int flags, int uid=-1, int gid=-1); int _removexattr(Inode *in, const char *nm, int uid=-1, int gid=-1); int _open(Inode *in, int flags, mode_t mode, Fh **fhp, int uid=-1, int gid=-1); - int _create(Inode *in, const char *name, int flags, mode_t mode, Inode **inp, Fh **fhp, + int _create(Inode *in, const char *name, int flags, mode_t mode, InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count, int object_size, const char *data_pool, bool *created = NULL, int uid=-1, int gid=-1); + loff_t _lseek(Fh *fh, loff_t offset, int whence); int _read(Fh *fh, int64_t offset, uint64_t size, bufferlist *bl); - int _write(Fh *fh, int64_t offset, uint64_t size, const char *buf); + int _write(Fh *fh, int64_t offset, uint64_t size, const char *buf, + const struct iovec *iov, int iovcnt); + int _preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write); int _flush(Fh *fh); int _fsync(Fh *fh, bool syncdataonly); + int _fsync(Inode *in, bool syncdataonly); int _sync_fs(); int _fallocate(Fh *fh, int mode, int64_t offset, int64_t length); int _getlk(Fh *fh, struct flock *fl, uint64_t owner); @@ -661,6 +719,11 @@ class Client : public Dispatcher { bool (Client::*exists_cb)(Inode *in); }; + bool _vxattrcb_quota_exists(Inode *in); + size_t _vxattrcb_quota(Inode *in, char *val, size_t size); + size_t 
_vxattrcb_quota_max_bytes(Inode *in, char *val, size_t size); + size_t _vxattrcb_quota_max_files(Inode *in, char *val, size_t size); + bool _vxattrcb_layout_exists(Inode *in); size_t _vxattrcb_layout(Inode *in, char *val, size_t size); size_t _vxattrcb_layout_stripe_unit(Inode *in, char *val, size_t size); @@ -782,6 +845,7 @@ class Client : public Dispatcher { int lchown(const char *path, int uid, int gid); int utime(const char *path, struct utimbuf *buf); int lutime(const char *path, struct utimbuf *buf); + int flock(int fd, int operation, uint64_t owner); int truncate(const char *path, loff_t size); // file ops @@ -795,7 +859,9 @@ class Client : public Dispatcher { int close(int fd); loff_t lseek(int fd, loff_t offset, int whence); int read(int fd, char *buf, loff_t size, loff_t offset=-1); + int preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset=-1); int write(int fd, const char *buf, loff_t size, loff_t offset=-1); + int pwritev(int fd, const struct iovec *iov, int iovcnt, loff_t offset=-1); int fake_write_size(int fd, loff_t size); int ftruncate(int fd, loff_t size); int fsync(int fd, bool syncdataonly); @@ -805,12 +871,16 @@ class Client : public Dispatcher { // full path xattr ops int getxattr(const char *path, const char *name, void *value, size_t size); int lgetxattr(const char *path, const char *name, void *value, size_t size); + int fgetxattr(int fd, const char *name, void *value, size_t size); int listxattr(const char *path, char *list, size_t size); int llistxattr(const char *path, char *list, size_t size); + int flistxattr(int fd, char *list, size_t size); int removexattr(const char *path, const char *name); int lremovexattr(const char *path, const char *name); + int fremovexattr(int fd, const char *name); int setxattr(const char *path, const char *name, const void *value, size_t size, int flags); int lsetxattr(const char *path, const char *name, const void *value, size_t size, int flags); + int fsetxattr(int fd, const char *name, const void *value, size_t size, int flags); int sync_fs(); int64_t drop_caches(); @@ -869,6 +939,7 @@ class Client : public Dispatcher { int ll_listxattr(Inode *in, char *list, size_t size, int uid=-1, int gid=-1); int ll_opendir(Inode *in, dir_result_t **dirpp, int uid = -1, int gid = -1); int ll_releasedir(dir_result_t* dirp); + int ll_fsyncdir(dir_result_t* dirp); int ll_readlink(Inode *in, char *buf, size_t bufsize, int uid = -1, int gid = -1); int ll_mknod(Inode *in, const char *name, mode_t mode, dev_t rdev, struct stat *attr, Inode **out, int uid = -1, int gid = -1); @@ -913,6 +984,7 @@ class Client : public Dispatcher { int ll_getlk(Fh *fh, struct flock *fl, uint64_t owner); int ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep, void *fuse_req); int ll_flock(Fh *fh, int cmd, uint64_t owner, void *fuse_req); + int ll_file_layout(Fh *fh, ceph_file_layout *layout); void ll_interrupt(void *d); int ll_get_stripe_osd(struct Inode *in, uint64_t blockno, ceph_file_layout* layout); @@ -922,10 +994,12 @@ class Client : public Dispatcher { int ll_osdaddr(int osd, uint32_t *addr); int ll_osdaddr(int osd, char* buf, size_t size); - void ll_register_ino_invalidate_cb(client_ino_callback_t cb, void *handle); - void ll_register_dentry_invalidate_cb(client_dentry_callback_t cb, void *handle); - void ll_register_getgroups_cb(client_getgroups_callback_t cb, void *handle); - void ll_register_switch_interrupt_cb(client_switch_interrupt_callback_t cb); + void ll_register_callbacks(struct client_callback_args *args); + int 
test_dentry_handling(bool can_invalidate); + + virtual const char** get_tracked_conf_keys() const; + virtual void handle_conf_change(const struct md_config_t *conf, + const std::set &changed); }; #endif diff --git a/src/client/Dentry.h b/src/client/Dentry.h index aad6343f8179f..198b375ce0988 100644 --- a/src/client/Dentry.h +++ b/src/client/Dentry.h @@ -5,17 +5,18 @@ #include "include/xlist.h" #include "mds/mdstypes.h" +#include "InodeRef.h" class Dir; struct Inode; class Dentry : public LRUObject { public: - string name; // sort of lame + string name; // sort of lame //const char *name; - Dir *dir; - Inode *inode; - int ref; // 1 if there's a dir beneath me. + Dir *dir; + InodeRef inode; + int ref; // 1 if there's a dir beneath me. uint64_t offset; mds_rank_t lease_mds; utime_t lease_ttl; @@ -47,7 +48,7 @@ class Dentry : public LRUObject { void dump(Formatter *f) const; Dentry() : - dir(0), inode(0), ref(1), offset(0), + dir(0), ref(1), offset(0), lease_mds(-1), lease_gen(0), lease_seq(0), cap_shared_gen(0), item_dentry_list(this) { } private: diff --git a/src/client/Fh.h b/src/client/Fh.h index 6f0aebd5d941a..db3a28c47a8fe 100644 --- a/src/client/Fh.h +++ b/src/client/Fh.h @@ -3,15 +3,16 @@ #include "common/Readahead.h" #include "include/types.h" +#include "InodeRef.h" -struct Inode; class Cond; class ceph_lock_state_t; // file handle for any open file state struct Fh { - Inode *inode; + InodeRef inode; + int _ref; loff_t pos; int mds; // have to talk to mds we opened with (for now) int mode; // the mode i opened the file with @@ -26,8 +27,10 @@ struct Fh { ceph_lock_state_t *fcntl_locks; ceph_lock_state_t *flock_locks; - Fh() : inode(0), pos(0), mds(0), mode(0), flags(0), pos_locked(false), + Fh() : _ref(1), pos(0), mds(0), mode(0), flags(0), pos_locked(false), readahead(), fcntl_locks(NULL), flock_locks(NULL) {} + void get() { ++_ref; } + int put() { return --_ref; } }; diff --git a/src/client/Inode.cc b/src/client/Inode.cc index 7ed61d71e25de..16eee7a7aa494 100644 --- a/src/client/Inode.cc +++ b/src/client/Inode.cc @@ -1,10 +1,11 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab -#include "MetaSession.h" +#include "Client.h" #include "Inode.h" #include "Dentry.h" #include "Dir.h" +#include "MetaSession.h" #include "ClientSnapRealm.h" ostream& operator<<(ostream &out, Inode &in) @@ -44,6 +45,9 @@ ostream& operator<<(ostream &out, Inode &in) if (in.is_dir() && in.has_dir_layout()) out << " has_dir_layout"; + if (in.quota.is_enable()) + out << " " << in.quota; + out << ' ' << &in << ")"; return out; } @@ -123,7 +127,7 @@ int Inode::put_cap_ref(int cap) if (cap & 1) { int c = 1 << n; if (cap_refs[c] <= 0) { - lderr(cct) << "put_cap_ref " << ccap_string(c) << " went negative on " << *this << dendl; + lderr(client->cct) << "put_cap_ref " << ccap_string(c) << " went negative on " << *this << dendl; assert(cap_refs[c] > 0); } if (--cap_refs[c] == 0) @@ -148,10 +152,10 @@ bool Inode::cap_is_valid(Cap* cap) << "cap expire " << cap->session->cap_ttl << std::endl << "cur time " << ceph_clock_now(cct) << std::endl;*/ if ((cap->session->cap_gen <= cap->gen) - && (ceph_clock_now(cct) < cap->session->cap_ttl)) { + && (ceph_clock_now(client->cct) < cap->session->cap_ttl)) { return true; } - return true; + return false; } int Inode::caps_issued(int *implemented) @@ -265,7 +269,7 @@ Dir *Inode::open_dir() { if (!dir) { dir = new Dir(this); - lsubdout(cct, mds, 15) << "open_dir " << dir << " on " << this << dendl; + lsubdout(client->cct, client, 15) << 
"open_dir " << dir << " on " << this << dendl; assert(dn_set.size() < 2); // dirs can't be hard-linked if (!dn_set.empty()) (*dn_set.begin())->get(); // pin dentry @@ -304,6 +308,21 @@ bool Inode::check_mode(uid_t ruid, gid_t rgid, gid_t *sgids, int sgids_count, ui return (mode & fmode) == fmode; } +void Inode::get() { + _ref++; + lsubdout(client->cct, client, 15) << "inode.get on " << this << " " << ino << '.' << snapid + << " now " << _ref << dendl; +} + +//private method to put a reference; see Client::put_inode() +int Inode::_put(int n) { + _ref -= n; + lsubdout(client->cct, client, 15) << "inode.put on " << this << " " << ino << '.' << snapid + << " now " << _ref << dendl; + assert(_ref >= 0); + return _ref; +} + void Inode::dump(Formatter *f) const { diff --git a/src/client/Inode.h b/src/client/Inode.h index 62cc0d016714e..f18f65272d931 100644 --- a/src/client/Inode.h +++ b/src/client/Inode.h @@ -13,12 +13,16 @@ #include "osdc/ObjectCacher.h" #include "include/assert.h" +#include "InodeRef.h" + +class Client; struct MetaSession; class Dentry; class Dir; struct SnapRealm; struct Inode; class ceph_lock_state_t; +class MetaRequest; struct Cap { MetaSession *session; @@ -41,7 +45,7 @@ struct Cap { struct CapSnap { //snapid_t follows; // map key - Inode *in; + InodeRef in; SnapContext context; int issued, dirty; @@ -54,6 +58,9 @@ struct CapSnap { map xattrs; version_t xattr_version; + bufferlist inline_data; + version_t inline_version; + bool writing, dirty_data; uint64_t flush_tid; xlist::item flushing_item; @@ -61,20 +68,93 @@ struct CapSnap { CapSnap(Inode *i) : in(i), issued(0), dirty(0), size(0), time_warp_seq(0), mode(0), uid(0), gid(0), xattr_version(0), - writing(false), dirty_data(false), flush_tid(0), + inline_version(0), writing(false), dirty_data(false), flush_tid(0), flushing_item(this) {} void dump(Formatter *f) const; }; +class QuotaTree { +private: + Inode *_in; + + int _ancestor_ref; + QuotaTree *_ancestor; + int _parent_ref; + QuotaTree *_parent; + + void _put() + { + if (!_in && !_ancestor_ref && !_parent_ref) { + set_parent(NULL); + set_ancestor(NULL); + delete this; + } + } + ~QuotaTree() {} +public: + QuotaTree(Inode *i) : + _in(i), + _ancestor_ref(0), + _ancestor(NULL), + _parent_ref(0), + _parent(NULL) + { assert(i); } + + Inode *in() { return _in; } + + int ancestor_ref() { return _ancestor_ref; } + int parent_ref() { return _parent_ref; } + + QuotaTree *ancestor() { return _ancestor; } + void set_ancestor(QuotaTree *ancestor) + { + if (ancestor == _ancestor) + return; + + if (_ancestor) { + --_ancestor->_ancestor_ref; + _ancestor->_put(); + } + _ancestor = ancestor; + if (_ancestor) + ++_ancestor->_ancestor_ref; + } + + QuotaTree *parent() { return _parent; } + void set_parent(QuotaTree *parent) + { + if (parent == _parent) + return; + + if (_parent) { + --_parent->_parent_ref; + _parent->_put(); + } + _parent = parent; + if (parent) + ++_parent->_parent_ref; + } + + void invalidate() + { + if (!_in) + return; + + _in = NULL; + set_ancestor(NULL); + set_parent(NULL); + _put(); + } +}; // inode flags #define I_COMPLETE 1 #define I_DIR_ORDERED 2 struct Inode { - CephContext *cct; + Client *client; // -- the actual inode -- inodeno_t ino; @@ -116,6 +196,7 @@ struct Inode { version_t inline_version; bufferlist inline_data; + bool is_root() const { return ino == MDS_INO_ROOT; } bool is_symlink() const { return (mode & S_IFMT) == S_IFLNK; } bool is_dir() const { return (mode & S_IFMT) == S_IFDIR; } bool is_file() const { return (mode & S_IFMT) == S_IFREG; } @@ -136,6 
+217,9 @@ struct Inode { unsigned flags; + quota_info_t quota; + QuotaTree* qtree; + bool is_complete_and_ordered() { static const unsigned wants = I_COMPLETE | I_DIR_ORDERED; return (flags & wants) == wants; @@ -159,7 +243,7 @@ SnapRealm *snaprealm; xlist<Inode*>::item snaprealm_item; - Inode *snapdir_parent; // only if we are a snapdir inode + InodeRef snapdir_parent; // only if we are a snapdir inode map<snapid_t,CapSnap*> cap_snaps; // pending flush to mds //int open_by_mode[CEPH_FILE_MODE_NUM]; @@ -190,19 +274,8 @@ void make_long_path(filepath& p); void make_nosnap_relative_path(filepath& p); - void get() { - _ref++; - lsubdout(cct, mds, 15) << "inode.get on " << this << " " << ino << '.' << snapid - << " now " << _ref << dendl; - } - /// private method to put a reference; see Client::put_inode() - int _put(int n=1) { - _ref -= n; - lsubdout(cct, mds, 15) << "inode.put on " << this << " " << ino << '.' << snapid - << " now " << _ref << dendl; - assert(_ref >= 0); - return _ref; - } + void get(); + int _put(int n=1); int get_num_ref() { return _ref; @@ -220,26 +293,31 @@ ceph_lock_state_t *fcntl_locks; ceph_lock_state_t *flock_locks; - Inode(CephContext *cct_, vinodeno_t vino, ceph_file_layout *newlayout) - : cct(cct_), ino(vino.ino), snapid(vino.snapid), + xlist<MetaRequest*> unsafe_dir_ops; + + Inode(Client *c, vinodeno_t vino, ceph_file_layout *newlayout) + : client(c), ino(vino.ino), snapid(vino.snapid), rdev(0), mode(0), uid(0), gid(0), nlink(0), size(0), truncate_seq(1), truncate_size(-1), time_warp_seq(0), max_size(0), version(0), xattr_version(0), inline_version(0), flags(0), + qtree(NULL), dir_hashed(false), dir_replicated(false), auth_cap(NULL), dirty_caps(0), flushing_caps(0), flushing_cap_seq(0), shared_gen(0), cache_gen(0), snap_caps(0), snap_cap_refs(0), cap_item(this), flushing_cap_item(this), last_flush_tid(0), - snaprealm(0), snaprealm_item(this), snapdir_parent(0), + snaprealm(0), snaprealm_item(this), oset((void *)this, newlayout->fl_pg_pool, ino), reported_size(0), wanted_max_size(0), requested_max_size(0), _ref(0), ll_ref(0), dir(0), dn_set(), - fcntl_locks(NULL), flock_locks(NULL) + fcntl_locks(NULL), flock_locks(NULL), + async_err(0) { memset(&dir_layout, 0, sizeof(dir_layout)); memset(&layout, 0, sizeof(layout)); memset(&flushing_cap_tid, 0, sizeof(__u16)*CEPH_CAP_BITS); + memset(&quota, 0, sizeof(quota)); } ~Inode() { } @@ -276,6 +354,9 @@ bool have_valid_size(); Dir *open_dir(); + // Record errors to be exposed in fclose/fflush + int async_err; + void dump(Formatter *f) const; }; diff --git a/src/client/InodeRef.h b/src/client/InodeRef.h new file mode 100644 index 0000000000000..822ec0ffcdc96 --- /dev/null +++ b/src/client/InodeRef.h @@ -0,0 +1,12 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_CLIENT_INODEREF_H +#define CEPH_CLIENT_INODEREF_H + +#include <boost/intrusive_ptr.hpp> +class Inode; +void intrusive_ptr_add_ref(Inode *in); +void intrusive_ptr_release(Inode *in); +typedef boost::intrusive_ptr<Inode> InodeRef; +#endif diff --git a/src/client/Makefile.am b/src/client/Makefile.am index 53107eba51742..5ef480b8c09bb 100644 --- a/src/client/Makefile.am +++ b/src/client/Makefile.am @@ -1,3 +1,4 @@ +if ENABLE_CLIENT libclient_la_SOURCES = \ client/Client.cc \ client/Inode.cc \ @@ -15,6 +16,7 @@ noinst_HEADERS += \ client/Dir.h \ client/Fh.h \ client/Inode.h \ + client/InodeRef.h \ client/MetaRequest.h \ client/MetaSession.h \ client/ClientSnapRealm.h \ @@ -33,3 +35,4 @@ endif ceph_test_ioctls_SOURCES = 
client/test_ioctls.c bin_DEBUGPROGRAMS += ceph_test_ioctls +endif # ENABLE_CLIENT diff --git a/src/client/MetaRequest.cc b/src/client/MetaRequest.cc index c8c4552d007a3..330edde1c851c 100644 --- a/src/client/MetaRequest.cc +++ b/src/client/MetaRequest.cc @@ -57,9 +57,6 @@ void MetaRequest::dump(Formatter *f) const MetaRequest::~MetaRequest() { - assert(!_inode); - assert(!_old_inode); - assert(!_other_inode); if (_dentry) _dentry->put(); if (_old_dentry) @@ -68,33 +65,6 @@ MetaRequest::~MetaRequest() reply->put(); } -void MetaRequest::set_inode(Inode *in) { - assert(_inode == NULL); - _inode = in; - _inode->get(); -} -Inode *MetaRequest::inode() { - return _inode; -} - -void MetaRequest::set_old_inode(Inode *in) { - assert(_old_inode == NULL); - _old_inode = in; - _old_inode->get(); -} -Inode *MetaRequest::old_inode() { - return _old_inode; -} - -void MetaRequest::set_other_inode(Inode *in) { - assert(_other_inode == NULL); - _other_inode = in; - _other_inode->get(); -} -Inode *MetaRequest::other_inode() { - return _other_inode; -} - void MetaRequest::set_dentry(Dentry *d) { assert(_dentry == NULL); _dentry = d; diff --git a/src/client/MetaRequest.h b/src/client/MetaRequest.h index 24b2f54cef4e6..2cf13c2d6a988 100644 --- a/src/client/MetaRequest.h +++ b/src/client/MetaRequest.h @@ -11,19 +11,18 @@ #include "include/filepath.h" #include "include/atomic.h" #include "mds/mdstypes.h" +#include "InodeRef.h" #include "common/Mutex.h" #include "messages/MClientRequest.h" class MClientReply; -struct Inode; class Dentry; struct MetaRequest { private: - Inode *_inode; - Inode *_old_inode, *_other_inode; + InodeRef _inode, _old_inode, _other_inode; Dentry *_dentry; //associated with path Dentry *_old_dentry; //associated with path2 public: @@ -53,6 +52,8 @@ struct MetaRequest { MClientReply *reply; // the reply bool kick; + bool aborted; + bool success; // readdir result frag_t readdir_frag; @@ -60,7 +61,7 @@ struct MetaRequest { uint64_t readdir_offset; frag_t readdir_reply_frag; - vector > readdir_result; + vector > readdir_result; bool readdir_end; int readdir_num; string readdir_last_name; @@ -70,15 +71,16 @@ struct MetaRequest { xlist::item item; xlist::item unsafe_item; + xlist::item unsafe_dir_item; Mutex lock; //for get/set sync Cond *caller_cond; // who to take up Cond *dispatch_cond; // who to kick back + list waitfor_safe; - Inode *target; + InodeRef target; MetaRequest(int op) : - _inode(NULL), _old_inode(NULL), _other_inode(NULL), _dentry(NULL), _old_dentry(NULL), tid(0), inode_drop(0), inode_unless(0), @@ -90,37 +92,42 @@ struct MetaRequest { mds(-1), resend_mds(-1), send_to_auth(false), sent_on_mseq(0), num_fwd(0), retry_attempt(0), ref(1), reply(0), - kick(false), + kick(false), aborted(false), success(false), readdir_offset(0), readdir_end(false), readdir_num(0), - got_unsafe(false), item(this), unsafe_item(this), + got_unsafe(false), item(this), unsafe_item(this), unsafe_dir_item(this), lock("MetaRequest lock"), - caller_cond(0), dispatch_cond(0), - target(0) { + caller_cond(0), dispatch_cond(0) { memset(&head, 0, sizeof(ceph_mds_request_head)); head.op = op; } ~MetaRequest(); - void set_inode(Inode *in); - Inode *inode(); - Inode *take_inode() { - Inode *i = _inode; - _inode = 0; - return i; - } - void set_old_inode(Inode *in); - Inode *old_inode(); - Inode *take_old_inode() { - Inode *i = _old_inode; - _old_inode = NULL; - return i; - } - void set_other_inode(Inode *in); - Inode *other_inode(); - Inode *take_other_inode() { - Inode *i = _other_inode; - _other_inode = 0; - return 
i; + void set_inode(Inode *in) { + _inode = in; + } + Inode *inode() { + return _inode.get(); + } + void take_inode(InodeRef *out) { + out->swap(_inode); + } + void set_old_inode(Inode *in) { + _old_inode = in; + } + Inode *old_inode() { + return _old_inode.get(); + } + void take_old_inode(InodeRef *out) { + out->swap(_old_inode); + } + void set_other_inode(Inode *in) { + _other_inode = in; + } + Inode *other_inode() { + return _other_inode.get(); + } + void take_other_inode(InodeRef *out) { + out->swap(_other_inode); } void set_dentry(Dentry *d); Dentry *dentry(); diff --git a/src/client/MetaSession.cc b/src/client/MetaSession.cc index 9f2a136bd55f3..03752d246ae93 100644 --- a/src/client/MetaSession.cc +++ b/src/client/MetaSession.cc @@ -2,6 +2,7 @@ // vim: ts=8 sw=2 smarttab #include "include/types.h" +#include "messages/MClientCapRelease.h" #include "MetaSession.h" @@ -38,3 +39,22 @@ MetaSession::~MetaSession() if (release) release->put(); } + +// Append a cap release to this session's pending MClientCapRelease message, +// raising its OSD epoch barrier if needed. +void MetaSession::enqueue_cap_release(inodeno_t ino, uint64_t cap_id, ceph_seq_t iseq, + ceph_seq_t mseq, epoch_t osd_barrier) +{ + if (!release) { + release = new MClientCapRelease; + } + + if (osd_barrier > release->osd_epoch_barrier) { + release->osd_epoch_barrier = osd_barrier; + } + + ceph_mds_cap_item i; + i.ino = ino; + i.cap_id = cap_id; + i.seq = iseq; + i.migrate_seq = mseq; + release->caps.push_back(i); +} diff --git a/src/client/MetaSession.h b/src/client/MetaSession.h index 6eb813cb22d4e..36b5814c7c67b 100644 --- a/src/client/MetaSession.h +++ b/src/client/MetaSession.h @@ -37,6 +37,8 @@ struct MetaSession { STATE_STALE, } state; + bool readonly; + list<Context*> waiting_for_open; xlist<Cap*> caps; @@ -52,7 +54,7 @@ MetaSession() : mds_num(-1), con(NULL), seq(0), cap_gen(0), cap_renew_seq(0), num_caps(0), - state(STATE_NEW), s_cap_iterator(NULL), + state(STATE_NEW), readonly(false), s_cap_iterator(NULL), release(NULL) {} ~MetaSession(); @@ -60,6 +62,9 @@ const char *get_state_name() const; void dump(Formatter *f) const; + + void enqueue_cap_release(inodeno_t ino, uint64_t cap_id, ceph_seq_t iseq, + ceph_seq_t mseq, epoch_t osd_barrier); }; #endif diff --git a/src/client/ObjecterWriteback.h b/src/client/ObjecterWriteback.h index b7e4bd2571285..b9e6f9c3cf1da 100644 --- a/src/client/ObjecterWriteback.h +++ b/src/client/ObjecterWriteback.h @@ -14,10 +14,10 @@ class ObjecterWriteback : public WritebackHandler { m_lock(lock) { } virtual ~ObjecterWriteback() {} - virtual void read(const object_t& oid, const object_locator_t& oloc, - uint64_t off, uint64_t len, snapid_t snapid, - bufferlist *pbl, uint64_t trunc_size, __u32 trunc_seq, - Context *onfinish) { + virtual void read(const object_t& oid, uint64_t object_no, + const object_locator_t& oloc, uint64_t off, uint64_t len, + snapid_t snapid, bufferlist *pbl, uint64_t trunc_size, + __u32 trunc_seq, int op_flags, Context *onfinish) { m_objecter->read_trunc(oid, oloc, off, len, snapid, pbl, 0, trunc_size, trunc_seq, new C_OnFinisher(new C_Lock(m_lock, onfinish), diff --git a/src/client/SyntheticClient.cc b/src/client/SyntheticClient.cc index df4e4886b634c..fa92f620f598c 100644 --- a/src/client/SyntheticClient.cc +++ b/src/client/SyntheticClient.cc @@ -601,7 +601,7 @@ int SyntheticClient::run() int size = iargs.front(); iargs.pop_front(); int inflight = iargs.front(); iargs.pop_front(); if (run_me()) { - dout(2) << "createobjects " << cout << " of " << size << " bytes" + dout(2) << "createobjects " << count << " of " << size << " bytes" << ", " << inflight << " in 
flight" << dendl; create_objects(count, size, inflight); } @@ -617,7 +617,7 @@ int SyntheticClient::run() int rskew = iargs.front(); iargs.pop_front(); int wskew = iargs.front(); iargs.pop_front(); if (run_me()) { - dout(2) << "objectrw " << cout << " " << size << " " << wrpc + dout(2) << "objectrw " << count << " " << size << " " << wrpc << " " << overlap << " " << rskew << " " << wskew << dendl; object_rw(count, size, wrpc, overlap, rskew, wskew); } @@ -2389,12 +2389,6 @@ int SyntheticClient::object_rw(int nobj, int osize, int wrpc, utime_t lat = ceph_clock_now(client->cct); lat -= start; - if (client->logger) { - if (write) - client->logger->tset(l_c_owrlat, lat); - else - client->logger->tset(l_c_ordlat, lat); - } } return 0; diff --git a/src/client/fuse_ll.cc b/src/client/fuse_ll.cc index 8161485dc7511..0c9becd6d13ef 100644 --- a/src/client/fuse_ll.cc +++ b/src/client/fuse_ll.cc @@ -464,8 +464,10 @@ static void fuse_ll_write(fuse_req_t req, fuse_ino_t ino, const char *buf, static void fuse_ll_flush(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) { - // NOOP - fuse_reply_err(req, 0); + CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req); + Fh *fh = reinterpret_cast(fi->fh); + int r = cfuse->client->ll_flush(fh); + fuse_reply_err(req, -r); } #ifdef FUSE_IOCTL_COMPAT @@ -484,7 +486,7 @@ static void fuse_ll_ioctl(fuse_req_t req, fuse_ino_t ino, int cmd, void *arg, st struct ceph_file_layout layout; struct ceph_ioctl_layout l; Fh *fh = (Fh*)fi->fh; - cfuse->client->ll_file_layout(fh->inode, &layout); + cfuse->client->ll_file_layout(fh, &layout); l.stripe_unit = layout.fl_stripe_unit; l.stripe_count = layout.fl_stripe_count; l.object_size = layout.fl_object_size; @@ -594,6 +596,15 @@ static void fuse_ll_releasedir(fuse_req_t req, fuse_ino_t ino, fuse_reply_err(req, 0); } +static void fuse_ll_fsyncdir(fuse_req_t req, fuse_ino_t ino, int datasync, + struct fuse_file_info *fi) +{ + CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req); + dir_result_t *dirp = reinterpret_cast(fi->fh); + int r = cfuse->client->ll_fsyncdir(dirp); + fuse_reply_err(req, -r); +} + static void fuse_ll_access(fuse_req_t req, fuse_ino_t ino, int mask) { fuse_reply_err(req, 0); @@ -755,6 +766,21 @@ static void dentry_invalidate_cb(void *handle, vinodeno_t dirino, #endif } +static int remount_cb(void *handle) +{ + // used for trimming kernel dcache. 
when remounting a file system, the Linux kernel + // trims all unused dentries in the file system + char cmd[1024]; + CephFuse::Handle *cfuse = (CephFuse::Handle *)handle; + snprintf(cmd, sizeof(cmd), "mount -i -o remount %s", cfuse->mountpoint); + int r = system(cmd); + if (r != 0 && r != -1) { + r = WEXITSTATUS(r); + } + + return r; +} + static void do_init(void *data, fuse_conn_info *bar) { CephFuse::Handle *cfuse = (CephFuse::Handle *)data; @@ -800,7 +826,7 @@ const static struct fuse_lowlevel_ops fuse_ll_oper = { opendir: fuse_ll_opendir, readdir: fuse_ll_readdir, releasedir: fuse_ll_releasedir, - fsyncdir: 0, + fsyncdir: fuse_ll_fsyncdir, statfs: fuse_ll_statfs, setxattr: fuse_ll_setxattr, getxattr: fuse_ll_getxattr, @@ -852,8 +878,6 @@ CephFuse::Handle::~Handle() void CephFuse::Handle::finalize() { - client->ll_register_ino_invalidate_cb(NULL, NULL); - if (se) fuse_remove_signal_handlers(se); if (ch) @@ -938,25 +962,26 @@ int CephFuse::Handle::start() fuse_session_add_chan(se, ch); - client->ll_register_switch_interrupt_cb(switch_interrupt_cb); - - /* - * this is broken: - * - * - the cb needs the request handle to be useful; we should get the - * gids in the method here in fuse_ll.c and pass the gid list in, - * not use a callback. - * - the callback mallocs the list but it is not free()'d - * - * so disable it for now... - - client->ll_register_getgroups_cb(getgroups_cb, this); - - */ - client->ll_register_dentry_invalidate_cb(dentry_invalidate_cb, this); - if (client->cct->_conf->fuse_use_invalidate_cb) - client->ll_register_ino_invalidate_cb(ino_invalidate_cb, this); + struct client_callback_args args = { + handle: this, + ino_cb: client->cct->_conf->fuse_use_invalidate_cb ? ino_invalidate_cb : NULL, + dentry_cb: dentry_invalidate_cb, + switch_intr_cb: switch_interrupt_cb, + remount_cb: remount_cb, + /* + * this is broken: + * + * - the cb needs the request handle to be useful; we should get the + * gids in the method here in fuse_ll.c and pass the gid list in, + * not use a callback. + * - the callback mallocs the list but it is not free()'d + * + * so disable it for now... 
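+       * (libfuse 2.8+ provides fuse_req_getgroups(), which copies the
+       * supplementary gids into a caller-supplied buffer; calling that
+       * from the handlers that need them would avoid both problems.)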
+ getgroups_cb: getgroups_cb, + */ + }; + client->ll_register_callbacks(&args); return 0; } diff --git a/src/cls/CMakeLists.txt b/src/cls/CMakeLists.txt new file mode 100644 index 0000000000000..c6abc1f664be7 --- /dev/null +++ b/src/cls/CMakeLists.txt @@ -0,0 +1,109 @@ +## Rados object classes + +# cls_hello +add_library(cls_hello SHARED hello/cls_hello.cc) +set_target_properties(cls_hello PROPERTIES VERSION "1.0.0" SOVERSION "1") +install(TARGETS cls_hello DESTINATION lib/rados-classes) + +# cls_rbd +if (WITH_RBD) + add_library(cls_rbd SHARED rbd/cls_rbd.cc) + set_target_properties(cls_rbd PROPERTIES VERSION "1.0.0" SOVERSION "1") + install(TARGETS cls_rbd DESTINATION lib/rados-classes) + + add_library(cls_rbd_client rbd/cls_rbd_client.cc) +endif (WITH_RBD) + +# cls_lock +add_library(cls_lock SHARED lock/cls_lock.cc) +set_target_properties(cls_lock PROPERTIES VERSION "1.0.0" SOVERSION "1") +install(TARGETS cls_lock DESTINATION lib/rados-classes) + +add_library(cls_lock_client + lock/cls_lock_client.cc + lock/cls_lock_types.cc + lock/cls_lock_ops.cc) + +# cls_refcount +add_library(cls_refcount SHARED + refcount/cls_refcount.cc + refcount/cls_refcount_ops.cc + ${CMAKE_SOURCE_DIR}/src/common/ceph_json.cc) +target_link_libraries(cls_refcount json_spirit) +set_target_properties(cls_refcount PROPERTIES VERSION "1.0.0" SOVERSION "1") +install(TARGETS cls_refcount DESTINATION lib/rados-classes) + +add_library(cls_refcount_client + refcount/cls_refcount_client.cc + refcount/cls_refcount_ops.cc) + +# cls_version +add_library(cls_version SHARED version/cls_version.cc) +set_target_properties(cls_version PROPERTIES VERSION "1.0.0" SOVERSION "1") +install(TARGETS cls_version DESTINATION lib/rados-classes) + +add_library(cls_version_client + version/cls_version_client.cc + version/cls_version_types.cc) + +# cls_log +add_library(cls_log SHARED log/cls_log.cc) +set_target_properties(cls_log PROPERTIES VERSION "1.0.0" SOVERSION "1") +install(TARGETS cls_log DESTINATION lib/rados-classes) + +add_library(cls_log_client log/cls_log_client.cc) + +# cls_statelog +add_library(cls_statelog SHARED statelog/cls_statelog.cc) +set_target_properties(cls_statelog PROPERTIES VERSION "1.0.0" SOVERSION "1") +install(TARGETS cls_statelog DESTINATION lib/rados-classes) + +add_library(cls_statelog_client statelog/cls_statelog_client.cc) + +# cls_replica_log +add_library(cls_replica_log SHARED replica_log/cls_replica_log.cc) +set_target_properties(cls_replica_log PROPERTIES VERSION "1.0.0" SOVERSION "1") +install(TARGETS cls_replica_log DESTINATION lib/rados-classes) + +add_library(cls_replica_log_client + replica_log/cls_replica_log_types.cc + replica_log/cls_replica_log_ops.cc + replica_log/cls_replica_log_client.cc) + +# cls_user +add_library(cls_user SHARED user/cls_user.cc) +set_target_properties(cls_user PROPERTIES VERSION "1.0.0" SOVERSION "1") +install(TARGETS cls_user DESTINATION lib/rados-classes) + +add_library(cls_user_client + user/cls_user_client.cc + user/cls_user_types.cc + user/cls_user_ops.cc) + +# cls_rgw +if (WITH_RADOSGW) + add_library(cls_rgw SHARED + rgw/cls_rgw.cc + rgw/cls_rgw_ops.cc + rgw/cls_rgw_types.cc + ${CMAKE_SOURCE_DIR}/src/common/ceph_json.cc) + target_link_libraries(cls_rgw json_spirit) + set_target_properties(cls_rgw PROPERTIES VERSION "1.0.0" SOVERSION "1") + install(TARGETS cls_rgw DESTINATION lib/rados-classes) + + add_library(cls_rgw_client + rgw/cls_rgw_client.cc + rgw/cls_rgw_types.cc + rgw/cls_rgw_ops.cc) +endif (WITH_RADOSGW) + +# cls_cephfs +if (WITH_CEPHFS) + 
add_library(cls_cephfs SHARED + cephfs/cls_cephfs.cc) + set_target_properties(cls_cephfs PROPERTIES VERSION "1.0.0" SOVERSION "1") + install(TARGETS cls_cephfs DESTINATION lib/rados-classes) + + add_library(cls_cephfs_client + cephfs/cls_cephfs_client.cc) +endif (WITH_CEPHFS) diff --git a/src/cls/Makefile-client.am b/src/cls/Makefile-client.am new file mode 100644 index 0000000000000..aa4a4e6054b6d --- /dev/null +++ b/src/cls/Makefile-client.am @@ -0,0 +1,81 @@ +## Rados object client classes + +libcls_lock_client_la_SOURCES = \ + cls/lock/cls_lock_client.cc \ + cls/lock/cls_lock_types.cc \ + cls/lock/cls_lock_ops.cc +noinst_LTLIBRARIES += libcls_lock_client.la +DENCODER_DEPS += libcls_lock_client.la + +libcls_refcount_client_la_SOURCES = \ + cls/refcount/cls_refcount_client.cc \ + cls/refcount/cls_refcount_ops.cc +noinst_LTLIBRARIES += libcls_refcount_client.la +DENCODER_DEPS += libcls_refcount_client.la + +libcls_version_client_a_SOURCES = \ + cls/version/cls_version_client.cc \ + cls/version/cls_version_types.cc +noinst_LIBRARIES += libcls_version_client.a + +libcls_log_client_a_SOURCES = cls/log/cls_log_client.cc +noinst_LIBRARIES += libcls_log_client.a + +libcls_statelog_client_a_SOURCES = cls/statelog/cls_statelog_client.cc +noinst_LIBRARIES += libcls_statelog_client.a + +libcls_replica_log_client_a_SOURCES = \ + cls/replica_log/cls_replica_log_types.cc \ + cls/replica_log/cls_replica_log_ops.cc \ + cls/replica_log/cls_replica_log_client.cc +noinst_LIBRARIES += libcls_replica_log_client.a +DENCODER_DEPS += libcls_replica_log_client.a + +libcls_rgw_client_la_SOURCES = \ + cls/rgw/cls_rgw_client.cc \ + cls/rgw/cls_rgw_types.cc \ + cls/rgw/cls_rgw_ops.cc +noinst_LTLIBRARIES += libcls_rgw_client.la +DENCODER_DEPS += libcls_rgw_client.la + +libcls_rbd_client_la_SOURCES = cls/rbd/cls_rbd_client.cc +noinst_LTLIBRARIES += libcls_rbd_client.la + +libcls_user_client_a_SOURCES = cls/user/cls_user_client.cc \ + cls/user/cls_user_types.cc \ + cls/user/cls_user_ops.cc +DENCODER_DEPS += libcls_user_client.a + +noinst_LIBRARIES += libcls_user_client.a + +libcls_cephfs_client_la_SOURCES = cls/cephfs/cls_cephfs_client.cc +noinst_LTLIBRARIES += libcls_cephfs_client.la + +noinst_HEADERS += \ + cls/lock/cls_lock_types.h \ + cls/lock/cls_lock_ops.h \ + cls/lock/cls_lock_client.h \ + cls/rbd/cls_rbd.h \ + cls/rbd/cls_rbd_client.h \ + cls/refcount/cls_refcount_ops.h \ + cls/refcount/cls_refcount_client.h \ + cls/version/cls_version_types.h \ + cls/version/cls_version_ops.h \ + cls/version/cls_version_client.h \ + cls/log/cls_log_types.h \ + cls/log/cls_log_ops.h \ + cls/log/cls_log_client.h \ + cls/statelog/cls_statelog_types.h \ + cls/statelog/cls_statelog_ops.h \ + cls/statelog/cls_statelog_client.h \ + cls/replica_log/cls_replica_log_types.h \ + cls/replica_log/cls_replica_log_ops.h \ + cls/replica_log/cls_replica_log_client.h \ + cls/rgw/cls_rgw_client.h \ + cls/rgw/cls_rgw_ops.h \ + cls/rgw/cls_rgw_types.h \ + cls/user/cls_user_client.h \ + cls/user/cls_user_ops.h \ + cls/user/cls_user_types.h \ + cls/cephfs/cls_cephfs.h \ + cls/cephfs/cls_cephfs_client.h diff --git a/src/cls/Makefile-server.am b/src/cls/Makefile-server.am new file mode 100644 index 0000000000000..7af69ba18dbc0 --- /dev/null +++ b/src/cls/Makefile-server.am @@ -0,0 +1,65 @@ +## Rados object classes + +if WITH_OSD +libcls_hello_la_SOURCES = cls/hello/cls_hello.cc +libcls_hello_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS) +libcls_hello_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*' 
+radoslib_LTLIBRARIES += libcls_hello.la + +libcls_rbd_la_SOURCES = cls/rbd/cls_rbd.cc +libcls_rbd_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS) +libcls_rbd_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*' +radoslib_LTLIBRARIES += libcls_rbd.la + +libcls_lock_la_SOURCES = cls/lock/cls_lock.cc +libcls_lock_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS) +libcls_lock_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*' +radoslib_LTLIBRARIES += libcls_lock.la + +libcls_refcount_la_SOURCES = \ + cls/refcount/cls_refcount.cc \ + cls/refcount/cls_refcount_ops.cc \ + common/ceph_json.cc +libcls_refcount_la_LIBADD = libjson_spirit.la $(PTHREAD_LIBS) $(EXTRALIBS) +libcls_refcount_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*' +radoslib_LTLIBRARIES += libcls_refcount.la + +libcls_version_la_SOURCES = cls/version/cls_version.cc +libcls_version_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS) +libcls_version_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*' +radoslib_LTLIBRARIES += libcls_version.la + +libcls_log_la_SOURCES = cls/log/cls_log.cc +libcls_log_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS) +libcls_log_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*' +radoslib_LTLIBRARIES += libcls_log.la + +libcls_statelog_la_SOURCES = cls/statelog/cls_statelog.cc +libcls_statelog_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS) +libcls_statelog_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*' +radoslib_LTLIBRARIES += libcls_statelog.la + +libcls_replica_log_la_SOURCES = cls/replica_log/cls_replica_log.cc +libcls_replica_log_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS) +libcls_replica_log_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*' +radoslib_LTLIBRARIES += libcls_replica_log.la + +libcls_user_la_SOURCES = cls/user/cls_user.cc +libcls_user_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS) +libcls_user_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0 -export-symbols-regex '.*__cls_.*' +radoslib_LTLIBRARIES += libcls_user.la + +libcls_rgw_la_SOURCES = \ + cls/rgw/cls_rgw.cc \ + cls/rgw/cls_rgw_ops.cc \ + cls/rgw/cls_rgw_types.cc \ + common/ceph_json.cc +libcls_rgw_la_LIBADD = libjson_spirit.la $(PTHREAD_LIBS) $(EXTRALIBS) +libcls_rgw_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*' +radoslib_LTLIBRARIES += libcls_rgw.la + +libcls_cephfs_la_SOURCES = cls/cephfs/cls_cephfs.cc +libcls_cephfs_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS) +libcls_cephfs_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*' +radoslib_LTLIBRARIES += libcls_cephfs.la +endif # WITH_OSD diff --git a/src/cls/Makefile.am b/src/cls/Makefile.am index ea44fe7671fa3..bac67e902e028 100644 --- a/src/cls/Makefile.am +++ b/src/cls/Makefile.am @@ -1,136 +1,7 @@ -## Rados object classes - -libcls_hello_la_SOURCES = cls/hello/cls_hello.cc -libcls_hello_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS) -libcls_hello_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*' -radoslib_LTLIBRARIES += libcls_hello.la - -libcls_rbd_la_SOURCES = cls/rbd/cls_rbd.cc -libcls_rbd_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS) -libcls_rbd_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*' -radoslib_LTLIBRARIES += libcls_rbd.la - -libcls_lock_la_SOURCES = 
cls/lock/cls_lock.cc -libcls_lock_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS) -libcls_lock_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*' -radoslib_LTLIBRARIES += libcls_lock.la - -libcls_refcount_la_SOURCES = \ - cls/refcount/cls_refcount.cc \ - cls/refcount/cls_refcount_ops.cc \ - common/ceph_json.cc -libcls_refcount_la_LIBADD = libjson_spirit.la $(PTHREAD_LIBS) $(EXTRALIBS) -libcls_refcount_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*' -radoslib_LTLIBRARIES += libcls_refcount.la - -libcls_version_la_SOURCES = cls/version/cls_version.cc -libcls_version_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS) -libcls_version_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*' -radoslib_LTLIBRARIES += libcls_version.la - -libcls_log_la_SOURCES = cls/log/cls_log.cc -libcls_log_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS) -libcls_log_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*' -radoslib_LTLIBRARIES += libcls_log.la - -libcls_statelog_la_SOURCES = cls/statelog/cls_statelog.cc -libcls_statelog_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS) -libcls_statelog_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*' -radoslib_LTLIBRARIES += libcls_statelog.la - -libcls_replica_log_la_SOURCES = cls/replica_log/cls_replica_log.cc -libcls_replica_log_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS) -libcls_replica_log_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*' -radoslib_LTLIBRARIES += libcls_replica_log.la - -libcls_user_la_SOURCES = cls/user/cls_user.cc -libcls_user_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS) -libcls_user_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0 -export-symbols-regex '.*__cls_.*' -radoslib_LTLIBRARIES += libcls_user.la - -libcls_rgw_la_SOURCES = \ - cls/rgw/cls_rgw.cc \ - cls/rgw/cls_rgw_ops.cc \ - cls/rgw/cls_rgw_types.cc \ - common/ceph_json.cc -libcls_rgw_la_LIBADD = libjson_spirit.la $(PTHREAD_LIBS) $(EXTRALIBS) -libcls_rgw_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*' -radoslib_LTLIBRARIES += libcls_rgw.la - -## Rados object client classes - -libcls_lock_client_la_SOURCES = \ - cls/lock/cls_lock_client.cc \ - cls/lock/cls_lock_types.cc \ - cls/lock/cls_lock_ops.cc -noinst_LTLIBRARIES += libcls_lock_client.la -DENCODER_DEPS += libcls_lock_client.la - -libcls_refcount_client_la_SOURCES = \ - cls/refcount/cls_refcount_client.cc \ - cls/refcount/cls_refcount_ops.cc -noinst_LTLIBRARIES += libcls_refcount_client.la -DENCODER_DEPS += libcls_refcount_client.la - -libcls_version_client_a_SOURCES = \ - cls/version/cls_version_client.cc \ - cls/version/cls_version_types.cc -noinst_LIBRARIES += libcls_version_client.a - -libcls_log_client_a_SOURCES = cls/log/cls_log_client.cc -noinst_LIBRARIES += libcls_log_client.a - -libcls_statelog_client_a_SOURCES = cls/statelog/cls_statelog_client.cc -noinst_LIBRARIES += libcls_statelog_client.a - -libcls_replica_log_client_a_SOURCES = \ - cls/replica_log/cls_replica_log_types.cc \ - cls/replica_log/cls_replica_log_ops.cc \ - cls/replica_log/cls_replica_log_client.cc -noinst_LIBRARIES += libcls_replica_log_client.a -DENCODER_DEPS += libcls_replica_log_client.a - -libcls_rgw_client_la_SOURCES = \ - cls/rgw/cls_rgw_client.cc \ - cls/rgw/cls_rgw_types.cc \ - cls/rgw/cls_rgw_ops.cc -noinst_LTLIBRARIES += libcls_rgw_client.la -DENCODER_DEPS += libcls_rgw_client.la - 
-libcls_rbd_client_la_SOURCES = cls/rbd/cls_rbd_client.cc
-noinst_LTLIBRARIES += libcls_rbd_client.la
-
-libcls_user_client_a_SOURCES = cls/user/cls_user_client.cc \
-	cls/user/cls_user_types.cc \
-	cls/user/cls_user_ops.cc
-DENCODER_DEPS += libcls_user_client.a
-
-noinst_LIBRARIES += libcls_user_client.a
-
-noinst_HEADERS += \
-	cls/lock/cls_lock_types.h \
-	cls/lock/cls_lock_ops.h \
-	cls/lock/cls_lock_client.h \
-	cls/rbd/cls_rbd.h \
-	cls/rbd/cls_rbd_client.h \
-	cls/refcount/cls_refcount_ops.h \
-	cls/refcount/cls_refcount_client.h \
-	cls/version/cls_version_types.h \
-	cls/version/cls_version_ops.h \
-	cls/version/cls_version_client.h \
-	cls/log/cls_log_types.h \
-	cls/log/cls_log_ops.h \
-	cls/log/cls_log_client.h \
-	cls/statelog/cls_statelog_types.h \
-	cls/statelog/cls_statelog_ops.h \
-	cls/statelog/cls_statelog_client.h \
-	cls/replica_log/cls_replica_log_types.h \
-	cls/replica_log/cls_replica_log_ops.h \
-	cls/replica_log/cls_replica_log_client.h \
-	cls/rgw/cls_rgw_client.h \
-	cls/rgw/cls_rgw_ops.h \
-	cls/rgw/cls_rgw_types.h \
-	cls/user/cls_user_client.h \
-	cls/user/cls_user_ops.h \
-	cls/user/cls_user_types.h
+if ENABLE_CLIENT
+include cls/Makefile-client.am
+endif
+if ENABLE_SERVER
+include cls/Makefile-server.am
+endif
diff --git a/src/cls/cephfs/cls_cephfs.cc b/src/cls/cephfs/cls_cephfs.cc
new file mode 100644
index 0000000000000..f58f0de39afa2
--- /dev/null
+++ b/src/cls/cephfs/cls_cephfs.cc
@@ -0,0 +1,143 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+
+#include <string>
+#include <errno.h>
+#include <ostream>
+// (editor's note: the three angle-bracket includes above were lost to HTML
+// extraction; they are reconstructed from usage -- std::string, the
+// ENOENT/EINVAL errno constants, and std::ostream.)
+
+#include "objclass/objclass.h"
+
+#include "cls_cephfs.h"
+
+CLS_VER(1,0)
+CLS_NAME(cephfs_size_scan)
+
+cls_handle_t h_class;
+cls_method_handle_t h_accumulate_inode_metadata;
+
+
+
+std::ostream &operator<<(std::ostream &out, ObjCeiling &in)
+{
+  out << "id: " << in.id << " size: " << in.size;
+  return out;
+}
+
+
+/**
+ * Set a named xattr to a given value, if and only if the xattr
+ * is not already set to a greater value.
+ *
+ * If the xattr is missing, then it is set to the input integer.
+ *
+ * @param xattr_name: name of xattr to compare against and set
+ * @param input_val: candidate new value, of ::encode()'able type
+ * @returns 0 on success (irrespective of whether our new value
+ *          was used) else an error code
+ */
+template <typename A>
+static int set_if_greater(cls_method_context_t hctx,
+    const std::string &xattr_name, const A input_val)
+{
+  bufferlist existing_val_bl;
+
+  bool set_val = false;
+  int r = cls_cxx_getxattr(hctx, xattr_name.c_str(), &existing_val_bl);
+  if (r == -ENOENT || existing_val_bl.length() == 0) {
+    set_val = true;
+  } else if (r >= 0) {
+    bufferlist::iterator existing_p = existing_val_bl.begin();
+    try {
+      A existing_val;
+      ::decode(existing_val, existing_p);
+      if (!existing_p.end()) {
+        // Trailing junk?
Consider it invalid and overwrite + set_val = true; + } else { + // Valid existing value, do comparison + set_val = input_val > existing_val; + } + } catch (const buffer::error &err) { + // Corrupt or empty existing value, overwrite it + set_val = true; + } + } else { + return r; + } + + // Conditionally set the new xattr + if (set_val) { + bufferlist set_bl; + ::encode(input_val, set_bl); + return cls_cxx_setxattr(hctx, xattr_name.c_str(), &set_bl); + } else { + return 0; + } +} + +static int accumulate_inode_metadata(cls_method_context_t hctx, + bufferlist *in, bufferlist *out) +{ + assert(in != NULL); + assert(out != NULL); + + int r = 0; + + // Decode `in` + bufferlist::iterator q = in->begin(); + AccumulateArgs args; + try { + args.decode(q); + } catch (const buffer::error &err) { + return -EINVAL; + } + + ObjCeiling ceiling(args.obj_index, args.obj_size); + r = set_if_greater(hctx, args.obj_xattr_name, ceiling); + if (r < 0) { + return r; + } + + r = set_if_greater(hctx, args.mtime_xattr_name, args.mtime); + if (r < 0) { + return r; + } + + r = set_if_greater(hctx, args.obj_size_xattr_name, args.obj_size); + if (r < 0) { + return r; + } + + return 0; +} + +/** + * initialize class + * + * We do two things here: we register the new class, and then register + * all of the class's methods. + */ +void __cls_init() +{ + // this log message, at level 0, will always appear in the ceph-osd + // log file. + CLS_LOG(0, "loading cephfs_size_scan"); + + cls_register("cephfs", &h_class); + cls_register_cxx_method(h_class, "accumulate_inode_metadata", + CLS_METHOD_WR | CLS_METHOD_RD, + accumulate_inode_metadata, &h_accumulate_inode_metadata); +} + diff --git a/src/cls/cephfs/cls_cephfs.h b/src/cls/cephfs/cls_cephfs.h new file mode 100644 index 0000000000000..d4a5f23811dfc --- /dev/null +++ b/src/cls/cephfs/cls_cephfs.h @@ -0,0 +1,127 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include "include/encoding.h" + +/** + * Value class for the xattr we'll use to accumulate + * the highest object seen for a given inode + */ +class ObjCeiling { + public: + uint64_t id; + uint64_t size; + + ObjCeiling() + : id(0), size(0) + {} + + ObjCeiling(uint64_t id_, uint64_t size_) + : id(id_), size(size_) + {} + + bool operator >(ObjCeiling const &rhs) const + { + return id > rhs.id; + } + + void encode(bufferlist &bl) const + { + ENCODE_START(1, 1, bl); + ::encode(id, bl); + ::encode(size, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::iterator &p) + { + DECODE_START(1, p); + ::decode(id, p); + ::decode(size, p); + DECODE_FINISH(p); + } +}; +WRITE_CLASS_ENCODER(ObjCeiling) + +class AccumulateArgs +{ +public: + uint64_t obj_index; + uint64_t obj_size; + int64_t mtime; + std::string obj_xattr_name; + std::string mtime_xattr_name; + std::string obj_size_xattr_name; + + AccumulateArgs( + uint64_t obj_index_, + uint64_t obj_size_, + time_t mtime_, + std::string obj_xattr_name_, + std::string mtime_xattr_name_, + std::string obj_size_xattr_name_) + : obj_index(obj_index_), + obj_size(obj_size_), + mtime(mtime_), + obj_xattr_name(obj_xattr_name_), + mtime_xattr_name(mtime_xattr_name_), + obj_size_xattr_name(obj_size_xattr_name_) + {} + + AccumulateArgs() + : obj_index(0), obj_size(0), mtime(0) + {} + + void encode(bufferlist &bl) const + { + ENCODE_START(1, 1, bl); + ::encode(obj_xattr_name, bl); + ::encode(mtime_xattr_name, bl); + ::encode(obj_size_xattr_name, bl); + ::encode(obj_index, bl); + ::encode(obj_size, bl); + ::encode(mtime, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::iterator &bl) + { + DECODE_START(1, bl); + ::decode(obj_xattr_name, bl); + ::decode(mtime_xattr_name, bl); + ::decode(obj_size_xattr_name, bl); + ::decode(obj_index, bl); + ::decode(obj_size, bl); + ::decode(mtime, bl); + DECODE_FINISH(bl); + } +}; + +class AccumulateResult +{ +public: + // Index of the highest-indexed object seen + uint64_t ceiling_obj_index; + // Size of the highest-index object seen + uint64_t ceiling_obj_size; + // Largest object seen + uint64_t max_obj_size; + // Highest mtime seen + int64_t max_mtime; + + AccumulateResult() + : ceiling_obj_index(0), ceiling_obj_size(0), max_obj_size(0), max_mtime(0) + {} +}; + diff --git a/src/cls/cephfs/cls_cephfs_client.cc b/src/cls/cephfs/cls_cephfs_client.cc new file mode 100644 index 0000000000000..d135922699e2b --- /dev/null +++ b/src/cls/cephfs/cls_cephfs_client.cc @@ -0,0 +1,131 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#include "cls_cephfs_client.h" + +#include "mds/CInode.h" + +#define XATTR_CEILING "scan_ceiling" +#define XATTR_MAX_MTIME "scan_max_mtime" +#define XATTR_MAX_SIZE "scan_max_size" + +int ClsCephFSClient::accumulate_inode_metadata( + librados::IoCtx &ctx, + inodeno_t inode_no, + const uint64_t obj_index, + const uint64_t obj_size, + const time_t mtime) +{ + AccumulateArgs args( + obj_index, + obj_size, + mtime, + XATTR_CEILING, + XATTR_MAX_MTIME, + XATTR_MAX_SIZE); + + // Generate 0th object name, where we will accumulate sizes/mtimes + object_t zeroth_object = InodeStore::get_object_name(inode_no, frag_t(), ""); + + // Construct a librados operation invoking our class method + librados::ObjectReadOperation op; + bufferlist inbl; + args.encode(inbl); + op.exec("cephfs", "accumulate_inode_metadata", inbl); + + // Execute op + bufferlist outbl; + return ctx.operate(zeroth_object.name, &op, &outbl); +} + +int ClsCephFSClient::fetch_inode_accumulate_result( + librados::IoCtx &ctx, + const std::string &oid, + inode_backtrace_t *backtrace, + AccumulateResult *result) +{ + assert(backtrace != NULL); + assert(result != NULL); + + librados::ObjectReadOperation op; + + int scan_ceiling_r = 0; + bufferlist scan_ceiling_bl; + op.getxattr(XATTR_CEILING, &scan_ceiling_bl, &scan_ceiling_r); + + int scan_max_size_r = 0; + bufferlist scan_max_size_bl; + op.getxattr(XATTR_MAX_SIZE, &scan_max_size_bl, &scan_max_size_r); + + int scan_max_mtime_r = 0; + bufferlist scan_max_mtime_bl; + op.getxattr(XATTR_MAX_MTIME, &scan_max_mtime_bl, &scan_max_mtime_r); + + int parent_r = 0; + bufferlist parent_bl; + op.getxattr("parent", &parent_bl, &parent_r); + + bufferlist op_bl; + int r = ctx.operate(oid, &op, &op_bl); + if (r < 0 && r != -ENODATA) { + // ENODATA acceptable from parent getxattr (just means there happens + // not to be a backtrace) + return r; + } + + // Load scan_ceiling + try { + bufferlist::iterator scan_ceiling_bl_iter = scan_ceiling_bl.begin(); + ObjCeiling ceiling; + ceiling.decode(scan_ceiling_bl_iter); + result->ceiling_obj_index = ceiling.id; + result->ceiling_obj_size = ceiling.size; + } catch (const buffer::error &err) { + //dout(4) << "Invalid size attr on '" << oid << "'" << dendl; + return -EINVAL; + } + + // Load scan_max_size + try { + bufferlist::iterator scan_max_size_bl_iter = scan_max_size_bl.begin(); + ::decode(result->max_obj_size, scan_max_size_bl_iter); + } catch (const buffer::error &err) { + //dout(4) << "Invalid size attr on '" << oid << "'" << dendl; + return -EINVAL; + } + + // Load scan_max_mtime + try { + bufferlist::iterator scan_max_mtime_bl_iter = scan_max_mtime_bl.begin(); + ::decode(result->max_mtime, scan_max_mtime_bl_iter); + } catch (const buffer::error &err) { + //dout(4) << "Invalid size attr on '" << oid << "'" << dendl; + return -EINVAL; + } + + // Deserialize backtrace + if (parent_bl.length()) { + try { + bufferlist::iterator q = parent_bl.begin(); + backtrace->decode(q); + } catch (buffer::error &e) { + //dout(4) << "Corrupt backtrace on '" << oid << "': " << e << dendl; + return -EINVAL; + } + } + + return 0; +} + diff --git a/src/cls/cephfs/cls_cephfs_client.h b/src/cls/cephfs/cls_cephfs_client.h new file mode 100644 index 0000000000000..5448a31505b4b --- /dev/null +++ b/src/cls/cephfs/cls_cephfs_client.h @@ -0,0 +1,25 @@ + +#include "include/rados/librados.hpp" +#include "mds/mdstypes.h" + +#include "cls_cephfs.h" + +class AccumulateArgs; + +class ClsCephFSClient +{ + public: + static int accumulate_inode_metadata( + librados::IoCtx &ctx, + 
inodeno_t inode_no, + const uint64_t obj_index, + const uint64_t obj_size, + const time_t mtime); + + static int fetch_inode_accumulate_result( + librados::IoCtx &ctx, + const std::string &oid, + inode_backtrace_t *backtrace, + AccumulateResult *result); +}; + diff --git a/src/cls/lock/cls_lock.cc b/src/cls/lock/cls_lock.cc index b4772e0d08f81..cefb870d173b4 100644 --- a/src/cls/lock/cls_lock.cc +++ b/src/cls/lock/cls_lock.cc @@ -42,6 +42,7 @@ cls_method_handle_t h_unlock_op; cls_method_handle_t h_break_lock; cls_method_handle_t h_get_info; cls_method_handle_t h_list_locks; +cls_method_handle_t h_assert_locked; #define LOCK_PREFIX "lock." @@ -438,6 +439,78 @@ static int list_locks(cls_method_context_t hctx, bufferlist *in, bufferlist *out return 0; } +/** + * Assert that the object is currently locked + * + * Input: + * @param cls_lock_assert_op request input + * + * Output: + * @param none + * + * @return 0 on success, -errno on failure. + */ +int assert_locked(cls_method_context_t hctx, bufferlist *in, bufferlist *out) +{ + CLS_LOG(20, "assert_locked"); + + cls_lock_assert_op op; + try { + bufferlist::iterator iter = in->begin(); + ::decode(op, iter); + } catch (const buffer::error& err) { + return -EINVAL; + } + + if (op.type != LOCK_EXCLUSIVE && op.type != LOCK_SHARED) { + return -EINVAL; + } + + if (op.name.empty()) { + return -EINVAL; + } + + // see if there's already a locker + lock_info_t linfo; + int r = read_lock(hctx, op.name, &linfo); + if (r < 0) { + CLS_ERR("Could not read lock info: %s", cpp_strerror(r).c_str()); + return r; + } + + if (linfo.lockers.empty()) { + CLS_LOG(20, "object not locked"); + return -EBUSY; + } + + if (linfo.lock_type != op.type) { + CLS_LOG(20, "lock type mismatch: current=%s, assert=%s", + cls_lock_type_str(linfo.lock_type), cls_lock_type_str(op.type)); + return -EBUSY; + } + + if (linfo.tag != op.tag) { + CLS_LOG(20, "lock tag mismatch: current=%s, assert=%s", linfo.tag.c_str(), + op.tag.c_str()); + return -EBUSY; + } + + entity_inst_t inst; + r = cls_get_request_origin(hctx, &inst); + assert(r == 0); + + locker_id_t id; + id.cookie = op.cookie; + id.locker = inst.name; + + map::iterator iter = linfo.lockers.find(id); + if (iter == linfo.lockers.end()) { + CLS_LOG(20, "not locked by assert client"); + return -EBUSY; + } + return 0; +} + void __cls_init() { CLS_LOG(20, "Loaded lock class!"); @@ -458,6 +531,9 @@ void __cls_init() cls_register_cxx_method(h_class, "list_locks", CLS_METHOD_RD, list_locks, &h_list_locks); + cls_register_cxx_method(h_class, "assert_locked", + CLS_METHOD_RD, + assert_locked, &h_assert_locked); return; } diff --git a/src/cls/lock/cls_lock_client.cc b/src/cls/lock/cls_lock_client.cc index 54af41cd04926..30466fbd7e5ba 100644 --- a/src/cls/lock/cls_lock_client.cc +++ b/src/cls/lock/cls_lock_client.cc @@ -175,6 +175,30 @@ namespace rados { return get_lock_info_finish(&it, lockers, type, tag); } + void assert_locked(librados::ObjectOperation *rados_op, + const std::string& name, ClsLockType type, + const std::string& cookie, const std::string& tag) + { + cls_lock_assert_op op; + op.name = name; + op.type = type; + op.cookie = cookie; + op.tag = tag; + bufferlist in; + ::encode(op, in); + rados_op->exec("lock", "assert_locked", in); + } + + void Lock::assert_locked_exclusive(ObjectOperation *op) + { + assert_locked(op, name, LOCK_EXCLUSIVE, cookie, tag); + } + + void Lock::assert_locked_shared(ObjectOperation *op) + { + assert_locked(op, name, LOCK_SHARED, cookie, tag); + } + void Lock::lock_shared(ObjectWriteOperation *op) { 
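    // NB: unlike the assert_locked_* helpers added above -- read-only checks
    // taking a plain ObjectOperation -- lock_shared() mutates lock state and
    // therefore requires an ObjectWriteOperation. (editor's gloss)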
lock(op, name, LOCK_SHARED, diff --git a/src/cls/lock/cls_lock_client.h b/src/cls/lock/cls_lock_client.h index 4e2144c79b4ca..b60d25e098a9f 100644 --- a/src/cls/lock/cls_lock_client.h +++ b/src/cls/lock/cls_lock_client.h @@ -14,7 +14,6 @@ namespace rados { namespace cls { namespace lock { - extern void lock(librados::ObjectWriteOperation *rados_op, const std::string& name, ClsLockType type, const std::string& cookie, const std::string& tag, @@ -55,6 +54,11 @@ namespace rados { map *lockers, ClsLockType *type, std::string *tag); + extern void assert_locked(librados::ObjectOperation *rados_op, + const std::string& name, ClsLockType type, + const std::string& cookie, + const std::string& tag); + class Lock { std::string name; std::string cookie; @@ -79,6 +83,9 @@ namespace rados { } } + void assert_locked_exclusive(librados::ObjectOperation *rados_op); + void assert_locked_shared(librados::ObjectOperation *rados_op); + /* ObjectWriteOperation */ void lock_exclusive(librados::ObjectWriteOperation *ioctx); void lock_shared(librados::ObjectWriteOperation *ioctx); diff --git a/src/cls/lock/cls_lock_ops.cc b/src/cls/lock/cls_lock_ops.cc index 557e7d4d5d069..7de832623d268 100644 --- a/src/cls/lock/cls_lock_ops.cc +++ b/src/cls/lock/cls_lock_ops.cc @@ -169,4 +169,22 @@ void cls_lock_list_locks_reply::generate_test_instances(listdump_string("name", name); + f->dump_string("type", cls_lock_type_str(type)); + f->dump_string("cookie", cookie); + f->dump_string("tag", tag); +} + +void cls_lock_assert_op::generate_test_instances(list& o) +{ + cls_lock_assert_op *i = new cls_lock_assert_op; + i->name = "name"; + i->type = LOCK_SHARED; + i->cookie = "cookie"; + i->tag = "tag"; + o.push_back(i); + o.push_back(new cls_lock_assert_op); +} diff --git a/src/cls/lock/cls_lock_ops.h b/src/cls/lock/cls_lock_ops.h index 95f745e5c4bf2..d2076f1169a70 100644 --- a/src/cls/lock/cls_lock_ops.h +++ b/src/cls/lock/cls_lock_ops.h @@ -170,4 +170,37 @@ struct cls_lock_list_locks_reply }; WRITE_CLASS_ENCODER(cls_lock_list_locks_reply) +struct cls_lock_assert_op +{ + string name; + ClsLockType type; + string cookie; + string tag; + + cls_lock_assert_op() : type(LOCK_NONE) {} + + void encode(bufferlist &bl) const { + ENCODE_START(1, 1, bl); + ::encode(name, bl); + uint8_t t = (uint8_t)type; + ::encode(t, bl); + ::encode(cookie, bl); + ::encode(tag, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::iterator &bl) { + DECODE_START_LEGACY_COMPAT_LEN(1, 1, 1, bl); + ::decode(name, bl); + uint8_t t; + ::decode(t, bl); + type = (ClsLockType)t; + ::decode(cookie, bl); + ::decode(tag, bl); + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; + static void generate_test_instances(list& o); +}; +WRITE_CLASS_ENCODER(cls_lock_assert_op) + #endif diff --git a/src/cls/log/cls_log_ops.h b/src/cls/log/cls_log_ops.h index 6bcb69d1334f7..ad251fdcb8439 100644 --- a/src/cls/log/cls_log_ops.h +++ b/src/cls/log/cls_log_ops.h @@ -33,7 +33,7 @@ struct cls_log_list_op { int max_entries; /* upperbound to returned num of entries might return less than that and still be truncated */ - cls_log_list_op() {} + cls_log_list_op() : max_entries(0) {} void encode(bufferlist& bl) const { ENCODE_START(1, 1, bl); diff --git a/src/cls/rbd/cls_rbd.cc b/src/cls/rbd/cls_rbd.cc index 68d960627125f..74af0a27329ea 100644 --- a/src/cls/rbd/cls_rbd.cc +++ b/src/cls/rbd/cls_rbd.cc @@ -37,9 +37,11 @@ #include #include +#include "common/bit_vector.hpp" #include "common/errno.h" #include "objclass/objclass.h" #include "include/rbd_types.h" +#include 
"include/rbd/object_map_types.h" #include "cls/rbd/cls_rbd.h" @@ -63,6 +65,7 @@ CLS_NAME(rbd) cls_handle_t h_class; cls_method_handle_t h_create; cls_method_handle_t h_get_features; +cls_method_handle_t h_set_features; cls_method_handle_t h_get_size; cls_method_handle_t h_set_size; cls_method_handle_t h_get_parent; @@ -71,6 +74,8 @@ cls_method_handle_t h_get_protection_status; cls_method_handle_t h_set_protection_status; cls_method_handle_t h_get_stripe_unit_count; cls_method_handle_t h_set_stripe_unit_count; +cls_method_handle_t h_get_flags; +cls_method_handle_t h_set_flags; cls_method_handle_t h_remove_parent; cls_method_handle_t h_add_child; cls_method_handle_t h_remove_child; @@ -90,6 +95,16 @@ cls_method_handle_t h_dir_list; cls_method_handle_t h_dir_add_image; cls_method_handle_t h_dir_remove_image; cls_method_handle_t h_dir_rename_image; +cls_method_handle_t h_object_map_load; +cls_method_handle_t h_object_map_save; +cls_method_handle_t h_object_map_resize; +cls_method_handle_t h_object_map_update; +cls_method_handle_t h_object_map_snap_add; +cls_method_handle_t h_object_map_snap_remove; +cls_method_handle_t h_metadata_set; +cls_method_handle_t h_metadata_remove; +cls_method_handle_t h_metadata_list; +cls_method_handle_t h_metadata_get; cls_method_handle_t h_old_snapshots_list; cls_method_handle_t h_old_snapshot_add; cls_method_handle_t h_old_snapshot_remove; @@ -98,6 +113,7 @@ cls_method_handle_t h_old_snapshot_remove; #define RBD_SNAP_KEY_PREFIX "snapshot_" #define RBD_DIR_ID_KEY_PREFIX "id_" #define RBD_DIR_NAME_KEY_PREFIX "name_" +#define RBD_METADATA_KEY_PREFIX "metadata_" static int snap_read_header(cls_method_context_t hctx, bufferlist& bl) { @@ -273,46 +289,110 @@ int create(cls_method_context_t hctx, bufferlist *in, bufferlist *out) /** * Input: - * @param snap_id which snapshot to query, or CEPH_NOSNAP (uint64_t) + * @param snap_id which snapshot to query, or CEPH_NOSNAP (uint64_t) (deprecated) + * @param read_only true if the image will be used read-only (bool) * * Output: * @param features list of enabled features for the given snapshot (uint64_t) + * @param incompatible incompatible feature bits * @returns 0 on success, negative error code on failure */ int get_features(cls_method_context_t hctx, bufferlist *in, bufferlist *out) { - uint64_t features, snap_id; + uint64_t snap_id; + bool read_only = false; bufferlist::iterator iter = in->begin(); try { ::decode(snap_id, iter); + if (!iter.end()) { + ::decode(read_only, iter); + } } catch (const buffer::error &err) { return -EINVAL; } - CLS_LOG(20, "get_features snap_id=%llu", (unsigned long long)snap_id); + CLS_LOG(20, "get_features snap_id=%" PRIu64 ", read_only=%d", + snap_id, read_only); - if (snap_id == CEPH_NOSNAP) { - int r = read_key(hctx, "features", &features); - if (r < 0) { - CLS_ERR("failed to read features off disk: %s", cpp_strerror(r).c_str()); - return r; - } - } else { + // NOTE: keep this deprecated snapshot logic to support negative + // test cases in older (pre-Infernalis) releases. Remove once older + // releases are no longer supported. 
+ if (snap_id != CEPH_NOSNAP) { cls_rbd_snap snap; string snapshot_key; key_from_snap_id(snap_id, &snapshot_key); int r = read_key(hctx, snapshot_key, &snap); - if (r < 0) + if (r < 0) { return r; + } + } - features = snap.features; + uint64_t features; + int r = read_key(hctx, "features", &features); + if (r < 0) { + CLS_ERR("failed to read features off disk: %s", cpp_strerror(r).c_str()); + return r; } - uint64_t incompatible = features & RBD_FEATURES_INCOMPATIBLE; + uint64_t incompatible = (read_only ? features & RBD_FEATURES_INCOMPATIBLE : + features & RBD_FEATURES_RW_INCOMPATIBLE); ::encode(features, *out); ::encode(incompatible, *out); + return 0; +} +/** + * set the image features + * + * Input: + * @params features image features + * @params mask image feature mask + * + * Output: + * none + * + * @returns 0 on success, negative error code upon failure + */ +int set_features(cls_method_context_t hctx, bufferlist *in, bufferlist *out) +{ + uint64_t features; + uint64_t mask; + bufferlist::iterator iter = in->begin(); + try { + ::decode(features, iter); + ::decode(mask, iter); + } catch (const buffer::error &err) { + return -EINVAL; + } + + if ((mask & RBD_FEATURES_MUTABLE) != mask) { + CLS_ERR("Attempting to set immutable feature: %" PRIu64, + mask & ~RBD_FEATURES_MUTABLE); + return -EINVAL; + } + + // check that features exists to make sure this is a header object + // that was created correctly + uint64_t orig_features = 0; + int r = read_key(hctx, "features", &orig_features); + if (r < 0 && r != -ENOENT) { + CLS_ERR("Could not read image's features off disk: %s", + cpp_strerror(r).c_str()); + return r; + } + + features = (orig_features & ~mask) | (features & mask); + CLS_LOG(10, "set_features features=%" PRIu64 " orig_features=%" PRIu64, + features, orig_features); + + bufferlist bl; + ::encode(features, bl); + r = cls_cxx_map_set_val(hctx, "features", &bl); + if (r < 0) { + CLS_ERR("error updating features: %s", cpp_strerror(r).c_str()); + return r; + } return 0; } @@ -706,6 +786,127 @@ int set_stripe_unit_count(cls_method_context_t hctx, bufferlist *in, bufferlist return 0; } +/** + * get the image flags + * + * Input: + * @param snap_id which snapshot to query, to CEPH_NOSNAP (uint64_t) + * + * Output: + * @param flags image flags + * + * @returns 0 on success, negative error code upon failure + */ +int get_flags(cls_method_context_t hctx, bufferlist *in, bufferlist *out) +{ + uint64_t snap_id; + bufferlist::iterator iter = in->begin(); + try { + ::decode(snap_id, iter); + } catch (const buffer::error &err) { + return -EINVAL; + } + + CLS_LOG(20, "get_flags snap_id=%llu", (unsigned long long)snap_id); + + uint64_t flags = 0; + if (snap_id == CEPH_NOSNAP) { + int r = read_key(hctx, "flags", &flags); + if (r < 0 && r != -ENOENT) { + CLS_ERR("failed to read flags off disk: %s", cpp_strerror(r).c_str()); + return r; + } + } else { + cls_rbd_snap snap; + string snapshot_key; + key_from_snap_id(snap_id, &snapshot_key); + int r = read_key(hctx, snapshot_key, &snap); + if (r < 0) { + return r; + } + flags = snap.flags; + } + + ::encode(flags, *out); + return 0; +} + +/** + * set the image flags + * + * Input: + * @params flags image flags + * @params mask image flag mask + * @params snap_id which snapshot to update, or CEPH_NOSNAP (uint64_t) + * + * Output: + * none + * + * @returns 0 on success, negative error code upon failure + */ +int set_flags(cls_method_context_t hctx, bufferlist *in, bufferlist *out) +{ + uint64_t flags; + uint64_t mask; + uint64_t snap_id = CEPH_NOSNAP; + 
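+  // (editor's gloss) snap_id is optional in the encoded request; when absent
+  // the flags of the image HEAD are updated. The masking below mirrors
+  // set_features(): with illustrative values orig_flags=0b0111, flags=0b0001,
+  // mask=0b0011, (orig_flags & ~mask) | (flags & mask) = 0b0100 | 0b0001 =
+  // 0b0101, i.e. bits outside the mask are preserved.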
bufferlist::iterator iter = in->begin(); + try { + ::decode(flags, iter); + ::decode(mask, iter); + if (!iter.end()) { + ::decode(snap_id, iter); + } + } catch (const buffer::error &err) { + return -EINVAL; + } + + // check that size exists to make sure this is a header object + // that was created correctly + int r; + uint64_t orig_flags = 0; + cls_rbd_snap snap_meta; + string snap_meta_key; + if (snap_id == CEPH_NOSNAP) { + r = read_key(hctx, "flags", &orig_flags); + if (r < 0 && r != -ENOENT) { + CLS_ERR("Could not read image's flags off disk: %s", + cpp_strerror(r).c_str()); + return r; + } + } else { + key_from_snap_id(snap_id, &snap_meta_key); + r = read_key(hctx, snap_meta_key, &snap_meta); + if (r < 0) { + CLS_ERR("Could not read snapshot: snap_id=%" PRIu64 ": %s", + snap_id, cpp_strerror(r).c_str()); + return r; + } + orig_flags = snap_meta.flags; + } + + flags = (orig_flags & ~mask) | (flags & mask); + CLS_LOG(20, "set_flags snap_id=%" PRIu64 ", orig_flags=%" PRIu64 ", " + "new_flags=%" PRIu64 ", mask=%" PRIu64, snap_id, orig_flags, + flags, mask); + + if (snap_id == CEPH_NOSNAP) { + bufferlist bl; + ::encode(flags, bl); + r = cls_cxx_map_set_val(hctx, "flags", &bl); + } else { + snap_meta.flags = flags; + + bufferlist bl; + ::encode(snap_meta, bl); + r = cls_cxx_map_set_val(hctx, snap_meta_key, &bl); + } + + if (r < 0) { + CLS_ERR("error updating flags: %s", cpp_strerror(r).c_str()); + return r; + } + return 0; +} /** * get the current parent, if any @@ -862,6 +1063,58 @@ int remove_parent(cls_method_context_t hctx, bufferlist *in, bufferlist *out) if (r < 0) return r; + uint64_t features; + r = read_key(hctx, "features", &features); + if (r < 0) { + return r; + } + + // remove the parent from all snapshots + if ((features & RBD_FEATURE_DEEP_FLATTEN) != 0) { + int max_read = RBD_MAX_KEYS_READ; + vector snap_ids; + string last_read = RBD_SNAP_KEY_PREFIX; + + do { + set keys; + r = cls_cxx_map_get_keys(hctx, last_read, max_read, &keys); + if (r < 0) { + return r; + } + + for (std::set::const_iterator it = keys.begin(); + it != keys.end(); ++it) { + if ((*it).find(RBD_SNAP_KEY_PREFIX) != 0) { + break; + } + + uint64_t snap_id = snap_id_from_key(*it); + cls_rbd_snap snap_meta; + r = read_key(hctx, *it, &snap_meta); + if (r < 0) { + CLS_ERR("Could not read snapshot: snap_id=%" PRIu64 ": %s", + snap_id, cpp_strerror(r).c_str()); + return r; + } + + snap_meta.parent = cls_rbd_parent(); + + bufferlist bl; + ::encode(snap_meta, bl); + r = cls_cxx_map_set_val(hctx, *it, &bl); + if (r < 0) { + CLS_ERR("Could not update snapshot: snap_id=%" PRIu64 ": %s", + snap_id, cpp_strerror(r).c_str()); + return r; + } + } + + if (!keys.empty()) { + last_read = *(keys.rbegin()); + } + } while (r == max_read); + } + cls_rbd_parent parent; r = read_key(hctx, "parent", &parent); if (r < 0) @@ -872,7 +1125,6 @@ int remove_parent(cls_method_context_t hctx, bufferlist *in, bufferlist *out) CLS_ERR("error removing parent: %d", r); return r; } - return 0; } @@ -1242,6 +1494,11 @@ int snapshot_add(cls_method_context_t hctx, bufferlist *in, bufferlist *out) CLS_ERR("Could not read image's features off disk: %s", cpp_strerror(r).c_str()); return r; } + r = read_key(hctx, "flags", &snap_meta.flags); + if (r < 0 && r != -ENOENT) { + CLS_ERR("Could not read image's flags off disk: %s", cpp_strerror(r).c_str()); + return r; + } int max_read = RBD_MAX_KEYS_READ; string last_read = RBD_SNAP_KEY_PREFIX; @@ -1790,6 +2047,485 @@ int dir_remove_image(cls_method_context_t hctx, bufferlist *in, bufferlist *out) return 
dir_remove_image_helper(hctx, name, id); } +int object_map_read(cls_method_context_t hctx, BitVector<2> &object_map) +{ + uint64_t size; + int r = cls_cxx_stat(hctx, &size, NULL); + if (r < 0) { + return r; + } + if (size == 0) { + return -ENOENT; + } + + bufferlist bl; + r = cls_cxx_read(hctx, 0, size, &bl); + if (r < 0) { + return r; + } + + try { + bufferlist::iterator iter = bl.begin(); + ::decode(object_map, iter); + } catch (const buffer::error &err) { + CLS_ERR("failed to decode object map: %s", err.what()); + return -EINVAL; + } + return 0; +} + +/** + * Load an rbd image's object map + * + * Input: + * none + * + * Output: + * @param object map bit vector + * @returns 0 on success, negative error code on failure + */ +int object_map_load(cls_method_context_t hctx, bufferlist *in, bufferlist *out) +{ + BitVector<2> object_map; + int r = object_map_read(hctx, object_map); + if (r < 0) { + return r; + } + + object_map.set_crc_enabled(false); + ::encode(object_map, *out); + return 0; +} + +/** + * Save an rbd image's object map + * + * Input: + * @param object map bit vector + * + * Output: + * @returns 0 on success, negative error code on failure + */ +int object_map_save(cls_method_context_t hctx, bufferlist *in, bufferlist *out) +{ + BitVector<2> object_map; + try { + bufferlist::iterator iter = in->begin(); + ::decode(object_map, iter); + } catch (const buffer::error &err) { + return -EINVAL; + } + + bufferlist bl; + ::encode(object_map, bl); + CLS_LOG(20, "object_map_save: object size=%" PRIu64 ", byte size=%u", + object_map.size(), bl.length()); + return cls_cxx_write_full(hctx, &bl); +} + +/** + * Resize an rbd image's object map + * + * Input: + * @param object_count the max number of objects in the image + * @param default_state the default state of newly created objects + * + * Output: + * @returns 0 on success, negative error code on failure + */ +int object_map_resize(cls_method_context_t hctx, bufferlist *in, bufferlist *out) +{ + uint64_t object_count; + uint8_t default_state; + try { + bufferlist::iterator iter = in->begin(); + ::decode(object_count, iter); + ::decode(default_state, iter); + } catch (const buffer::error &err) { + return -EINVAL; + } + + BitVector<2> object_map; + int r = object_map_read(hctx, object_map); + if ((r < 0) && (r != -ENOENT)) { + return r; + } + + size_t orig_object_map_size = object_map.size(); + if (object_count < orig_object_map_size) { + for (uint64_t i = object_count + 1; i < orig_object_map_size; ++i) { + if (object_map[i] != default_state) { + CLS_ERR("object map indicates object still exists: %" PRIu64, i); + return -ESTALE; + } + } + object_map.resize(object_count); + } else if (object_count > orig_object_map_size) { + object_map.resize(object_count); + for (uint64_t i = orig_object_map_size; i < object_count; ++i) { + object_map[i] = default_state; + } + } + + bufferlist map; + ::encode(object_map, map); + CLS_LOG(20, "object_map_resize: object size=%" PRIu64 ", byte size=%u", + object_count, map.length()); + return cls_cxx_write_full(hctx, &map); +} + +/** + * Update an rbd image's object map + * + * Input: + * @param start_object_no the start object iterator + * @param end_object_no the end object iterator + * @param new_object_state the new object state + * @param current_object_state optional current object state filter + * + * Output: + * @returns 0 on success, negative error code on failure + */ +int object_map_update(cls_method_context_t hctx, bufferlist *in, bufferlist *out) +{ + uint64_t start_object_no; + uint64_t 
end_object_no; + uint8_t new_object_state; + boost::optional current_object_state; + try { + bufferlist::iterator iter = in->begin(); + ::decode(start_object_no, iter); + ::decode(end_object_no, iter); + ::decode(new_object_state, iter); + ::decode(current_object_state, iter); + } catch (const buffer::error &err) { + return -EINVAL; + } + + uint64_t size; + int r = cls_cxx_stat(hctx, &size, NULL); + if (r < 0) { + return r; + } + + BitVector<2> object_map; + bufferlist header_bl; + r = cls_cxx_read(hctx, 0, object_map.get_header_length(), &header_bl); + if (r < 0) { + return r; + } + + try { + bufferlist::iterator it = header_bl.begin(); + object_map.decode_header(it); + } catch (const buffer::error &err) { + CLS_ERR("failed to decode object map header: %s", err.what()); + return -EINVAL; + } + + bufferlist footer_bl; + r = cls_cxx_read(hctx, object_map.get_footer_offset(), + size - object_map.get_footer_offset(), &footer_bl); + try { + bufferlist::iterator it = footer_bl.begin(); + object_map.decode_footer(it); + } catch (const buffer::error &err) { + CLS_ERR("failed to decode object map footer: %s", err.what()); + } + + if (start_object_no >= end_object_no || end_object_no > object_map.size()) { + return -ERANGE; + } + + uint64_t byte_offset; + uint64_t byte_length; + object_map.get_data_extents(start_object_no, + end_object_no - start_object_no, + &byte_offset, &byte_length); + + bufferlist data_bl; + r = cls_cxx_read(hctx, object_map.get_header_length() + byte_offset, + byte_length, &data_bl); + if (r < 0) { + return r; + } + + try { + bufferlist::iterator it = data_bl.begin(); + object_map.decode_data(it, byte_offset); + } catch (const buffer::error &err) { + CLS_ERR("failed to decode data chunk [%" PRIu64 "]: %s", + byte_offset, err.what()); + return -EINVAL; + } + + bool updated = false; + for (uint64_t object_no = start_object_no; object_no < end_object_no; + ++object_no) { + uint8_t state = object_map[object_no]; + if ((!current_object_state || state == *current_object_state || + (*current_object_state == OBJECT_EXISTS && + state == OBJECT_EXISTS_CLEAN)) && state != new_object_state) { + object_map[object_no] = new_object_state; + updated = true; + } + } + + if (updated) { + CLS_LOG(20, "object_map_update: %" PRIu64 "~%" PRIu64 " -> %" PRIu64, + byte_offset, byte_length, + object_map.get_header_length() + byte_offset); + + bufferlist data_bl; + object_map.encode_data(data_bl, byte_offset, byte_length); + r = cls_cxx_write(hctx, object_map.get_header_length() + byte_offset, + data_bl.length(), &data_bl); + + footer_bl.clear(); + object_map.encode_footer(footer_bl); + r = cls_cxx_write(hctx, object_map.get_footer_offset(), footer_bl.length(), + &footer_bl); + } + return r; +} + +/** + * Mark all _EXISTS objects as _EXISTS_CLEAN so future writes to the + * image HEAD can be tracked. 
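+ * (Editor's gloss: a later write to such an object flips its entry back to
+ * _EXISTS via object_map_update, so "dirtied since the last snapshot" can be
+ * read straight out of the map.)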
+ * + * Input: + * none + * + * Output: + * @returns 0 on success, negative error code on failure + */ +int object_map_snap_add(cls_method_context_t hctx, bufferlist *in, + bufferlist *out) +{ + BitVector<2> object_map; + int r = object_map_read(hctx, object_map); + if (r < 0) { + return r; + } + + bool updated = false; + for (uint64_t i = 0; i < object_map.size(); ++i) { + if (object_map[i] == OBJECT_EXISTS) { + object_map[i] = OBJECT_EXISTS_CLEAN; + updated = true; + } + } + + if (updated) { + bufferlist bl; + ::encode(object_map, bl); + r = cls_cxx_write_full(hctx, &bl); + } + return r; +} + +/** + * Mark all _EXISTS_CLEAN objects as _EXISTS in the current object map + * if the provided snapshot object map object is marked as _EXISTS. + * + * Input: + * @param snapshot object map bit vector + * + * Output: + * @returns 0 on success, negative error code on failure + */ +int object_map_snap_remove(cls_method_context_t hctx, bufferlist *in, + bufferlist *out) +{ + BitVector<2> src_object_map; + try { + bufferlist::iterator iter = in->begin(); + ::decode(src_object_map, iter); + } catch (const buffer::error &err) { + return -EINVAL; + } + + BitVector<2> dst_object_map; + int r = object_map_read(hctx, dst_object_map); + if (r < 0) { + return r; + } + + bool updated = false; + for (uint64_t i = 0; i < dst_object_map.size(); ++i) { + if (dst_object_map[i] == OBJECT_EXISTS_CLEAN && + (i >= src_object_map.size() || src_object_map[i] == OBJECT_EXISTS)) { + dst_object_map[i] = OBJECT_EXISTS; + updated = true; + } + } + + if (updated) { + bufferlist bl; + ::encode(dst_object_map, bl); + r = cls_cxx_write_full(hctx, &bl); + } + return r; +} + +static const string metadata_key_for_name(const string &name) +{ + return RBD_METADATA_KEY_PREFIX + name; +} + +static const string metadata_name_from_key(const string &key) +{ + return key.substr(strlen(RBD_METADATA_KEY_PREFIX)); +} + +/** + * Input: + * @param start_after which name to begin listing after + * (use the empty string to start at the beginning) + * @param max_return the maximum number of names to lis(if 0 means no limit) + + * Output: + * @param value + * @returns 0 on success, negative error code on failure + */ +int metadata_list(cls_method_context_t hctx, bufferlist *in, bufferlist *out) +{ + string start_after; + uint64_t max_return; + + try { + bufferlist::iterator iter = in->begin(); + ::decode(start_after, iter); + ::decode(max_return, iter); + } catch (const buffer::error &err) { + return -EINVAL; + } + + map data; + string last_read = metadata_key_for_name(start_after); + int max_read = max_return ? 
MIN(RBD_MAX_KEYS_READ, max_return) : RBD_MAX_KEYS_READ;
+
+  do {
+    map<string, bufferlist> raw_data;
+    int r = cls_cxx_map_get_vals(hctx, last_read, RBD_METADATA_KEY_PREFIX,
+                                 max_read, &raw_data);
+    if (r < 0) {
+      CLS_ERR("failed to read the vals off of disk: %s", cpp_strerror(r).c_str());
+      return r;
+    }
+    if (raw_data.empty())
+      break;
+
+    map<string, bufferlist>::iterator it = raw_data.begin();
+    if (metadata_name_from_key(it->first) == last_read)
+      ++it;
+    for (; it != raw_data.end(); ++it)
+      data[metadata_name_from_key(it->first)].swap(it->second);
+
+    last_read = raw_data.rbegin()->first;
+    if (max_return)
+      max_read = MIN(RBD_MAX_KEYS_READ, max_return-data.size());
+  } while (max_return && max_read);
+
+  ::encode(data, *out);
+  return 0;
+}
+
+/**
+ * Input:
+ * @param data map of key/value pairs to set
+ *
+ * Output:
+ * @returns 0 on success, negative error code on failure
+ */
+int metadata_set(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
+{
+  map<string, bufferlist> data, raw_data;
+
+  bufferlist::iterator iter = in->begin();
+  try {
+    ::decode(data, iter);
+  } catch (const buffer::error &err) {
+    return -EINVAL;
+  }
+
+  for (map<string, bufferlist>::iterator it = data.begin();
+       it != data.end(); ++it) {
+    CLS_LOG(20, "metadata_set key=%s value=%.*s", it->first.c_str(),
+            it->second.length(), it->second.c_str());
+    raw_data[metadata_key_for_name(it->first)].swap(it->second);
+  }
+  int r = cls_cxx_map_set_vals(hctx, &raw_data);
+  if (r < 0) {
+    CLS_ERR("error writing metadata: %d", r);
+    return r;
+  }
+
+  return 0;
+}
+
+/**
+ * Input:
+ * @param key metadata key to remove
+ *
+ * Output:
+ * @returns 0 on success, negative error code on failure
+ */
+int metadata_remove(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
+{
+  string key;
+
+  bufferlist::iterator iter = in->begin();
+  try {
+    ::decode(key, iter);
+  } catch (const buffer::error &err) {
+    return -EINVAL;
+  }
+
+  CLS_LOG(20, "metadata_remove key=%s", key.c_str());
+
+  int r = cls_cxx_map_remove_key(hctx, metadata_key_for_name(key));
+  if (r < 0) {
+    CLS_ERR("error removing metadata: %d", r);
+    return r;
+  }
+
+  return 0;
+}
+
+/**
+ * Input:
+ * @param key metadata key to read
+ *
+ * Output:
+ * @param metadata value associated with the key
+ * @returns 0 on success, negative error code on failure
+ */
+int metadata_get(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
+{
+  string key;
+  bufferlist value;
+
+  bufferlist::iterator iter = in->begin();
+  try {
+    ::decode(key, iter);
+  } catch (const buffer::error &err) {
+    return -EINVAL;
+  }
+
+  CLS_LOG(20, "metadata_get key=%s", key.c_str());
+
+  int r = cls_cxx_map_get_val(hctx, metadata_key_for_name(key), &value);
+  if (r < 0) {
+    CLS_ERR("error getting metadata: %d", r);
+    return r;
+  }
+
+  ::encode(value, *out);
+  return 0;
+}
+
+
 /****************************** Old format *******************************/
 
 int old_snapshots_list(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
@@ -1997,6 +2733,9 @@ void __cls_init()
   cls_register_cxx_method(h_class, "get_features",
                           CLS_METHOD_RD,
                           get_features, &h_get_features);
+  cls_register_cxx_method(h_class, "set_features",
+                          CLS_METHOD_RD | CLS_METHOD_WR,
+                          set_features, &h_set_features);
   cls_register_cxx_method(h_class, "get_size",
                           CLS_METHOD_RD,
                           get_size, &h_get_size);
@@ -2045,6 +2784,24 @@ void __cls_init()
   cls_register_cxx_method(h_class, "set_stripe_unit_count",
                           CLS_METHOD_RD | CLS_METHOD_WR,
                           set_stripe_unit_count, &h_set_stripe_unit_count);
+  cls_register_cxx_method(h_class, "get_flags",
+                          CLS_METHOD_RD,
+                          get_flags, &h_get_flags);
+  cls_register_cxx_method(h_class, "set_flags",
+                          CLS_METHOD_RD | CLS_METHOD_WR,
+                          set_flags,
&h_set_flags); + cls_register_cxx_method(h_class, "metadata_list", + CLS_METHOD_RD, + metadata_list, &h_metadata_list); + cls_register_cxx_method(h_class, "metadata_set", + CLS_METHOD_RD | CLS_METHOD_WR, + metadata_set, &h_metadata_set); + cls_register_cxx_method(h_class, "metadata_remove", + CLS_METHOD_RD | CLS_METHOD_WR, + metadata_remove, &h_metadata_remove); + cls_register_cxx_method(h_class, "metadata_get", + CLS_METHOD_RD, + metadata_get, &h_metadata_get); /* methods for the rbd_children object */ cls_register_cxx_method(h_class, "add_child", @@ -2085,7 +2842,27 @@ void __cls_init() CLS_METHOD_RD | CLS_METHOD_WR, dir_rename_image, &h_dir_rename_image); - /* methods for the old format */ + /* methods for the rbd_object_map.$image_id object */ + cls_register_cxx_method(h_class, "object_map_load", + CLS_METHOD_RD, + object_map_load, &h_object_map_load); + cls_register_cxx_method(h_class, "object_map_save", + CLS_METHOD_RD | CLS_METHOD_WR, + object_map_save, &h_object_map_save); + cls_register_cxx_method(h_class, "object_map_resize", + CLS_METHOD_RD | CLS_METHOD_WR, + object_map_resize, &h_object_map_resize); + cls_register_cxx_method(h_class, "object_map_update", + CLS_METHOD_RD | CLS_METHOD_WR, + object_map_update, &h_object_map_update); + cls_register_cxx_method(h_class, "object_map_snap_add", + CLS_METHOD_RD | CLS_METHOD_WR, + object_map_snap_add, &h_object_map_snap_add); + cls_register_cxx_method(h_class, "object_map_snap_remove", + CLS_METHOD_RD | CLS_METHOD_WR, + object_map_snap_remove, &h_object_map_snap_remove); + + /* methods for the old format */ cls_register_cxx_method(h_class, "snap_list", CLS_METHOD_RD, old_snapshots_list, &h_old_snapshots_list); diff --git a/src/cls/rbd/cls_rbd.h b/src/cls/rbd/cls_rbd.h index b0f6e1502ffbf..5f79d5a836569 100644 --- a/src/cls/rbd/cls_rbd.h +++ b/src/cls/rbd/cls_rbd.h @@ -63,6 +63,7 @@ struct cls_rbd_snap { uint64_t features; uint8_t protection_status; cls_rbd_parent parent; + uint64_t flags; /// true if we have a parent bool has_parent() const { @@ -70,20 +71,22 @@ struct cls_rbd_snap { } cls_rbd_snap() : id(CEPH_NOSNAP), image_size(0), features(0), - protection_status(RBD_PROTECTION_STATUS_UNPROTECTED) + protection_status(RBD_PROTECTION_STATUS_UNPROTECTED), + flags(0) {} void encode(bufferlist& bl) const { - ENCODE_START(3, 1, bl); + ENCODE_START(4, 1, bl); ::encode(id, bl); ::encode(name, bl); ::encode(image_size, bl); ::encode(features, bl); ::encode(parent, bl); ::encode(protection_status, bl); + ::encode(flags, bl); ENCODE_FINISH(bl); } void decode(bufferlist::iterator& p) { - DECODE_START(3, p); + DECODE_START(4, p); ::decode(id, p); ::decode(name, p); ::decode(image_size, p); @@ -94,6 +97,9 @@ struct cls_rbd_snap { if (struct_v >= 3) { ::decode(protection_status, p); } + if (struct_v >= 4) { + ::decode(flags, p); + } DECODE_FINISH(p); } void dump(Formatter *f) const { @@ -127,6 +133,7 @@ struct cls_rbd_snap { t->name = "snap"; t->image_size = 123456; t->features = 123; + t->flags = 31; o.push_back(t); t = new cls_rbd_snap; t->id = 2; @@ -138,6 +145,7 @@ struct cls_rbd_snap { t->parent.snapid = 456; t->parent.overlap = 12345; t->protection_status = RBD_PROTECTION_STATUS_PROTECTED; + t->flags = 14; o.push_back(t); } }; diff --git a/src/cls/rbd/cls_rbd_client.cc b/src/cls/rbd/cls_rbd_client.cc index 7bc596a3ec601..0385ec98a1d40 100644 --- a/src/cls/rbd/cls_rbd_client.cc +++ b/src/cls/rbd/cls_rbd_client.cc @@ -47,7 +47,7 @@ namespace librbd { } int get_mutable_metadata(librados::IoCtx *ioctx, const std::string &oid, - uint64_t *size, 
uint64_t *features, + bool read_only, uint64_t *size, uint64_t *features, uint64_t *incompatible_features, map *lockers, @@ -68,11 +68,15 @@ namespace librbd { bufferlist sizebl, featuresbl, parentbl, empty; snapid_t snap = CEPH_NOSNAP; ::encode(snap, sizebl); - ::encode(snap, featuresbl); - ::encode(snap, parentbl); op.exec("rbd", "get_size", sizebl); + + ::encode(snap, featuresbl); + ::encode(read_only, featuresbl); op.exec("rbd", "get_features", featuresbl); + op.exec("rbd", "get_snapcontext", empty); + + ::encode(snap, parentbl); op.exec("rbd", "get_parent", parentbl); rados::cls::lock::get_lock_info_start(&op, RBD_LOCK_NAME); @@ -150,6 +154,18 @@ namespace librbd { return 0; } + int set_features(librados::IoCtx *ioctx, const std::string &oid, + uint64_t features, uint64_t mask) + { + bufferlist inbl; + ::encode(features, inbl); + ::encode(mask, inbl); + + librados::ObjectWriteOperation op; + op.exec("rbd", "set_features", inbl); + return ioctx->operate(oid, &op); + } + int get_object_prefix(librados::IoCtx *ioctx, const std::string &oid, std::string *object_prefix) { @@ -190,12 +206,18 @@ namespace librbd { } int set_size(librados::IoCtx *ioctx, const std::string &oid, - uint64_t size) + uint64_t size) { - bufferlist bl, bl2; - ::encode(size, bl); + librados::ObjectWriteOperation op; + set_size(&op, size); + return ioctx->operate(oid, &op); + } - return ioctx->exec(oid, "rbd", "set_size", bl, bl2); + void set_size(librados::ObjectWriteOperation *op, uint64_t size) + { + bufferlist bl; + ::encode(size, bl); + op->exec("rbd", "set_size", bl); } int get_parent(librados::IoCtx *ioctx, const std::string &oid, @@ -234,10 +256,63 @@ namespace librbd { return ioctx->exec(oid, "rbd", "set_parent", inbl, outbl); } + int get_flags(librados::IoCtx *ioctx, const std::string &oid, + uint64_t *flags, const std::vector &snap_ids, + vector *snap_flags) + { + bufferlist inbl; + ::encode(static_cast(CEPH_NOSNAP), inbl); + + librados::ObjectReadOperation op; + op.exec("rbd", "get_flags", inbl); + for (size_t i = 0; i < snap_ids.size(); ++i) { + bufferlist snapbl; + ::encode(snap_ids[i], snapbl); + op.exec("rbd", "get_flags", snapbl); + } + + snap_flags->clear(); + snap_flags->resize(snap_ids.size()); + + bufferlist outbl; + int r = ioctx->operate(oid, &op, &outbl); + if (r < 0) { + return r; + } + + try { + bufferlist::iterator iter = outbl.begin(); + ::decode(*flags, iter); + for (size_t i = 0; i < snap_ids.size(); ++i) { + ::decode((*snap_flags)[i], iter); + } + } catch (const buffer::error &err) { + return -EBADMSG; + } + return 0; + } + + void set_flags(librados::ObjectWriteOperation *op, snapid_t snap_id, + uint64_t flags, uint64_t mask) + { + bufferlist inbl; + ::encode(flags, inbl); + ::encode(mask, inbl); + ::encode(snap_id, inbl); + op->exec("rbd", "set_flags", inbl); + } + int remove_parent(librados::IoCtx *ioctx, const std::string &oid) { - bufferlist inbl, outbl; - return ioctx->exec(oid, "rbd", "remove_parent", inbl, outbl); + librados::ObjectWriteOperation op; + remove_parent(&op); + return ioctx->operate(oid, &op); + } + + void remove_parent(librados::ObjectWriteOperation *op) + { + bufferlist inbl; + op->exec("rbd", "remove_parent", inbl); } int add_child(librados::IoCtx *ioctx, const std::string &oid, @@ -252,16 +327,23 @@ namespace librbd { return ioctx->exec(oid, "rbd", "add_child", in, out); } - int remove_child(librados::IoCtx *ioctx, const std::string &oid, - parent_spec pspec, const std::string &c_imageid) + void remove_child(librados::ObjectWriteOperation *op, + parent_spec pspec, 
const std::string &c_imageid) { - bufferlist in, out; + bufferlist in; ::encode(pspec.pool_id, in); ::encode(pspec.image_id, in); ::encode(pspec.snap_id, in); ::encode(c_imageid, in); + op->exec("rbd", "remove_child", in); + } - return ioctx->exec(oid, "rbd", "remove_child", in, out); + int remove_child(librados::IoCtx *ioctx, const std::string &oid, + parent_spec pspec, const std::string &c_imageid) + { + librados::ObjectWriteOperation op; + remove_child(&op, pspec, c_imageid); + return ioctx->operate(oid, &op); } int get_children(librados::IoCtx *ioctx, const std::string &oid, @@ -287,11 +369,18 @@ namespace librbd { int snapshot_add(librados::IoCtx *ioctx, const std::string &oid, snapid_t snap_id, const std::string &snap_name) { - bufferlist bl, bl2; + librados::ObjectWriteOperation op; + snapshot_add(&op, snap_id, snap_name); + return ioctx->operate(oid, &op); + } + + void snapshot_add(librados::ObjectWriteOperation *op, snapid_t snap_id, + const std::string &snap_name) + { + bufferlist bl; ::encode(snap_name, bl); ::encode(snap_id, bl); - - return ioctx->exec(oid, "rbd", "snapshot_add", bl, bl2); + op->exec("rbd", "snapshot_add", bl); } int snapshot_remove(librados::IoCtx *ioctx, const std::string &oid, @@ -329,7 +418,6 @@ namespace librbd { const std::vector &ids, std::vector *names, std::vector *sizes, - std::vector *features, std::vector *parents, std::vector *protection_statuses) { @@ -337,8 +425,6 @@ namespace librbd { names->resize(ids.size()); sizes->clear(); sizes->resize(ids.size()); - features->clear(); - features->resize(ids.size()); parents->clear(); parents->resize(ids.size()); protection_statuses->clear(); @@ -348,17 +434,15 @@ namespace librbd { for (vector::const_iterator it = ids.begin(); it != ids.end(); ++it) { snapid_t snap_id = it->val; - bufferlist bl1, bl2, bl3, bl4, bl5; + bufferlist bl1, bl2, bl3, bl4; ::encode(snap_id, bl1); op.exec("rbd", "get_snapshot_name", bl1); ::encode(snap_id, bl2); op.exec("rbd", "get_size", bl2); ::encode(snap_id, bl3); - op.exec("rbd", "get_features", bl3); + op.exec("rbd", "get_parent", bl3); ::encode(snap_id, bl4); - op.exec("rbd", "get_parent", bl4); - ::encode(snap_id, bl5); - op.exec("rbd", "get_protection_status", bl5); + op.exec("rbd", "get_protection_status", bl4); } bufferlist outbl; @@ -370,15 +454,11 @@ namespace librbd { bufferlist::iterator iter = outbl.begin(); for (size_t i = 0; i < ids.size(); ++i) { uint8_t order; - uint64_t incompat_features; // get_snapshot_name ::decode((*names)[i], iter); // get_size ::decode(order, iter); ::decode((*sizes)[i], iter); - // get_features - ::decode((*features)[i], iter); - ::decode(incompat_features, iter); // get_parent ::decode((*parents)[i].spec.pool_id, iter); ::decode((*parents)[i].spec.image_id, iter); @@ -631,5 +711,135 @@ namespace librbd { ::encode(id, in); return ioctx->exec(oid, "rbd", "dir_rename_image", in, out); } + + int object_map_load(librados::IoCtx *ioctx, const std::string &oid, + ceph::BitVector<2> *object_map) + { + bufferlist in; + bufferlist out; + int r = ioctx->exec(oid, "rbd", "object_map_load", in, out); + if (r < 0) { + return r; + } + + try { + bufferlist::iterator iter = out.begin(); + ::decode(*object_map, iter); + } catch (const buffer::error &err) { + return -EBADMSG; + } + return 0; + } + + void object_map_save(librados::ObjectWriteOperation *rados_op, + const ceph::BitVector<2> &object_map) + { + ceph::BitVector<2> object_map_copy(object_map); + object_map_copy.set_crc_enabled(false); + + bufferlist in; + ::encode(object_map_copy, in); + 
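+      // (editor's gloss) CRC bookkeeping is disabled on the local copy before
+      // encoding; the cls method re-encodes the map on the OSD side, which
+      // regenerates the checksums.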
rados_op->exec("rbd", "object_map_save", in); + } + + void object_map_resize(librados::ObjectWriteOperation *rados_op, + uint64_t object_count, uint8_t default_state) + { + bufferlist in; + ::encode(object_count, in); + ::encode(default_state, in); + rados_op->exec("rbd", "object_map_resize", in); + } + + void object_map_update(librados::ObjectWriteOperation *rados_op, + uint64_t start_object_no, uint64_t end_object_no, + uint8_t new_object_state, + const boost::optional ¤t_object_state) + { + bufferlist in; + ::encode(start_object_no, in); + ::encode(end_object_no, in); + ::encode(new_object_state, in); + ::encode(current_object_state, in); + rados_op->exec("rbd", "object_map_update", in); + } + + void object_map_snap_add(librados::ObjectWriteOperation *rados_op) + { + bufferlist in; + rados_op->exec("rbd", "object_map_snap_add", in); + } + + void object_map_snap_remove(librados::ObjectWriteOperation *rados_op, + const ceph::BitVector<2> &object_map) + { + ceph::BitVector<2> object_map_copy(object_map); + object_map_copy.set_crc_enabled(false); + + bufferlist in; + ::encode(object_map_copy, in); + rados_op->exec("rbd", "object_map_snap_remove", in); + } + + int metadata_set(librados::IoCtx *ioctx, const std::string &oid, + const map &data) + { + bufferlist in; + ::encode(data, in); + bufferlist out; + return ioctx->exec(oid, "rbd", "metadata_set", in, out); + } + + int metadata_remove(librados::IoCtx *ioctx, const std::string &oid, + const std::string &key) + { + bufferlist in; + ::encode(key, in); + bufferlist out; + return ioctx->exec(oid, "rbd", "metadata_remove", in, out); + } + + int metadata_list(librados::IoCtx *ioctx, const std::string &oid, + const std::string &start, uint64_t max_return, + map *pairs) + { + assert(pairs); + bufferlist in, out; + ::encode(start, in); + ::encode(max_return, in); + int r = ioctx->exec(oid, "rbd", "metadata_list", in, out); + if (r < 0) + return r; + + bufferlist::iterator iter = out.begin(); + try { + ::decode(*pairs, iter); + } catch (const buffer::error &err) { + return -EBADMSG; + } + + return 0; + } + + int metadata_get(librados::IoCtx *ioctx, const std::string &oid, + const std::string &key, string *s) + { + assert(s); + bufferlist in, out; + ::encode(key, in); + int r = ioctx->exec(oid, "rbd", "metadata_get", in, out); + if (r < 0) + return r; + + bufferlist::iterator iter = out.begin(); + try { + ::decode(*s, iter); + } catch (const buffer::error &err) { + return -EBADMSG; + } + + return 0; + } + } // namespace cls_client } // namespace librbd diff --git a/src/cls/rbd/cls_rbd_client.h b/src/cls/rbd/cls_rbd_client.h index 34cc45b839099..486d17f36d8d6 100644 --- a/src/cls/rbd/cls_rbd_client.h +++ b/src/cls/rbd/cls_rbd_client.h @@ -5,6 +5,7 @@ #define CEPH_LIBRBD_CLS_RBD_CLIENT_H #include "cls/lock/cls_lock_types.h" +#include "common/bit_vector.hpp" #include "common/snap_types.h" #include "include/rados/librados.hpp" #include "include/types.h" @@ -19,7 +20,7 @@ namespace librbd { int get_immutable_metadata(librados::IoCtx *ioctx, const std::string &oid, std::string *object_prefix, uint8_t *order); int get_mutable_metadata(librados::IoCtx *ioctx, const std::string &oid, - uint64_t *size, uint64_t *features, + bool read_only, uint64_t *size, uint64_t *features, uint64_t *incompatible_features, map *lockers, @@ -34,28 +35,39 @@ namespace librbd { const std::string &object_prefix); int get_features(librados::IoCtx *ioctx, const std::string &oid, snapid_t snap_id, uint64_t *features); + int set_features(librados::IoCtx *ioctx, const std::string 
&oid, + uint64_t features, uint64_t mask); int get_object_prefix(librados::IoCtx *ioctx, const std::string &oid, std::string *object_prefix); int get_size(librados::IoCtx *ioctx, const std::string &oid, snapid_t snap_id, uint64_t *size, uint8_t *order); int set_size(librados::IoCtx *ioctx, const std::string &oid, uint64_t size); - int set_size(librados::IoCtx *ioctx, const std::string &oid, - uint64_t size); + void set_size(librados::ObjectWriteOperation *op, uint64_t size); int get_parent(librados::IoCtx *ioctx, const std::string &oid, snapid_t snap_id, parent_spec *pspec, uint64_t *parent_overlap); int set_parent(librados::IoCtx *ioctx, const std::string &oid, parent_spec pspec, uint64_t parent_overlap); + int get_flags(librados::IoCtx *ioctx, const std::string &oid, + uint64_t *flags, const std::vector &snap_ids, + vector *snap_flags); + void set_flags(librados::ObjectWriteOperation *op, snapid_t snap_id, + uint64_t flags, uint64_t mask); int remove_parent(librados::IoCtx *ioctx, const std::string &oid); + void remove_parent(librados::ObjectWriteOperation *op); int add_child(librados::IoCtx *ioctx, const std::string &oid, parent_spec pspec, const std::string &c_imageid); + void remove_child(librados::ObjectWriteOperation *op, + parent_spec pspec, const std::string &c_imageid); int remove_child(librados::IoCtx *ioctx, const std::string &oid, parent_spec pspec, const std::string &c_imageid); int get_children(librados::IoCtx *ioctx, const std::string &oid, parent_spec pspec, set& children); int snapshot_add(librados::IoCtx *ioctx, const std::string &oid, snapid_t snap_id, const std::string &snap_name); + void snapshot_add(librados::ObjectWriteOperation *op, snapid_t snap_id, + const std::string &snap_name); int snapshot_remove(librados::IoCtx *ioctx, const std::string &oid, snapid_t snap_id); int get_snapcontext(librados::IoCtx *ioctx, const std::string &oid, @@ -64,7 +76,6 @@ namespace librbd { const std::vector &ids, std::vector *names, std::vector *sizes, - std::vector *features, std::vector *parents, std::vector *protection_statuses); int copyup(librados::IoCtx *ioctx, const std::string &oid, @@ -77,6 +88,15 @@ namespace librbd { uint64_t *stripe_unit, uint64_t *stripe_count); int set_stripe_unit_count(librados::IoCtx *ioctx, const std::string &oid, uint64_t stripe_unit, uint64_t stripe_count); + int metadata_list(librados::IoCtx *ioctx, const std::string &oid, + const std::string &start, uint64_t max_return, + map *pairs); + int metadata_set(librados::IoCtx *ioctx, const std::string &oid, + const map &data); + int metadata_remove(librados::IoCtx *ioctx, const std::string &oid, + const std::string &key); + int metadata_get(librados::IoCtx *ioctx, const std::string &oid, + const std::string &key, string *v); // operations on rbd_id objects int get_id(librados::IoCtx *ioctx, const std::string &oid, std::string *id); @@ -99,6 +119,21 @@ namespace librbd { const std::string &src, const std::string &dest, const std::string &id); + // operations on the rbd_object_map.$image_id object + int object_map_load(librados::IoCtx *ioctx, const std::string &oid, + ceph::BitVector<2> *object_map); + void object_map_save(librados::ObjectWriteOperation *rados_op, + const ceph::BitVector<2> &object_map); + void object_map_resize(librados::ObjectWriteOperation *rados_op, + uint64_t object_count, uint8_t default_state); + void object_map_update(librados::ObjectWriteOperation *rados_op, + uint64_t start_object_no, uint64_t end_object_no, + uint8_t new_object_state, + const boost::optional ¤t_object_state); + 
void object_map_snap_add(librados::ObjectWriteOperation *rados_op); + void object_map_snap_remove(librados::ObjectWriteOperation *rados_op, + const ceph::BitVector<2> &object_map); + // class operations on the old format, kept for // backwards compatability int old_snapshot_add(librados::IoCtx *ioctx, const std::string &oid, diff --git a/src/cls/refcount/cls_refcount.cc b/src/cls/refcount/cls_refcount.cc index 5e8edeb887af8..c97460f65cd7f 100644 --- a/src/cls/refcount/cls_refcount.cc +++ b/src/cls/refcount/cls_refcount.cc @@ -53,7 +53,7 @@ static int read_refcount(cls_method_context_t hctx, bool implicit_ref, obj_refco bufferlist bl; objr->refs.clear(); int ret = cls_cxx_getxattr(hctx, REFCOUNT_ATTR, &bl); - if (ret == -ENOENT || ret == -ENODATA) { + if (ret == -ENODATA) { if (implicit_ref) { objr->refs[wildcard_tag] = true; } diff --git a/src/cls/rgw/cls_rgw.cc b/src/cls/rgw/cls_rgw.cc index eb4a4232d1892..db488e40fb62c 100644 --- a/src/cls/rgw/cls_rgw.cc +++ b/src/cls/rgw/cls_rgw.cc @@ -13,6 +13,8 @@ #include "objclass/objclass.h" #include "cls/rgw/cls_rgw_ops.h" #include "common/Clock.h" +#include "common/strtol.h" +#include "common/escape.h" #include "global/global_context.h" @@ -27,6 +29,16 @@ cls_method_handle_t h_rgw_bucket_check_index; cls_method_handle_t h_rgw_bucket_rebuild_index; cls_method_handle_t h_rgw_bucket_prepare_op; cls_method_handle_t h_rgw_bucket_complete_op; +cls_method_handle_t h_rgw_bucket_link_olh; +cls_method_handle_t h_rgw_bucket_unlink_instance_op; +cls_method_handle_t h_rgw_bucket_read_olh_log; +cls_method_handle_t h_rgw_bucket_trim_olh_log; +cls_method_handle_t h_rgw_bucket_clear_olh; +cls_method_handle_t h_rgw_obj_remove; +cls_method_handle_t h_rgw_obj_check_attrs_prefix; +cls_method_handle_t h_rgw_bi_get_op; +cls_method_handle_t h_rgw_bi_put_op; +cls_method_handle_t h_rgw_bi_list_op; cls_method_handle_t h_rgw_bi_log_list_op; cls_method_handle_t h_rgw_dir_suggest_changes; cls_method_handle_t h_rgw_user_usage_log_add; @@ -42,13 +54,17 @@ cls_method_handle_t h_rgw_gc_remove; #define BI_PREFIX_CHAR 0x80 -#define BI_BUCKET_OBJS_INDEX 0 -#define BI_BUCKET_LOG_INDEX 1 +#define BI_BUCKET_OBJS_INDEX 0 +#define BI_BUCKET_LOG_INDEX 1 +#define BI_BUCKET_OBJ_INSTANCE_INDEX 2 +#define BI_BUCKET_OLH_DATA_INDEX 3 -#define BI_BUCKET_LAST_INDEX 2 +#define BI_BUCKET_LAST_INDEX 4 -static string bucket_index_prefixes[] = { "", /* special handling for the objs index */ - "0_", +static string bucket_index_prefixes[] = { "", /* special handling for the objs list index */ + "0_", /* bucket log index */ + "1000_", /* obj instance index */ + "1001_", /* olh data index */ /* this must be the last index */ "9999_",}; @@ -73,7 +89,7 @@ int bi_entry_type(const string& s) ++i) { const string& t = bucket_index_prefixes[i]; - if (s.compare(0, t.size(), t) == 0) { + if (s.compare(1, t.size(), t) == 0) { return i; } } @@ -81,6 +97,20 @@ int bi_entry_type(const string& s) return -EINVAL; } +static bool bi_entry_gt(const string& first, const string& second) +{ + int fi = bi_entry_type(first); + int si = bi_entry_type(second); + + if (fi > si) { + return true; + } else if (fi < si) { + return false; + } + + return first > second; +} + static void get_time_key(utime_t& ut, string *key) { char buf[32]; @@ -106,22 +136,24 @@ static void bi_log_index_key(cls_method_context_t hctx, string& key, string& id, key.append(id); } -static int log_index_operation(cls_method_context_t hctx, string& obj, RGWModifyOp op, +static int log_index_operation(cls_method_context_t hctx, cls_rgw_obj_key& obj_key, 
RGWModifyOp op, string& tag, utime_t& timestamp, rgw_bucket_entry_ver& ver, RGWPendingState state, uint64_t index_ver, - string& max_marker) + string& max_marker, uint16_t bilog_flags) { bufferlist bl; struct rgw_bi_log_entry entry; - entry.object = obj; + entry.object = obj_key.name; + entry.instance = obj_key.instance; entry.timestamp = timestamp; entry.op = op; entry.ver = ver; entry.state = state; entry.index_ver = index_ver; entry.tag = tag; + entry.bilog_flags = bilog_flags; string key; bi_log_index_key(hctx, key, entry.id, index_ver); @@ -187,6 +219,187 @@ static int get_obj_vals(cls_method_context_t hctx, const string& start, const st return 0; } +/* + * get a monotonically decreasing string representation: + * for any two numbers x > y, str(x) < str(y). + * Another property is that the string size starts short and grows as num increases. + */ +static void decreasing_str(uint64_t num, string *str) +{ + char buf[32]; + if (num < 0x10) { /* 16 */ + snprintf(buf, sizeof(buf), "9%02lld", 15 - (long long)num); + } else if (num < 0x100) { /* 256 */ + snprintf(buf, sizeof(buf), "8%03lld", 255 - (long long)num); + } else if (num < 0x1000) /* 4096 */ { + snprintf(buf, sizeof(buf), "7%04lld", 4095 - (long long)num); + } else if (num < 0x10000) /* 65536 */ { + snprintf(buf, sizeof(buf), "6%05lld", 65535 - (long long)num); + } else if (num < 0x100000000) /* 4G */ { + snprintf(buf, sizeof(buf), "5%010lld", 0xFFFFFFFF - (long long)num); + } else { + snprintf(buf, sizeof(buf), "4%020lld", (long long)-num); + } + + *str = buf; +} + +/* + * we now hold two different indexes for objects. The first one holds the list of objects in the + * order that we want them to be listed. The second one only holds the object instances (for + * versioned objects), and they're not arranged in any particular order. + * When listing objects we'll use the first index; when doing operations on the objects themselves + * we'll use the second index.
Note that regular objects only map to the first index anyway + */ + +static void get_list_index_key(struct rgw_bucket_dir_entry& entry, string *index_key) +{ + *index_key = entry.key.name; + + string ver_str; + decreasing_str(entry.versioned_epoch, &ver_str); + string instance_delim("\0i", 2); + string ver_delim("\0v", 2); + + index_key->append(ver_delim); + index_key->append(ver_str); + index_key->append(instance_delim); + index_key->append(entry.key.instance); +} + +static void encode_obj_versioned_data_key(const cls_rgw_obj_key& key, string *index_key, bool append_delete_marker_suffix = false) +{ + *index_key = BI_PREFIX_CHAR; + index_key->append(bucket_index_prefixes[BI_BUCKET_OBJ_INSTANCE_INDEX]); + index_key->append(key.name); + string delim("\0i", 2); + index_key->append(delim); + index_key->append(key.instance); + if (append_delete_marker_suffix) { + string dm("\0d", 2); + index_key->append(dm); + } +} + +static void encode_obj_index_key(const cls_rgw_obj_key& key, string *index_key) +{ + if (key.instance.empty()) { + *index_key = key.name; + } else { + encode_obj_versioned_data_key(key, index_key); + } +} + +static void encode_olh_data_key(const cls_rgw_obj_key& key, string *index_key) +{ + *index_key = BI_PREFIX_CHAR; + index_key->append(bucket_index_prefixes[BI_BUCKET_OLH_DATA_INDEX]); + index_key->append(key.name); +} + +template <class T> +static int read_index_entry(cls_method_context_t hctx, string& name, T *entry); + +static int encode_list_index_key(cls_method_context_t hctx, const cls_rgw_obj_key& key, string *index_key) +{ + if (key.instance.empty()) { + *index_key = key.name; + return 0; + } + + string obj_index_key; + encode_obj_index_key(key, &obj_index_key); + + rgw_bucket_dir_entry entry; + + int ret = read_index_entry(hctx, obj_index_key, &entry); + if (ret == -ENOENT) { + /* couldn't find the entry, set key value after the current object */ + char buf[2] = { 0x1, 0 }; + string s(buf); + *index_key = key.name + s; + return 0; + } + if (ret < 0) { + CLS_LOG(1, "ERROR: encode_list_index_key(): cls_cxx_map_get_val returned %d\n", ret); + return ret; + } + + get_list_index_key(entry, index_key); + + return 0; +} + +static void split_key(const string& key, list<string>& vals) +{ + size_t pos = 0; + const char *p = key.c_str(); + while (pos < key.size()) { + size_t len = strlen(p); + vals.push_back(p); + pos += len + 1; + p += len + 1; + } +} + +/* + * list index key structure: + * + * <obj name>\0[v<ver>\0i<instance>] + */ +static void decode_list_index_key(const string& index_key, cls_rgw_obj_key *key, uint64_t *ver) +{ + size_t len = strlen(index_key.c_str()); + + key->instance.clear(); + *ver = 0; + + if (len == index_key.size()) { + key->name = index_key; + return; + } + + list<string> vals; + split_key(index_key, vals); + + assert(!vals.empty()); + + list<string>::iterator iter = vals.begin(); + key->name = *iter; + ++iter; + + assert(iter != vals.end()); + + for (; iter != vals.end(); ++iter) { + string& val = *iter; + if (val[0] == 'i') { + key->instance = val.substr(1); + } else if (val[0] == 'v') { + string err; + const char *s = val.c_str() + 1; + *ver = strict_strtoll(s, 10, &err); + assert(err.empty()); + } + } +} + +static int read_bucket_header(cls_method_context_t hctx, struct rgw_bucket_dir_header *header) +{ + bufferlist bl; + int rc = cls_cxx_map_read_header(hctx, &bl); + if (rc < 0) + return rc; + bufferlist::iterator iter = bl.begin(); + try { + ::decode(*header, iter); + } catch (buffer::error& err) { + CLS_LOG(1, "ERROR: read_bucket_header(): failed to decode header\n"); + return -EIO; + } + + return 0; +}
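(Aside: to make the ordering property of decreasing_str() concrete, here is a minimal standalone sketch; it is not part of the patch, the driver and the helper name are hypothetical, and it mirrors only the first two branches of the function above. A larger epoch yields a lexicographically smaller key, which is why newer versions sort first in the list index.)

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <string>

// Hypothetical helper mirroring the first two branches of decreasing_str();
// inputs >= 256 are out of scope for this sketch.
static void decreasing_str_small(uint64_t num, std::string *str)
{
  assert(num < 0x100);
  char buf[32];
  if (num < 0x10) {
    snprintf(buf, sizeof(buf), "9%02lld", 15 - (long long)num);
  } else {
    snprintf(buf, sizeof(buf), "8%03lld", 255 - (long long)num);
  }
  *str = buf;
}

int main()
{
  std::string a, b, c;
  decreasing_str_small(3, &a);   // "912"
  decreasing_str_small(7, &b);   // "908"
  decreasing_str_small(20, &c);  // "8235"
  assert(b < a);                 // within one range: larger num -> smaller string
  assert(c < b);                 // across ranges: the leading digit preserves the order
  printf("str(3)=%s str(7)=%s str(20)=%s\n", a.c_str(), b.c_str(), c.c_str());
  return 0;
}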
+ int rgw_bucket_list(cls_method_context_t hctx, bufferlist *in, bufferlist *out) { bufferlist::iterator iter = in->begin(); @@ -201,22 +414,18 @@ int rgw_bucket_list(cls_method_context_t hctx, bufferlist *in, bufferlist *out) struct rgw_cls_list_ret ret; struct rgw_bucket_dir& new_dir = ret.dir; - bufferlist header_bl; - int rc = cls_cxx_map_read_header(hctx, &header_bl); - if (rc < 0) + int rc = read_bucket_header(hctx, &new_dir.header); + if (rc < 0) { + CLS_LOG(1, "ERROR: rgw_bucket_list(): failed to read header\n"); return rc; - bufferlist::iterator header_iter = header_bl.begin(); - try { - ::decode(new_dir.header, header_iter); - } catch (buffer::error& err) { - CLS_LOG(1, "ERROR: rgw_bucket_list(): failed to decode header\n"); - return -EINVAL; } bufferlist bl; map keys; - rc = get_obj_vals(hctx, op.start_obj, op.filter_prefix, op.num_entries + 1, &keys); + string start_key; + encode_list_index_key(hctx, op.start_obj, &start_key); + rc = get_obj_vals(hctx, start_key, op.filter_prefix, op.num_entries + 1, &keys); if (rc < 0) return rc; @@ -242,8 +451,23 @@ int rgw_bucket_list(cls_method_context_t hctx, bufferlist *in, bufferlist *out) CLS_LOG(1, "ERROR: rgw_bucket_list(): failed to decode entry, key=%s\n", kiter->first.c_str()); return -EINVAL; } - + + cls_rgw_obj_key key; + uint64_t ver; + decode_list_index_key(kiter->first, &key, &ver); + + if (!entry.is_valid()) { + CLS_LOG(20, "entry %s[%s] is not valid\n", key.name.c_str(), key.instance.c_str()); + continue; + } + + if (!op.list_versions && !entry.is_visible()) { + CLS_LOG(20, "entry %s[%s] is not visible\n", key.name.c_str(), key.instance.c_str()); + continue; + } m[kiter->first] = entry; + + CLS_LOG(20, "got entry %s[%s] m.size()=%d\n", key.name.c_str(), key.instance.c_str(), (int)m.size()); } ret.is_truncated = (kiter != keys.end() && !done); @@ -254,16 +478,10 @@ int rgw_bucket_list(cls_method_context_t hctx, bufferlist *in, bufferlist *out) static int check_index(cls_method_context_t hctx, struct rgw_bucket_dir_header *existing_header, struct rgw_bucket_dir_header *calc_header) { - bufferlist header_bl; - int rc = cls_cxx_map_read_header(hctx, &header_bl); - if (rc < 0) + int rc = read_bucket_header(hctx, existing_header); + if (rc < 0) { + CLS_LOG(1, "ERROR: check_index(): failed to read header\n"); return rc; - bufferlist::iterator header_iter = header_bl.begin(); - try { - ::decode(*existing_header, header_iter); - } catch (buffer::error& err) { - CLS_LOG(1, "ERROR: rgw_bucket_list(): failed to decode header\n"); - return -EINVAL; } calc_header->tag_timeout = existing_header->tag_timeout; @@ -300,8 +518,8 @@ static int check_index(cls_method_context_t hctx, struct rgw_bucket_dir_header * } struct rgw_bucket_category_stats& stats = calc_header->stats[entry.meta.category]; stats.num_entries++; - stats.total_size += entry.meta.size; - stats.total_size_rounded += get_rounded_size(entry.meta.size); + stats.total_size += entry.meta.accounted_size; + stats.total_size_rounded += get_rounded_size(entry.meta.accounted_size); start_obj = kiter->first; } @@ -384,17 +602,11 @@ int rgw_bucket_set_tag_timeout(cls_method_context_t hctx, bufferlist *in, buffer return -EINVAL; } - bufferlist header_bl; struct rgw_bucket_dir_header header; - int rc = cls_cxx_map_read_header(hctx, &header_bl); - if (rc < 0) + int rc = read_bucket_header(hctx, &header); + if (rc < 0) { + CLS_LOG(1, "ERROR: rgw_bucket_complete_op(): failed to read header\n"); return rc; - bufferlist::iterator header_iter = header_bl.begin(); - try { - ::decode(header, 
header_iter); - } catch (buffer::error& err) { - CLS_LOG(1, "ERROR: rgw_bucket_complete_op(): failed to decode header\n"); - return -EINVAL; } header.tag_timeout = op.tag_timeout; @@ -402,6 +614,9 @@ int rgw_bucket_set_tag_timeout(cls_method_context_t hctx, bufferlist *in, buffer return write_bucket_header(hctx, &header); } +static int read_key_entry(cls_method_context_t hctx, cls_rgw_obj_key& key, string *idx, struct rgw_bucket_dir_entry *entry, + bool special_delete_marker_name = false); + int rgw_bucket_prepare_op(cls_method_context_t hctx, bufferlist *in, bufferlist *out) { // decode request @@ -419,63 +634,45 @@ int rgw_bucket_prepare_op(cls_method_context_t hctx, bufferlist *in, bufferlist return -EINVAL; } - CLS_LOG(1, "rgw_bucket_prepare_op(): request: op=%d name=%s tag=%s\n", op.op, op.name.c_str(), op.tag.c_str()); + CLS_LOG(1, "rgw_bucket_prepare_op(): request: op=%d name=%s instance=%s tag=%s\n", + op.op, op.key.name.c_str(), op.key.instance.c_str(), op.tag.c_str()); // get on-disk state - bufferlist cur_value; - int rc = cls_cxx_map_get_val(hctx, op.name, &cur_value); - if (rc < 0 && rc != -ENOENT) - return rc; + string idx; struct rgw_bucket_dir_entry entry; + int rc = read_key_entry(hctx, op.key, &idx, &entry); + if (rc < 0 && rc != -ENOENT) + return rc; bool noent = (rc == -ENOENT); rc = 0; - if (!noent) { - try { - bufferlist::iterator biter = cur_value.begin(); - ::decode(entry, biter); - } catch (buffer::error& err) { - CLS_LOG(1, "ERROR: rgw_bucket_prepare_op(): failed to decode entry\n"); - /* ignoring error */ - - noent = true; - } - } - if (noent) { // no entry, initialize fields - entry.name = op.name; + entry.key = op.key; entry.ver = rgw_bucket_entry_ver(); entry.exists = false; entry.locator = op.locator; } // fill in proper state - struct rgw_bucket_pending_info& info = entry.pending_map[op.tag]; + struct rgw_bucket_pending_info info; info.timestamp = ceph_clock_now(g_ceph_context); info.state = CLS_RGW_STATE_PENDING_MODIFY; info.op = op.op; + entry.pending_map.insert(pair(op.tag, info)); - - bufferlist header_bl; struct rgw_bucket_dir_header header; - rc = cls_cxx_map_read_header(hctx, &header_bl); - if (rc < 0) + rc = read_bucket_header(hctx, &header); + if (rc < 0) { + CLS_LOG(1, "ERROR: rgw_bucket_complete_op(): failed to read header\n"); return rc; - - bufferlist::iterator header_iter = header_bl.begin(); - try { - ::decode(header, header_iter); - } catch (buffer::error& err) { - CLS_LOG(1, "ERROR: rgw_bucket_complete_op(): failed to decode header\n"); - return -EINVAL; } if (op.log_op) { - rc = log_index_operation(hctx, op.name, op.op, op.tag, entry.meta.mtime, - entry.ver, info.state, header.ver, header.max_marker); + rc = log_index_operation(hctx, op.key, op.op, op.tag, entry.meta.mtime, + entry.ver, info.state, header.ver, header.max_marker, op.bilog_flags); if (rc < 0) return rc; } @@ -483,7 +680,7 @@ int rgw_bucket_prepare_op(cls_method_context_t hctx, bufferlist *in, bufferlist // write out new key to disk bufferlist info_bl; ::encode(entry, info_bl); - rc = cls_cxx_map_set_val(hctx, op.name, &info_bl); + rc = cls_cxx_map_set_val(hctx, idx, &info_bl); if (rc < 0) return rc; @@ -494,11 +691,26 @@ static void unaccount_entry(struct rgw_bucket_dir_header& header, struct rgw_buc { struct rgw_bucket_category_stats& stats = header.stats[entry.meta.category]; stats.num_entries--; - stats.total_size -= entry.meta.size; - stats.total_size_rounded -= get_rounded_size(entry.meta.size); + stats.total_size -= entry.meta.accounted_size; + stats.total_size_rounded 
-= get_rounded_size(entry.meta.accounted_size); +} + +static void log_entry(const char *func, const char *str, struct rgw_bucket_dir_entry *entry) +{ + CLS_LOG(1, "%s(): %s: ver=%ld:%llu name=%s instance=%s locator=%s\n", func, str, + (long)entry->ver.pool, (unsigned long long)entry->ver.epoch, + entry->key.name.c_str(), entry->key.instance.c_str(), entry->locator.c_str()); } -static int read_index_entry(cls_method_context_t hctx, string& name, struct rgw_bucket_dir_entry *entry) +static void log_entry(const char *func, const char *str, struct rgw_bucket_olh_entry *entry) +{ + CLS_LOG(1, "%s(): %s: epoch=%llu name=%s instance=%s tag=%s\n", func, str, + (unsigned long long)entry->epoch, entry->key.name.c_str(), entry->key.instance.c_str(), + entry->tag.c_str()); +} + +template +static int read_index_entry(cls_method_context_t hctx, string& name, T *entry) { bufferlist current_entry; int rc = cls_cxx_map_get_val(hctx, name, ¤t_entry); @@ -514,9 +726,39 @@ static int read_index_entry(cls_method_context_t hctx, string& name, struct rgw_ return -EIO; } - CLS_LOG(1, "read_index_entry(): existing entry: ver=%ld:%llu name=%s locator=%s\n", - (long)entry->ver.pool, (unsigned long long)entry->ver.epoch, - entry->name.c_str(), entry->locator.c_str()); + log_entry(__func__, "existing entry", entry); + return 0; +} + +static int read_key_entry(cls_method_context_t hctx, cls_rgw_obj_key& key, string *idx, struct rgw_bucket_dir_entry *entry, + bool special_delete_marker_name) +{ + encode_obj_index_key(key, idx); + int rc = read_index_entry(hctx, *idx, entry); + if (rc < 0) { + return rc; + } + + if (key.instance.empty() && + entry->flags & RGW_BUCKET_DIRENT_FLAG_VER_MARKER) { + /* we only do it where key.instance is empty. In this case the delete marker will have a + * separate entry in the index to avoid collisions with the actual object, as it's mutable + */ + if (special_delete_marker_name) { + encode_obj_versioned_data_key(key, idx, true); + rc = read_index_entry(hctx, *idx, entry); + if (rc == 0) { + return 0; + } + } + encode_obj_versioned_data_key(key, idx); + rc = read_index_entry(hctx, *idx, entry); + if (rc < 0) { + *entry = rgw_bucket_dir_entry(); /* need to reset entry because we initialized it earlier */ + return rc; + } + } + return 0; } @@ -531,30 +773,25 @@ int rgw_bucket_complete_op(cls_method_context_t hctx, bufferlist *in, bufferlist CLS_LOG(1, "ERROR: rgw_bucket_complete_op(): failed to decode request\n"); return -EINVAL; } - CLS_LOG(1, "rgw_bucket_complete_op(): request: op=%d name=%s ver=%lu:%llu tag=%s\n", - op.op, op.name.c_str(), + CLS_LOG(1, "rgw_bucket_complete_op(): request: op=%d name=%s instance=%s ver=%lu:%llu tag=%s\n", + op.op, op.key.name.c_str(), op.key.instance.c_str(), (unsigned long)op.ver.pool, (unsigned long long)op.ver.epoch, op.tag.c_str()); - bufferlist header_bl; struct rgw_bucket_dir_header header; - int rc = cls_cxx_map_read_header(hctx, &header_bl); - if (rc < 0) - return rc; - bufferlist::iterator header_iter = header_bl.begin(); - try { - ::decode(header, header_iter); - } catch (buffer::error& err) { - CLS_LOG(1, "ERROR: rgw_bucket_complete_op(): failed to decode header\n"); + int rc = read_bucket_header(hctx, &header); + if (rc < 0) { + CLS_LOG(1, "ERROR: rgw_bucket_complete_op(): failed to read header\n"); return -EINVAL; } struct rgw_bucket_dir_entry entry; bool ondisk = true; - rc = read_index_entry(hctx, op.name, &entry); + string idx; + rc = read_key_entry(hctx, op.key, &idx, &entry); if (rc == -ENOENT) { - entry.name = op.name; + entry.key = 
op.key; entry.ver = op.ver; entry.meta = op.meta; entry.locator = op.locator; @@ -564,6 +801,7 @@ int rgw_bucket_complete_op(cls_method_context_t hctx, bufferlist *in, bufferlist } entry.index_ver = header.ver; + entry.flags = 0; /* resetting entry flags, entry might have been previously a delete marker */ if (op.tag.size()) { map::iterator pinter = entry.pending_map.find(op.tag); @@ -589,8 +827,8 @@ int rgw_bucket_complete_op(cls_method_context_t hctx, bufferlist *in, bufferlist bufferlist op_bl; if (cancel) { if (op.log_op) { - rc = log_index_operation(hctx, op.name, op.op, op.tag, entry.meta.mtime, entry.ver, - CLS_RGW_STATE_COMPLETE, header.ver, header.max_marker); + rc = log_index_operation(hctx, op.key, op.op, op.tag, entry.meta.mtime, entry.ver, + CLS_RGW_STATE_COMPLETE, header.ver, header.max_marker, op.bilog_flags); if (rc < 0) return rc; } @@ -598,92 +836,965 @@ int rgw_bucket_complete_op(cls_method_context_t hctx, bufferlist *in, bufferlist if (op.tag.size()) { bufferlist new_key_bl; ::encode(entry, new_key_bl); - return cls_cxx_map_set_val(hctx, op.name, &new_key_bl); + return cls_cxx_map_set_val(hctx, idx, &new_key_bl); } else { return 0; } } - if (entry.exists) { - unaccount_entry(header, entry); + if (entry.exists) { + unaccount_entry(header, entry); + } + + entry.ver = op.ver; + switch ((int)op.op) { + case CLS_RGW_OP_DEL: + if (ondisk) { + if (!entry.pending_map.size()) { + int ret = cls_cxx_map_remove_key(hctx, idx); + if (ret < 0) + return ret; + } else { + entry.exists = false; + bufferlist new_key_bl; + ::encode(entry, new_key_bl); + int ret = cls_cxx_map_set_val(hctx, idx, &new_key_bl); + if (ret < 0) + return ret; + } + } else { + return -ENOENT; + } + break; + case CLS_RGW_OP_ADD: + { + struct rgw_bucket_dir_entry_meta& meta = op.meta; + struct rgw_bucket_category_stats& stats = header.stats[meta.category]; + entry.meta = meta; + entry.key = op.key; + entry.exists = true; + entry.tag = op.tag; + stats.num_entries++; + stats.total_size += meta.accounted_size; + stats.total_size_rounded += get_rounded_size(meta.accounted_size); + bufferlist new_key_bl; + ::encode(entry, new_key_bl); + int ret = cls_cxx_map_set_val(hctx, idx, &new_key_bl); + if (ret < 0) + return ret; + } + break; + } + + if (op.log_op) { + rc = log_index_operation(hctx, op.key, op.op, op.tag, entry.meta.mtime, entry.ver, + CLS_RGW_STATE_COMPLETE, header.ver, header.max_marker, op.bilog_flags); + if (rc < 0) + return rc; + } + + list::iterator remove_iter; + CLS_LOG(20, "rgw_bucket_complete_op(): remove_objs.size()=%d\n", (int)op.remove_objs.size()); + for (remove_iter = op.remove_objs.begin(); remove_iter != op.remove_objs.end(); ++remove_iter) { + cls_rgw_obj_key& remove_key = *remove_iter; + CLS_LOG(1, "rgw_bucket_complete_op(): removing entries, read_index_entry name=%s instance=%s\n", + remove_key.name.c_str(), remove_key.instance.c_str()); + struct rgw_bucket_dir_entry remove_entry; + string k; + int ret = read_key_entry(hctx, remove_key, &k, &remove_entry); + if (ret < 0) { + CLS_LOG(1, "rgw_bucket_complete_op(): removing entries, read_index_entry name=%s instance=%s ret=%d\n", + remove_key.name.c_str(), remove_key.instance.c_str(), ret); + continue; + } + CLS_LOG(0, "rgw_bucket_complete_op(): entry.name=%s entry.instance=%s entry.meta.category=%d\n", + remove_entry.key.name.c_str(), remove_entry.key.instance.c_str(), remove_entry.meta.category); + unaccount_entry(header, remove_entry); + + if (op.log_op) { + rc = log_index_operation(hctx, remove_key, CLS_RGW_OP_DEL, op.tag, 
remove_entry.meta.mtime, + remove_entry.ver, CLS_RGW_STATE_COMPLETE, header.ver, header.max_marker, op.bilog_flags); + if (rc < 0) + continue; + } + + ret = cls_cxx_map_remove_key(hctx, k); + if (ret < 0) { + CLS_LOG(1, "rgw_bucket_complete_op(): cls_cxx_map_remove_key, failed to remove entry, name=%s instance=%s read_index_entry ret=%d\n", remove_key.name.c_str(), remove_key.instance.c_str(), rc); + continue; + } + } + + return write_bucket_header(hctx, &header); +} + +template +static int write_entry(cls_method_context_t hctx, T& entry, const string& key) +{ + bufferlist bl; + ::encode(entry, bl); + return cls_cxx_map_set_val(hctx, key, &bl); +} + +static int read_olh(cls_method_context_t hctx,cls_rgw_obj_key& obj_key, struct rgw_bucket_olh_entry *olh_data_entry, string *index_key, bool *found) +{ + cls_rgw_obj_key olh_key; + olh_key.name = obj_key.name; + + encode_olh_data_key(olh_key, index_key); + int ret = read_index_entry(hctx, *index_key, olh_data_entry); + if (ret < 0 && ret != -ENOENT) { + CLS_LOG(0, "ERROR: read_index_entry() olh_key=%s ret=%d", olh_key.name.c_str(), ret); + return ret; + } + if (found) { + *found = (ret != -ENOENT); + } + return 0; +} + +static void update_olh_log(struct rgw_bucket_olh_entry& olh_data_entry, OLHLogOp op, const string& op_tag, + cls_rgw_obj_key& key, bool delete_marker, uint64_t epoch) +{ + vector& log = olh_data_entry.pending_log[olh_data_entry.epoch]; + rgw_bucket_olh_log_entry log_entry; + log_entry.epoch = epoch; + log_entry.op = op; + log_entry.op_tag = op_tag; + log_entry.key = key; + log_entry.delete_marker = delete_marker; + log.push_back(log_entry); +} + +static string escape_str(const string& s) +{ + int len = escape_json_attr_len(s.c_str(), s.size()); + char escaped[len]; + escape_json_attr(s.c_str(), s.size(), escaped); + return string(escaped); +} + +static int write_obj_instance_entry(cls_method_context_t hctx, struct rgw_bucket_dir_entry& instance_entry, const string& instance_idx) +{ + CLS_LOG(20, "write_entry() instance=%s idx=%s flags=%d", escape_str(instance_entry.key.instance).c_str(), instance_idx.c_str(), instance_entry.flags); + /* write the instance entry */ + int ret = write_entry(hctx, instance_entry, instance_idx); + if (ret < 0) { + CLS_LOG(0, "ERROR: write_entry() instance_key=%s ret=%d", escape_str(instance_idx).c_str(), ret); + return ret; + } + return 0; +} + +/* + * write object instance entry, and if needed also the list entry + */ +static int write_obj_entries(cls_method_context_t hctx, struct rgw_bucket_dir_entry& instance_entry, const string& instance_idx) +{ + int ret = write_obj_instance_entry(hctx, instance_entry, instance_idx); + if (ret < 0) { + return ret; + } + string instance_list_idx; + get_list_index_key(instance_entry, &instance_list_idx); + + if (instance_idx != instance_list_idx) { + CLS_LOG(20, "write_entry() idx=%s flags=%d", escape_str(instance_list_idx).c_str(), instance_entry.flags); + /* write a new list entry for the object instance */ + ret = write_entry(hctx, instance_entry, instance_list_idx); + if (ret < 0) { + CLS_LOG(0, "ERROR: write_entry() instance=%s instance_list_idx=%s ret=%d", instance_entry.key.instance.c_str(), instance_list_idx.c_str(), ret); + return ret; + } + } + return 0; +} + + +class BIVerObjEntry { + cls_method_context_t hctx; + cls_rgw_obj_key key; + string instance_idx; + + struct rgw_bucket_dir_entry instance_entry; + + bool initialized; + +public: + BIVerObjEntry(cls_method_context_t& _hctx, const cls_rgw_obj_key& _key) : hctx(_hctx), key(_key), initialized(false) 
{ + } + + int init(bool check_delete_marker = true) { + int ret = read_key_entry(hctx, key, &instance_idx, &instance_entry, + check_delete_marker && key.instance.empty()); /* this is potentially a delete marker, for null objects we + keep separate instance entry for the delete markers */ + + if (ret < 0) { + CLS_LOG(0, "ERROR: read_key_entry() idx=%s ret=%d", instance_idx.c_str(), ret); + return ret; + } + initialized = true; + CLS_LOG(20, "read instance_entry key.name=%s key.instance=%s flags=%d", instance_entry.key.name.c_str(), instance_entry.key.instance.c_str(), instance_entry.flags); + return 0; + } + + rgw_bucket_dir_entry& get_dir_entry() { + return instance_entry; + } + + void init_as_delete_marker(rgw_bucket_dir_entry_meta& meta) { + /* a deletion marker, need to initialize it, there's no instance entry for it yet */ + instance_entry.key = key; + instance_entry.flags = RGW_BUCKET_DIRENT_FLAG_DELETE_MARKER; + instance_entry.meta = meta; + instance_entry.tag = "delete-marker"; + + initialized = true; + } + + + int unlink_list_entry() { + string list_idx; + /* this instance has a previous list entry, remove that entry */ + get_list_index_key(instance_entry, &list_idx); + CLS_LOG(20, "unlink_list_entry() list_idx=%s", escape_str(list_idx).c_str()); + int ret = cls_cxx_map_remove_key(hctx, list_idx); + if (ret < 0) { + CLS_LOG(0, "ERROR: cls_cxx_map_remove_key() list_idx=%s ret=%d", list_idx.c_str(), ret); + return ret; + } + return 0; + } + + int unlink() { + /* remove the instance entry */ + CLS_LOG(20, "unlink() idx=%s", escape_str(instance_idx).c_str()); + int ret = cls_cxx_map_remove_key(hctx, instance_idx); + if (ret < 0) { + CLS_LOG(0, "ERROR: cls_cxx_map_remove_key() instance_idx=%s ret=%d", instance_idx.c_str(), ret); + return ret; + } + return 0; + } + + int write_entries(uint64_t flags_set, uint64_t flags_reset) { + if (!initialized) { + int ret = init(); + if (ret < 0) { + return ret; + } + } + instance_entry.flags &= ~flags_reset; + instance_entry.flags |= flags_set; + + /* write the instance and list entries */ + bool special_delete_marker_key = (instance_entry.is_delete_marker() && instance_entry.key.instance.empty()); + encode_obj_versioned_data_key(key, &instance_idx, special_delete_marker_key); + int ret = write_obj_entries(hctx, instance_entry, instance_idx); + if (ret < 0) { + CLS_LOG(0, "ERROR: write_obj_entries() instance_idx=%s ret=%d", instance_idx.c_str(), ret); + return ret; + } + + return 0; + } + + int write(uint64_t epoch, bool current) { + if (instance_entry.versioned_epoch > 0) { + CLS_LOG(20, "%s(): instance_entry.versioned_epoch=%d epoch=%d", __func__, (int)instance_entry.versioned_epoch, (int)epoch); + /* this instance has a previous list entry, remove that entry */ + int ret = unlink_list_entry(); + if (ret < 0) { + return ret; + } + } + + uint64_t flags = RGW_BUCKET_DIRENT_FLAG_VER; + if (current) { + flags |= RGW_BUCKET_DIRENT_FLAG_CURRENT; + } + + instance_entry.versioned_epoch = epoch; + return write_entries(flags, 0); + } + + int demote_current() { + return write_entries(0, RGW_BUCKET_DIRENT_FLAG_CURRENT); + } + + bool is_delete_marker() { + return instance_entry.is_delete_marker(); + } + + int find_next_key(cls_rgw_obj_key *next_key, bool *found) { + string list_idx; + /* this instance has a previous list entry, remove that entry */ + get_list_index_key(instance_entry, &list_idx); + /* this is the current head, need to update! 
*/ + map<string, bufferlist> keys; + string filter = key.name; /* list key starts with key name, filter it to avoid a case where we cross to + a different namespace */ + int ret = cls_cxx_map_get_vals(hctx, list_idx, filter, 1, &keys); + if (ret < 0) { + return ret; + } + + if (keys.size() < 1) { + *found = false; + return 0; + } + + rgw_bucket_dir_entry next_entry; + + map<string, bufferlist>::reverse_iterator last = keys.rbegin(); + try { + bufferlist::iterator iter = last->second.begin(); + ::decode(next_entry, iter); + } catch (buffer::error& err) { + CLS_LOG(0, "ERROR: failed to decode entry: %s", last->first.c_str()); + return -EIO; + } + + *found = (key.name == next_entry.key.name); + if (*found) { + *next_key = next_entry.key; + } + + return 0; + } + +}; + + +class BIOLHEntry { + cls_method_context_t hctx; + cls_rgw_obj_key key; + + string olh_data_idx; + struct rgw_bucket_olh_entry olh_data_entry; + + bool initialized; +public: + BIOLHEntry(cls_method_context_t& _hctx, const cls_rgw_obj_key& _key) : hctx(_hctx), key(_key), initialized(false) { } + + int init(bool *exists) { + /* read olh */ + int ret = read_olh(hctx, key, &olh_data_entry, &olh_data_idx, exists); + if (ret < 0) { + return ret; + } + + initialized = true; + return 0; + } + + bool apply_epoch(uint64_t candidate_epoch) { + if (candidate_epoch < olh_data_entry.epoch) { + return false; + } + + olh_data_entry.epoch = candidate_epoch; + return true; + } + + bool start_modify(uint64_t candidate_epoch) { + if (candidate_epoch) { + if (candidate_epoch < olh_data_entry.epoch) { + return false; /* olh cannot be modified, old epoch */ + } + olh_data_entry.epoch = candidate_epoch; + } else { + if (olh_data_entry.epoch == 0) { + olh_data_entry.epoch = 2; /* versioned epoch should start with 2, 1 is reserved to converted plain entries */ + } else { + olh_data_entry.epoch++; + } + } + return true; + } + + uint64_t get_epoch() { + return olh_data_entry.epoch; + } + + rgw_bucket_olh_entry& get_entry() { + return olh_data_entry; + } + + void update(cls_rgw_obj_key& key, bool delete_marker) { + olh_data_entry.delete_marker = delete_marker; + olh_data_entry.key = key; + } + + int write() { + /* write the olh data entry */ + int ret = write_entry(hctx, olh_data_entry, olh_data_idx); + if (ret < 0) { + CLS_LOG(0, "ERROR: write_entry() olh_key=%s ret=%d", olh_data_idx.c_str(), ret); + return ret; + } + + return 0; + } + + void update_log(OLHLogOp op, const string& op_tag, cls_rgw_obj_key& key, bool delete_marker, uint64_t epoch = 0) { + if (epoch == 0) { + epoch = olh_data_entry.epoch; + } + update_olh_log(olh_data_entry, op, op_tag, key, delete_marker, epoch); + } + + bool exists() { return olh_data_entry.exists; } + + void set_exists(bool exists) { + olh_data_entry.exists = exists; + } + + bool pending_removal() { return olh_data_entry.pending_removal; } + + void set_pending_removal(bool pending_removal) { + olh_data_entry.pending_removal = pending_removal; + } + + const string& get_tag() { return olh_data_entry.tag; } + void set_tag(const string& tag) { + olh_data_entry.tag = tag; + } +}; + +static int write_version_marker(cls_method_context_t hctx, cls_rgw_obj_key& key) +{ + struct rgw_bucket_dir_entry entry; + entry.key = key; + entry.flags = RGW_BUCKET_DIRENT_FLAG_VER_MARKER; + int ret = write_entry(hctx, entry, key.name); + if (ret < 0) { + CLS_LOG(0, "ERROR: write_entry returned ret=%d", ret); + return ret; + } + return 0; +} + +/* + * plain entries are the ones that were created when the bucket was not versioned; + * if we overwrite these objects, we need to convert them to
versioned entries -- ones that have + both a data entry and a listing key. Their version is going to be empty, though. + */ +static int convert_plain_entry_to_versioned(cls_method_context_t hctx, cls_rgw_obj_key& key, bool demote_current, bool instance_only) +{ + if (!key.instance.empty()) { + return -EINVAL; + } + + struct rgw_bucket_dir_entry entry; + + string orig_idx; + int ret = read_key_entry(hctx, key, &orig_idx, &entry); + if (ret != -ENOENT) { + if (ret < 0) { + CLS_LOG(0, "ERROR: read_key_entry() returned ret=%d", ret); + return ret; + } + + entry.versioned_epoch = 1; /* converted entries are always 1 */ + entry.flags |= RGW_BUCKET_DIRENT_FLAG_VER; + + if (demote_current) { + entry.flags &= ~RGW_BUCKET_DIRENT_FLAG_CURRENT; + } + + string new_idx; + encode_obj_versioned_data_key(key, &new_idx); + + if (instance_only) { + ret = write_obj_instance_entry(hctx, entry, new_idx); + } else { + ret = write_obj_entries(hctx, entry, new_idx); + } + if (ret < 0) { + CLS_LOG(0, "ERROR: write_obj_entries new_idx=%s returned %d", new_idx.c_str(), ret); + return ret; + } + } + + ret = write_version_marker(hctx, key); + if (ret < 0) { + return ret; + } + + return 0; +} + +/* + * link an object version to an olh, update the relevant index entries. It will also handle the + * deletion marker case. We have a few entries that we need to take care of. For object 'foo', + * instance BAR, we'd update the following (not actual encoding): + * - olh data: [BI_BUCKET_OLH_DATA_INDEX]foo + * - object instance data: [BI_BUCKET_OBJ_INSTANCE_INDEX]foo,BAR + * - object instance list entry: foo,123,BAR + * + * The instance list entry needs to be ordered from newer to older, so we generate an appropriate + * number string that follows the name. + * The top instance for each object is marked appropriately. + * We generate an instance entry for deletion markers here, as they are not created beforehand. + */ +static int rgw_bucket_link_olh(cls_method_context_t hctx, bufferlist *in, bufferlist *out) +{ + string olh_data_idx; + string instance_idx; + + // decode request + rgw_cls_link_olh_op op; + bufferlist::iterator iter = in->begin(); + try { + ::decode(op, iter); + } catch (buffer::error& err) { + CLS_LOG(0, "ERROR: rgw_bucket_link_olh_op(): failed to decode request\n"); + return -EINVAL; + } + + BIVerObjEntry obj(hctx, op.key); + BIOLHEntry olh(hctx, op.key); + + /* read instance entry */ + int ret = obj.init(op.delete_marker); + bool existed = (ret == 0); + if (ret == -ENOENT && op.delete_marker) { + ret = 0; + } + if (ret < 0) { + return ret; + } + + bool removing; + + /* + * Special handling for null instance object / delete-marker. For these objects we're going to + * have separate instances for a data object vs. delete-marker to avoid collisions. We now check + * if we are overwriting a previous entry, and in that case we'll remove its list entry.
+ */ + if (op.key.instance.empty()) { + BIVerObjEntry other_obj(hctx, op.key); + ret = other_obj.init(!op.delete_marker); /* try reading the other null versioned entry */ + existed = (ret >= 0 && !other_obj.is_delete_marker()); + if (ret >= 0 && other_obj.is_delete_marker() != op.delete_marker) { + ret = other_obj.unlink_list_entry(); + if (ret < 0) { + return ret; + } + ret = other_obj.unlink(); + if (ret < 0) { + return ret; + } + } + + removing = existed && op.delete_marker; + } else { + removing = (existed && !obj.is_delete_marker() && op.delete_marker); + } + + if (op.delete_marker) { + /* a deletion marker, need to initialize entry as such */ + obj.init_as_delete_marker(op.meta); + } + + /* read olh */ + bool olh_found; + ret = olh.init(&olh_found); + if (ret < 0) { + return ret; + } + + if (!olh.start_modify(op.olh_epoch)) { + ret = obj.write(op.olh_epoch, false); + if (ret < 0) { + return ret; + } + if (removing) { + olh.update_log(CLS_RGW_OLH_OP_REMOVE_INSTANCE, op.op_tag, op.key, false, op.olh_epoch); + } + return 0; + } + + if (olh_found) { + const string& olh_tag = olh.get_tag(); + if (op.olh_tag != olh_tag) { + if (!olh.pending_removal()) { + CLS_LOG(5, "NOTICE: op.olh_tag (%s) != olh.tag (%s)", op.olh_tag.c_str(), olh_tag.c_str()); + return -ECANCELED; + } + /* if pending removal, this is a new olh instance */ + olh.set_tag(op.olh_tag); + } + if (olh.exists()) { + rgw_bucket_olh_entry& olh_entry = olh.get_entry(); + /* found olh, previous instance is no longer the latest, need to update */ + if (!(olh_entry.key == op.key)) { + BIVerObjEntry old_obj(hctx, olh_entry.key); + + ret = old_obj.demote_current(); + if (ret < 0) { + CLS_LOG(0, "ERROR: could not demote current on previous key ret=%d", ret); + return ret; + } + } + } + olh.set_pending_removal(false); + } else { + bool instance_only = (op.key.instance.empty() && op.delete_marker); + cls_rgw_obj_key key(op.key.name); + ret = convert_plain_entry_to_versioned(hctx, key, true, instance_only); + if (ret < 0) { + CLS_LOG(0, "ERROR: convert_plain_entry_to_versioned ret=%d", ret); + return ret; + } + olh.set_tag(op.olh_tag); + } + + /* update the olh log */ + olh.update_log(CLS_RGW_OLH_OP_LINK_OLH, op.op_tag, op.key, op.delete_marker); + if (removing) { + olh.update_log(CLS_RGW_OLH_OP_REMOVE_INSTANCE, op.op_tag, op.key, false); + } + + olh.update(op.key, op.delete_marker); + + olh.set_exists(true); + + ret = olh.write(); + if (ret < 0) { + CLS_LOG(0, "ERROR: failed to update olh ret=%d", ret); + return ret; + } + + /* write the instance and list entries */ + ret = obj.write(olh.get_epoch(), true); + if (ret < 0) { + return ret; + } + + struct rgw_bucket_dir_header header; + ret = read_bucket_header(hctx, &header); + if (ret < 0) { + CLS_LOG(1, "ERROR: rgw_bucket_unlink_instance(): failed to read header\n"); + return ret; + } + + if (op.log_op) { + rgw_bucket_dir_entry& entry = obj.get_dir_entry(); + + rgw_bucket_entry_ver ver; + ver.epoch = (op.olh_epoch ? op.olh_epoch : olh.get_epoch()); + + RGWModifyOp operation = (op.delete_marker ? 
CLS_RGW_OP_LINK_OLH_DM : CLS_RGW_OP_LINK_OLH); + ret = log_index_operation(hctx, op.key, operation, op.op_tag, + entry.meta.mtime, ver, + CLS_RGW_STATE_COMPLETE, header.ver, header.max_marker, op.bilog_flags | RGW_BILOG_FLAG_VERSIONED_OP); + if (ret < 0) + return ret; + } + + return write_bucket_header(hctx, &header); /* updates header version */ +} + +static int rgw_bucket_unlink_instance(cls_method_context_t hctx, bufferlist *in, bufferlist *out) +{ + string olh_data_idx; + string instance_idx; + + // decode request + rgw_cls_unlink_instance_op op; + bufferlist::iterator iter = in->begin(); + try { + ::decode(op, iter); + } catch (buffer::error& err) { + CLS_LOG(0, "ERROR: rgw_bucket_rm_obj_instance_op(): failed to decode request\n"); + return -EINVAL; + } + + cls_rgw_obj_key dest_key = op.key; + if (dest_key.instance == "null") { + dest_key.instance.clear(); + } + + BIVerObjEntry obj(hctx, dest_key); + BIOLHEntry olh(hctx, dest_key); + + int ret = obj.init(); + if (ret == -ENOENT) { + return 0; /* already removed */ + } + if (ret < 0) { + CLS_LOG(0, "ERROR: obj.init() returned ret=%d", ret); + return ret; + } + + ret = olh.init(NULL); + if (ret < 0) { + CLS_LOG(0, "ERROR: olh.init() returned ret=%d", ret); + return ret; + } + + if (!olh.start_modify(op.olh_epoch)) { + ret = obj.unlink_list_entry(); + if (ret < 0) { + return ret; + } + + if (!obj.is_delete_marker()) { + olh.update_log(CLS_RGW_OLH_OP_REMOVE_INSTANCE, op.op_tag, op.key, false, op.olh_epoch); + } + + return 0; + } + + rgw_bucket_olh_entry& olh_entry = olh.get_entry(); + cls_rgw_obj_key& olh_key = olh_entry.key; + CLS_LOG(20, "%s(): updating olh log: existing olh entry: %s[%s] (delete_marker=%d)", __func__, + olh_key.name.c_str(), olh_key.instance.c_str(), olh_entry.delete_marker); + + if (olh_key == dest_key) { + /* this is the current head, need to update! */ + cls_rgw_obj_key next_key; + bool found; + ret = obj.find_next_key(&next_key, &found); + if (ret < 0) { + CLS_LOG(0, "ERROR: obj.find_next_key() returned ret=%d", ret); + return ret; + } + + if (found) { + BIVerObjEntry next(hctx, next_key); + ret = next.write(olh.get_epoch(), true); + if (ret < 0) { + CLS_LOG(0, "ERROR: next.write() returned ret=%d", ret); + return ret; + } + + CLS_LOG(20, "%s(): updating olh log: link olh -> %s[%s] (is_delete=%d)", __func__, + next_key.name.c_str(), next_key.instance.c_str(), (int)next.is_delete_marker()); + + olh.update(next_key, next.is_delete_marker()); + olh.update_log(CLS_RGW_OLH_OP_LINK_OLH, op.op_tag, next_key, next.is_delete_marker()); + } else { + /* next_key is empty */ + olh.update(next_key, false); + olh.update_log(CLS_RGW_OLH_OP_UNLINK_OLH, op.op_tag, next_key, false); + olh.set_exists(false); + olh.set_pending_removal(true); + } + } + + if (!obj.is_delete_marker()) { + olh.update_log(CLS_RGW_OLH_OP_REMOVE_INSTANCE, op.op_tag, op.key, false); + } else { + /* this is a delete marker, it's our responsibility to remove its instance entry */ + ret = obj.unlink(); + if (ret < 0) { + return ret; + } + } + + ret = obj.unlink_list_entry(); + if (ret < 0) { + return ret; + } + + ret = olh.write(); + if (ret < 0) { + return ret; + } + + struct rgw_bucket_dir_header header; + ret = read_bucket_header(hctx, &header); + if (ret < 0) { + CLS_LOG(1, "ERROR: rgw_bucket_unlink_instance(): failed to read header\n"); + return ret; + } + + if (op.log_op) { + rgw_bucket_entry_ver ver; + ver.epoch = (op.olh_epoch ? 
op.olh_epoch : olh.get_epoch()); + + utime_t mtime = ceph_clock_now(g_ceph_context); /* mtime has no real meaning in instance removal context */ + ret = log_index_operation(hctx, op.key, CLS_RGW_OP_UNLINK_INSTANCE, op.op_tag, + mtime, ver, + CLS_RGW_STATE_COMPLETE, header.ver, header.max_marker, + op.bilog_flags | RGW_BILOG_FLAG_VERSIONED_OP); + if (ret < 0) + return ret; + } + + return write_bucket_header(hctx, &header); /* updates header version */ +} + +static int rgw_bucket_read_olh_log(cls_method_context_t hctx, bufferlist *in, bufferlist *out) +{ + // decode request + rgw_cls_read_olh_log_op op; + bufferlist::iterator iter = in->begin(); + try { + ::decode(op, iter); + } catch (buffer::error& err) { + CLS_LOG(0, "ERROR: rgw_bucket_read_olh_log(): failed to decode request\n"); + return -EINVAL; + } + + if (!op.olh.instance.empty()) { + CLS_LOG(1, "bad key passed in (non empty instance)"); + return -EINVAL; + } + + struct rgw_bucket_olh_entry olh_data_entry; + string olh_data_key; + encode_olh_data_key(op.olh, &olh_data_key); + int ret = read_index_entry(hctx, olh_data_key, &olh_data_entry); + if (ret < 0 && ret != -ENOENT) { + CLS_LOG(0, "ERROR: read_index_entry() olh_key=%s ret=%d", olh_data_key.c_str(), ret); + return ret; + } + + if (olh_data_entry.tag != op.olh_tag) { + CLS_LOG(1, "NOTICE: %s(): olh_tag_mismatch olh_data_entry.tag=%s op.olh_tag=%s", __func__, olh_data_entry.tag.c_str(), op.olh_tag.c_str()); + return -ECANCELED; + } + + rgw_cls_read_olh_log_ret op_ret; + +#define MAX_OLH_LOG_ENTRIES 1000 + map >& log = olh_data_entry.pending_log; + + if (log.begin()->first > op.ver_marker && log.size() <= MAX_OLH_LOG_ENTRIES) { + op_ret.log = log; + op_ret.is_truncated = false; + } else { + map >::iterator iter = log.upper_bound(op.ver_marker); + + for (int i = 0; i < MAX_OLH_LOG_ENTRIES && iter != log.end(); ++i, ++iter) { + op_ret.log[iter->first] = iter->second; + } + op_ret.is_truncated = (iter != log.end()); + } + + ::encode(op_ret, *out); + + return 0; +} + +static int rgw_bucket_trim_olh_log(cls_method_context_t hctx, bufferlist *in, bufferlist *out) +{ + // decode request + rgw_cls_trim_olh_log_op op; + bufferlist::iterator iter = in->begin(); + try { + ::decode(op, iter); + } catch (buffer::error& err) { + CLS_LOG(0, "ERROR: rgw_bucket_trim_olh_log(): failed to decode request\n"); + return -EINVAL; + } + + if (!op.olh.instance.empty()) { + CLS_LOG(1, "bad key passed in (non empty instance)"); + return -EINVAL; + } + + /* read olh entry */ + struct rgw_bucket_olh_entry olh_data_entry; + string olh_data_key; + encode_olh_data_key(op.olh, &olh_data_key); + int ret = read_index_entry(hctx, olh_data_key, &olh_data_entry); + if (ret < 0 && ret != -ENOENT) { + CLS_LOG(0, "ERROR: read_index_entry() olh_key=%s ret=%d", olh_data_key.c_str(), ret); + return ret; + } + + if (olh_data_entry.tag != op.olh_tag) { + CLS_LOG(1, "NOTICE: %s(): olh_tag_mismatch olh_data_entry.tag=%s op.olh_tag=%s", __func__, olh_data_entry.tag.c_str(), op.olh_tag.c_str()); + return -ECANCELED; + } + + /* remove all versions up to and including ver from the pending map */ + map >& log = olh_data_entry.pending_log; + map >::iterator liter = log.begin(); + while (liter != log.end() && liter->first <= op.ver) { + map >::iterator rm_iter = liter; + ++liter; + log.erase(rm_iter); + } + + /* write the olh data entry */ + ret = write_entry(hctx, olh_data_entry, olh_data_key); + if (ret < 0) { + CLS_LOG(0, "ERROR: write_entry() olh_key=%s ret=%d", olh_data_key.c_str(), ret); + return ret; + } + + return 0; +} + 
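(Aside: a hedged sketch of the read/trim contract the two handlers above implement. The types here are simplified stand-ins -- the real log value type is map<uint64_t, vector<rgw_bucket_olh_log_entry> > and callers reach these handlers through the cls_rgw client wrappers, not these functions directly. A caller pages through the pending log above its version marker until is_truncated comes back false, then trims the epochs it has applied.)

#include <cstdint>
#include <map>
#include <string>
#include <vector>

struct LogEntry { std::string op_tag; };  // simplified stand-in for rgw_bucket_olh_log_entry
typedef std::map<uint64_t, std::vector<LogEntry> > OlhLog;

// Emulates rgw_bucket_read_olh_log(): copy out up to max epochs above the
// marker and report whether more remain.
static void read_log(const OlhLog& log, uint64_t ver_marker, int max,
                     OlhLog *out, bool *is_truncated)
{
  OlhLog::const_iterator iter = log.upper_bound(ver_marker);
  for (int i = 0; i < max && iter != log.end(); ++i, ++iter) {
    (*out)[iter->first] = iter->second;
  }
  *is_truncated = (iter != log.end());
}

// Emulates rgw_bucket_trim_olh_log(): drop every epoch <= ver.
static void trim_log(OlhLog *log, uint64_t ver)
{
  OlhLog::iterator iter = log->begin();
  while (iter != log->end() && iter->first <= ver) {
    log->erase(iter++);
  }
}

int main()
{
  OlhLog log;
  log[2].push_back(LogEntry());
  log[3].push_back(LogEntry());
  log[4].push_back(LogEntry());

  OlhLog page;
  bool truncated;
  read_log(log, 0, 2, &page, &truncated);  // page holds epochs 2 and 3; truncated == true
  trim_log(&log, page.rbegin()->first);    // after applying the page, only epoch 4 remains
  return 0;
}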
+static int rgw_bucket_clear_olh(cls_method_context_t hctx, bufferlist *in, bufferlist *out) +{ + // decode request + rgw_cls_bucket_clear_olh_op op; + bufferlist::iterator iter = in->begin(); + try { + ::decode(op, iter); + } catch (buffer::error& err) { + CLS_LOG(0, "ERROR: rgw_bucket_clear_olh(): failed to decode request\n"); + return -EINVAL; + } + + if (!op.key.instance.empty()) { + CLS_LOG(1, "bad key passed in (non empty instance)"); + return -EINVAL; + } + + /* read olh entry */ + struct rgw_bucket_olh_entry olh_data_entry; + string olh_data_key; + encode_olh_data_key(op.key, &olh_data_key); + int ret = read_index_entry(hctx, olh_data_key, &olh_data_entry); + if (ret < 0 && ret != -ENOENT) { + CLS_LOG(0, "ERROR: read_index_entry() olh_key=%s ret=%d", olh_data_key.c_str(), ret); + return ret; + } + + if (olh_data_entry.tag != op.olh_tag) { + CLS_LOG(1, "NOTICE: %s(): olh_tag_mismatch olh_data_entry.tag=%s op.olh_tag=%s", __func__, olh_data_entry.tag.c_str(), op.olh_tag.c_str()); + return -ECANCELED; } - entry.ver = op.ver; - switch ((int)op.op) { - case CLS_RGW_OP_DEL: - if (ondisk) { - if (!entry.pending_map.size()) { - int ret = cls_cxx_map_remove_key(hctx, op.name); - if (ret < 0) - return ret; - } else { - entry.exists = false; - bufferlist new_key_bl; - ::encode(entry, new_key_bl); - int ret = cls_cxx_map_set_val(hctx, op.name, &new_key_bl); - if (ret < 0) - return ret; - } - } else { - return -ENOENT; - } - break; - case CLS_RGW_OP_ADD: - { - struct rgw_bucket_dir_entry_meta& meta = op.meta; - struct rgw_bucket_category_stats& stats = header.stats[meta.category]; - entry.meta = meta; - entry.name = op.name; - entry.exists = true; - entry.tag = op.tag; - stats.num_entries++; - stats.total_size += meta.size; - stats.total_size_rounded += get_rounded_size(meta.size); - bufferlist new_key_bl; - ::encode(entry, new_key_bl); - int ret = cls_cxx_map_set_val(hctx, op.name, &new_key_bl); - if (ret < 0) - return ret; - } - break; + ret = cls_cxx_map_remove_key(hctx, olh_data_key); + if (ret < 0) { + CLS_LOG(1, "NOTICE: %s(): can't remove key %s ret=%d", __func__, olh_data_key.c_str(), ret); + return ret; } - if (op.log_op) { - rc = log_index_operation(hctx, op.name, op.op, op.tag, entry.meta.mtime, entry.ver, - CLS_RGW_STATE_COMPLETE, header.ver, header.max_marker); - if (rc < 0) - return rc; - } + rgw_bucket_dir_entry plain_entry; - list::iterator remove_iter; - CLS_LOG(20, "rgw_bucket_complete_op(): remove_objs.size()=%d\n", (int)op.remove_objs.size()); - for (remove_iter = op.remove_objs.begin(); remove_iter != op.remove_objs.end(); ++remove_iter) { - string& remove_oid_name = *remove_iter; - CLS_LOG(1, "rgw_bucket_complete_op(): removing entries, read_index_entry name=%s\n", remove_oid_name.c_str()); - struct rgw_bucket_dir_entry remove_entry; - int ret = read_index_entry(hctx, remove_oid_name, &remove_entry); - if (ret < 0) { - CLS_LOG(1, "rgw_bucket_complete_op(): removing entries, read_index_entry name=%s ret=%d\n", remove_oid_name.c_str(), ret); - continue; - } - CLS_LOG(0, "rgw_bucket_complete_op(): entry.name=%s entry.meta.category=%d\n", remove_entry.name.c_str(), remove_entry.meta.category); - unaccount_entry(header, remove_entry); + /* read plain entry, make sure it's a versioned place holder */ + ret = read_index_entry(hctx, op.key.name, &plain_entry); + if (ret == -ENOENT) { + /* we're done, no entry existing */ + return 0; + } + if (ret < 0) { + CLS_LOG(0, "ERROR: read_index_entry key=%s ret=%d", op.key.name.c_str(), ret); + return ret; + } - if (op.log_op) { - rc = 
log_index_operation(hctx, remove_oid_name, CLS_RGW_OP_DEL, op.tag, remove_entry.meta.mtime, - remove_entry.ver, CLS_RGW_STATE_COMPLETE, header.ver, header.max_marker); - if (rc < 0) - continue; - } + if ((plain_entry.flags & RGW_BUCKET_DIRENT_FLAG_VER_MARKER) == 0) { + /* it's not a version marker, don't remove it */ + return 0; + } - ret = cls_cxx_map_remove_key(hctx, remove_oid_name); - if (ret < 0) { - CLS_LOG(1, "rgw_bucket_complete_op(): cls_cxx_map_remove_key, failed to remove entry, name=%s read_index_entry ret=%d\n", remove_oid_name.c_str(), rc); - continue; - } + ret = cls_cxx_map_remove_key(hctx, op.key.name); + if (ret < 0) { + CLS_LOG(1, "NOTICE: %s(): can't remove key %s ret=%d", __func__, op.key.name.c_str(), ret); + return ret; } - return write_bucket_header(hctx, &header); + return 0; } int rgw_dir_suggest_changes(cls_method_context_t hctx, bufferlist *in, bufferlist *out) @@ -693,18 +1804,12 @@ int rgw_dir_suggest_changes(cls_method_context_t hctx, bufferlist *in, bufferlis bufferlist header_bl; struct rgw_bucket_dir_header header; bool header_changed = false; - int rc = cls_cxx_map_read_header(hctx, &header_bl); - if (rc < 0) - return rc; - uint64_t tag_timeout; - try { - bufferlist::iterator header_iter = header_bl.begin(); - ::decode(header, header_iter); - } catch (buffer::error& error) { - CLS_LOG(1, "ERROR: rgw_dir_suggest_changes(): failed to decode header\n"); - return -EINVAL; + int rc = read_bucket_header(hctx, &header); + if (rc < 0) { + CLS_LOG(1, "ERROR: rgw_dir_suggest_changes(): failed to read header\n"); + return rc; } tag_timeout = (header.tag_timeout ? header.tag_timeout : CEPH_RGW_TAG_TIMEOUT); @@ -724,7 +1829,9 @@ int rgw_dir_suggest_changes(cls_method_context_t hctx, bufferlist *in, bufferlis } bufferlist cur_disk_bl; - int ret = cls_cxx_map_get_val(hctx, cur_change.name, &cur_disk_bl); + string cur_change_key; + encode_obj_index_key(cur_change.key, &cur_change_key); + int ret = cls_cxx_map_get_val(hctx, cur_change_key, &cur_disk_bl); if (ret < 0 && ret != -ENOENT) return -EINVAL; @@ -757,29 +1864,30 @@ int rgw_dir_suggest_changes(cls_method_context_t hctx, bufferlist *in, bufferlis struct rgw_bucket_category_stats& old_stats = header.stats[cur_disk.meta.category]; CLS_LOG(10, "total_entries: %" PRId64 " -> %" PRId64 "\n", old_stats.num_entries, old_stats.num_entries - 1); old_stats.num_entries--; - old_stats.total_size -= cur_disk.meta.size; - old_stats.total_size_rounded -= get_rounded_size(cur_disk.meta.size); + old_stats.total_size -= cur_disk.meta.accounted_size; + old_stats.total_size_rounded -= get_rounded_size(cur_disk.meta.accounted_size); header_changed = true; } struct rgw_bucket_category_stats& stats = header.stats[cur_change.meta.category]; switch(op) { case CEPH_RGW_REMOVE: - CLS_LOG(10, "CEPH_RGW_REMOVE name=%s\n", cur_change.name.c_str()); - ret = cls_cxx_map_remove_key(hctx, cur_change.name); + CLS_LOG(10, "CEPH_RGW_REMOVE name=%s instance=%s\n", cur_change.key.name.c_str(), cur_change.key.instance.c_str()); + ret = cls_cxx_map_remove_key(hctx, cur_change_key); if (ret < 0) return ret; break; case CEPH_RGW_UPDATE: - CLS_LOG(10, "CEPH_RGW_UPDATE name=%s total_entries: %" PRId64 " -> %" PRId64 "\n", cur_change.name.c_str(), stats.num_entries, stats.num_entries + 1); + CLS_LOG(10, "CEPH_RGW_UPDATE name=%s instance=%s total_entries: %" PRId64 " -> %" PRId64 "\n", + cur_change.key.name.c_str(), cur_change.key.instance.c_str(), stats.num_entries, stats.num_entries + 1); stats.num_entries++; - stats.total_size += cur_change.meta.size; - 
stats.total_size_rounded += get_rounded_size(cur_change.meta.size); + stats.total_size += cur_change.meta.accounted_size; + stats.total_size_rounded += get_rounded_size(cur_change.meta.accounted_size); header_changed = true; cur_change.index_ver = header.ver; bufferlist cur_state_bl; ::encode(cur_change, cur_state_bl); - ret = cls_cxx_map_set_val(hctx, cur_change.name, &cur_state_bl); + ret = cls_cxx_map_set_val(hctx, cur_change_key, &cur_state_bl); if (ret < 0) return ret; break; @@ -793,6 +1901,356 @@ int rgw_dir_suggest_changes(cls_method_context_t hctx, bufferlist *in, bufferlis return 0; } +static int rgw_obj_remove(cls_method_context_t hctx, bufferlist *in, bufferlist *out) +{ + // decode request + rgw_cls_obj_remove_op op; + bufferlist::iterator iter = in->begin(); + try { + ::decode(op, iter); + } catch (buffer::error& err) { + CLS_LOG(0, "ERROR: %s(): failed to decode request", __func__); + return -EINVAL; + } + + if (op.keep_attr_prefixes.empty()) { + return cls_cxx_remove(hctx); + } + + map attrset; + int ret = cls_cxx_getxattrs(hctx, &attrset); + if (ret < 0 && ret != -ENOENT) { + CLS_LOG(0, "ERROR: %s(): cls_cxx_getxattrs() returned %d", __func__, ret); + return ret; + } + + map new_attrs; + for (list::iterator iter = op.keep_attr_prefixes.begin(); + iter != op.keep_attr_prefixes.end(); ++iter) { + string& check_prefix = *iter; + + for (map::iterator aiter = attrset.lower_bound(check_prefix); + aiter != attrset.end(); ++aiter) { + const string& attr = aiter->first; + + if (attr.substr(0, check_prefix.size()) > check_prefix) { + break; + } + + new_attrs[attr] = aiter->second; + } + } + + CLS_LOG(20, "%s(): removing object", __func__); + ret = cls_cxx_remove(hctx); + if (ret < 0) { + CLS_LOG(0, "ERROR: %s(): cls_cxx_remove returned %d", __func__, ret); + return ret; + } + + if (new_attrs.empty()) { + /* no data to keep */ + return 0; + } + + ret = cls_cxx_create(hctx, false); + if (ret < 0) { + CLS_LOG(0, "ERROR: %s(): cls_cxx_create returned %d", __func__, ret); + return ret; + } + + for (map::iterator aiter = new_attrs.begin(); + aiter != new_attrs.end(); ++aiter) { + const string& attr = aiter->first; + + ret = cls_cxx_setxattr(hctx, attr.c_str(), &aiter->second); + CLS_LOG(20, "%s(): setting attr: %s", __func__, attr.c_str()); + if (ret < 0) { + CLS_LOG(0, "ERROR: %s(): cls_cxx_setxattr (attr=%s) returned %d", __func__, attr.c_str(), ret); + return ret; + } + } + + return 0; +} + +static int rgw_obj_check_attrs_prefix(cls_method_context_t hctx, bufferlist *in, bufferlist *out) +{ + // decode request + rgw_cls_obj_check_attrs_prefix op; + bufferlist::iterator iter = in->begin(); + try { + ::decode(op, iter); + } catch (buffer::error& err) { + CLS_LOG(0, "ERROR: %s(): failed to decode request", __func__); + return -EINVAL; + } + + if (op.check_prefix.empty()) { + return -EINVAL; + } + + map attrset; + int ret = cls_cxx_getxattrs(hctx, &attrset); + if (ret < 0 && ret != -ENOENT) { + CLS_LOG(0, "ERROR: %s(): cls_cxx_getxattrs() returned %d", __func__, ret); + return ret; + } + + bool exist = false; + + for (map::iterator aiter = attrset.lower_bound(op.check_prefix); + aiter != attrset.end(); ++aiter) { + const string& attr = aiter->first; + + if (attr.substr(0, op.check_prefix.size()) > op.check_prefix) { + break; + } + + exist = true; + } + + if (exist == op.fail_if_exist) { + return -ECANCELED; + } + + return 0; +} + +static int rgw_bi_get_op(cls_method_context_t hctx, bufferlist *in, bufferlist *out) +{ + // decode request + rgw_cls_bi_get_op op; + bufferlist::iterator iter = 
in->begin(); + try { + ::decode(op, iter); + } catch (buffer::error& err) { + CLS_LOG(0, "ERROR: %s(): failed to decode request", __func__); + return -EINVAL; + } + + string idx; + + switch (op.type) { + case PlainIdx: + idx = op.key.name; + break; + case InstanceIdx: + encode_obj_index_key(op.key, &idx); + break; + case OLHIdx: + encode_olh_data_key(op.key, &idx); + break; + default: + CLS_LOG(10, "%s(): invalid key type encoding: %d", __func__, op.type); + return -EINVAL; + } + + rgw_cls_bi_get_ret op_ret; + + rgw_cls_bi_entry& entry = op_ret.entry; + + entry.type = op.type; + entry.idx = idx; + + int r = cls_cxx_map_get_val(hctx, idx, &entry.data); + if (r < 0) { + CLS_LOG(10, "%s(): cls_cxx_map_get_val() returned %d", __func__, r); + return r; + } + + ::encode(op_ret, *out); + + return 0; +} + +static int rgw_bi_put_op(cls_method_context_t hctx, bufferlist *in, bufferlist *out) +{ + // decode request + rgw_cls_bi_put_op op; + bufferlist::iterator iter = in->begin(); + try { + ::decode(op, iter); + } catch (buffer::error& err) { + CLS_LOG(0, "ERROR: %s(): failed to decode request", __func__); + return -EINVAL; + } + + rgw_cls_bi_entry& entry = op.entry; + + int r = cls_cxx_map_set_val(hctx, entry.idx, &entry.data); + if (r < 0) { + CLS_LOG(0, "ERROR: %s(): cls_cxx_map_set_val() returned r=%d", __func__, r); + } + + return 0; +} + +static int list_plain_entries(cls_method_context_t hctx, const string& name, const string& marker, uint32_t max, + list *entries) +{ + string filter = name; + string start_key = marker; + int count = 0; + map keys; + do { + if (count >= (int)max) { + return count; + } + keys.clear(); +#define BI_GET_NUM_KEYS 128 + int ret = cls_cxx_map_get_vals(hctx, start_key, filter, BI_GET_NUM_KEYS, &keys); + if (ret < 0) { + return ret; + } + + map::iterator iter; + for (iter = keys.begin(); iter != keys.end(); ++iter) { + rgw_cls_bi_entry entry; + entry.type = PlainIdx; + entry.idx = iter->first; + entry.data = iter->second; + + bufferlist::iterator biter = entry.data.begin(); + + rgw_bucket_dir_entry e; + try { + ::decode(e, biter); + } catch (buffer::error& err) { + CLS_LOG(0, "ERROR: %s(): failed to decode buffer", __func__); + return -EIO; + } + + CLS_LOG(20, "%s(): entry.idx=%s e.key.name=%s", __func__, escape_str(entry.idx).c_str(), escape_str(e.key.name).c_str()); + + if (e.key.name != name) { + return count; + } + + entries->push_back(entry); + count++; + start_key = entry.idx; + } + } while (!keys.empty()); + + return count; +} + +static int list_instance_entries(cls_method_context_t hctx, const string& name, const string& marker, uint32_t max, + list *entries) +{ + cls_rgw_obj_key key(name); + string first_instance_idx; + encode_obj_versioned_data_key(key, &first_instance_idx); + string start_key = first_instance_idx; + if (bi_entry_gt(marker, start_key)) { + start_key = marker; + } + int count = 0; + map keys; + string filter = first_instance_idx; + bool started = true; + do { + if (count >= (int)max) { + return count; + } + keys.clear(); +#define BI_GET_NUM_KEYS 128 + int ret; + if (started) { + ret = cls_cxx_map_get_val(hctx, start_key, &keys[start_key]); + if (ret == -ENOENT) { + ret = cls_cxx_map_get_vals(hctx, start_key, filter, BI_GET_NUM_KEYS, &keys); + } + started = false; + } else { + ret = cls_cxx_map_get_vals(hctx, start_key, filter, BI_GET_NUM_KEYS, &keys); + } + CLS_LOG(20, "%s(): start_key=%s keys.size()=%d", __func__, escape_str(start_key).c_str(), (int)keys.size()); + if (ret < 0) { + return ret; + } + + map::iterator iter; + for (iter = 
keys.begin(); iter != keys.end(); ++iter) { + rgw_cls_bi_entry entry; + entry.type = InstanceIdx; + entry.idx = iter->first; + entry.data = iter->second; + + CLS_LOG(20, "%s(): entry.idx=%s", __func__, escape_str(entry.idx).c_str()); + + bufferlist::iterator biter = entry.data.begin(); + + rgw_bucket_dir_entry e; + try { + ::decode(e, biter); + } catch (buffer::error& err) { + CLS_LOG(0, "ERROR: %s(): failed to decode buffer (size=%d)", __func__, entry.data.length()); + return -EIO; + } + + if (e.key.name != name) { + return count; + } + + entries->push_back(entry); + count++; + start_key = entry.idx; + } + } while (!keys.empty()); + + return count; +} + +static int rgw_bi_list_op(cls_method_context_t hctx, bufferlist *in, bufferlist *out) +{ + // decode request + rgw_cls_bi_list_op op; + bufferlist::iterator iter = in->begin(); + try { + ::decode(op, iter); + } catch (buffer::error& err) { + CLS_LOG(0, "ERROR: %s(): failed to decode request", __func__); + return -EINVAL; + } + + rgw_cls_bi_list_ret op_ret; + + string filter = op.name; +#define MAX_BI_LIST_ENTRIES 1000 + int32_t max = (op.max < MAX_BI_LIST_ENTRIES ? op.max : MAX_BI_LIST_ENTRIES); + string start_key = op.marker; + int ret = list_plain_entries(hctx, op.name, op.marker, max, &op_ret.entries); + if (ret < 0) { + CLS_LOG(0, "ERROR: %s(): list_plain_entries returned ret=%d", __func__, ret); + return ret; + } + int count = ret; + + ret = list_instance_entries(hctx, op.name, op.marker, max - count, &op_ret.entries); + if (ret < 0) { + CLS_LOG(0, "ERROR: %s(): list_instance_entries returned ret=%d", __func__, ret); + return ret; + } + + cls_rgw_obj_key key(op.name); + rgw_cls_bi_entry entry; + encode_olh_data_key(key, &entry.idx); + ret = cls_cxx_map_get_val(hctx, entry.idx, &entry.data); + if (ret < 0 && ret != -ENOENT) { + CLS_LOG(0, "ERROR: %s(): cls_cxx_map_get_val returned ret=%d", __func__, ret); + return ret; + } else if (ret >= 0) { + entry.type = OLHIdx; + op_ret.entries.push_back(entry); + } + + ::encode(op_ret, *out); + + return 0; +} +
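For illustration only (an editor's sketch, not part of the patch): the new "bi_list" class method above is driven from the client side through cls_rgw_bi_list(), the wrapper added to cls_rgw_client.cc later in this diff. A minimal paging loop might look as follows; the index object name and object name in the usage comment are hypothetical:

  // Sketch: page through all index entries (plain, instance, OLH) recorded
  // for one object name, resuming from the idx key of the last entry seen.
  // e.g. dump_bi_entries(io_ctx, ".dir.default.1234.1", "myobj");
  int dump_bi_entries(librados::IoCtx& io_ctx, const string& oid, const string& name)
  {
    string marker;
    bool is_truncated = false;
    do {
      list<rgw_cls_bi_entry> entries;
      int r = cls_rgw_bi_list(io_ctx, oid, name, marker, 128, &entries, &is_truncated);
      if (r < 0)
        return r;
      for (list<rgw_cls_bi_entry>::iterator it = entries.begin(); it != entries.end(); ++it)
        marker = it->idx; // resume marker for the next round
    } while (is_truncated);
    return 0;
  }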
int bi_log_record_decode(bufferlist& bl, rgw_bi_log_entry& e) { bufferlist::iterator iter = bl.begin(); @@ -814,8 +2272,6 @@ static int bi_log_iterate_entries(cls_method_context_t hctx, const string& marke map keys; string filter_prefix, end_key; - bufferlist start_bl; - bool start_key_added = false; uint32_t i = 0; string key; @@ -829,17 +2285,13 @@ static int bi_log_iterate_entries(cls_method_context_t hctx, const string& marke key.append(marker); start_key = key; - int ret = cls_cxx_map_get_val(hctx, start_key, &start_bl); - if ((ret < 0) && (ret != -ENOENT)) { - return ret; - } } else { start_key = key_iter; } if (end_marker.empty()) { end_key = BI_PREFIX_CHAR; - end_key.append(bucket_index_prefixes[BI_BUCKET_LAST_INDEX]); + end_key.append(bucket_index_prefixes[BI_BUCKET_LOG_INDEX + 1]); } else { end_key = BI_PREFIX_CHAR; end_key.append(bucket_index_prefixes[BI_BUCKET_LOG_INDEX]); @@ -856,10 +2308,6 @@ static int bi_log_iterate_entries(cls_method_context_t hctx, const string& marke if (ret < 0) return ret; - if ((start_bl.length() > 0) && (!start_key_added)) { - keys[start_key] = start_bl; - start_key_added = true; - } map::iterator iter = keys.begin(); if (iter == keys.end()) break; @@ -1608,6 +3056,19 @@ void __cls_init() cls_register_cxx_method(h_class, "bucket_rebuild_index", CLS_METHOD_RD | CLS_METHOD_WR, rgw_bucket_rebuild_index, &h_rgw_bucket_rebuild_index); cls_register_cxx_method(h_class, "bucket_prepare_op", CLS_METHOD_RD | CLS_METHOD_WR, rgw_bucket_prepare_op, &h_rgw_bucket_prepare_op); cls_register_cxx_method(h_class, "bucket_complete_op", CLS_METHOD_RD | CLS_METHOD_WR, rgw_bucket_complete_op, &h_rgw_bucket_complete_op); + cls_register_cxx_method(h_class, "bucket_link_olh", CLS_METHOD_RD | CLS_METHOD_WR, rgw_bucket_link_olh, &h_rgw_bucket_link_olh); + cls_register_cxx_method(h_class, "bucket_unlink_instance", CLS_METHOD_RD | CLS_METHOD_WR, rgw_bucket_unlink_instance, &h_rgw_bucket_unlink_instance_op); + cls_register_cxx_method(h_class, "bucket_read_olh_log", CLS_METHOD_RD, rgw_bucket_read_olh_log, &h_rgw_bucket_read_olh_log); + cls_register_cxx_method(h_class, "bucket_trim_olh_log", CLS_METHOD_RD | CLS_METHOD_WR, rgw_bucket_trim_olh_log, &h_rgw_bucket_trim_olh_log); + cls_register_cxx_method(h_class, "bucket_clear_olh", CLS_METHOD_RD | CLS_METHOD_WR, rgw_bucket_clear_olh, &h_rgw_bucket_clear_olh); + + cls_register_cxx_method(h_class, "obj_remove", CLS_METHOD_RD | CLS_METHOD_WR, rgw_obj_remove, &h_rgw_obj_remove); + cls_register_cxx_method(h_class, "obj_check_attrs_prefix", CLS_METHOD_RD, rgw_obj_check_attrs_prefix, &h_rgw_obj_check_attrs_prefix); + + cls_register_cxx_method(h_class, "bi_get", CLS_METHOD_RD, rgw_bi_get_op, &h_rgw_bi_get_op); + cls_register_cxx_method(h_class, "bi_put", CLS_METHOD_RD | CLS_METHOD_WR, rgw_bi_put_op, &h_rgw_bi_put_op); + cls_register_cxx_method(h_class, "bi_list", CLS_METHOD_RD, rgw_bi_list_op, &h_rgw_bi_list_op); + cls_register_cxx_method(h_class, "bi_log_list", CLS_METHOD_RD, rgw_bi_log_list, &h_rgw_bi_log_list_op); cls_register_cxx_method(h_class, "bi_log_trim", CLS_METHOD_RD | CLS_METHOD_WR, rgw_bi_log_trim, &h_rgw_bi_log_list_op); cls_register_cxx_method(h_class, "dir_suggest_changes", CLS_METHOD_RD | CLS_METHOD_WR, rgw_dir_suggest_changes, &h_rgw_dir_suggest_changes); diff --git a/src/cls/rgw/cls_rgw_client.cc b/src/cls/rgw/cls_rgw_client.cc index c13c1a1559c62..e6ac56b822c36 100644 --- a/src/cls/rgw/cls_rgw_client.cc +++ b/src/cls/rgw/cls_rgw_client.cc @@ -11,142 +11,342 @@ using namespace librados; +const string BucketIndexShardsManager::KEY_VALUE_SEPARATOR = "#"; +const string BucketIndexShardsManager::SHARDS_SEPARATOR = ","; + +/** + * This class represents the bucket index object operation callback context.
+ */ +template +class ClsBucketIndexOpCtx : public ObjectOperationCompletion { +private: + T *data; + int *ret_code; +public: + ClsBucketIndexOpCtx(T* _data, int *_ret_code) : data(_data), ret_code(_ret_code) { assert(data); } + ~ClsBucketIndexOpCtx() {} + void handle_completion(int r, bufferlist& outbl) { + if (r >= 0) { + try { + bufferlist::iterator iter = outbl.begin(); + ::decode((*data), iter); + } catch (buffer::error& err) { + r = -EIO; + } + } + if (ret_code) { + *ret_code = r; + } + } +}; + +void BucketIndexAioManager::do_completion(int id) { + Mutex::Locker l(lock); + + map::iterator iter = pendings.find(id); + assert(iter != pendings.end()); + completions[id] = iter->second; + pendings.erase(iter); + + // If the caller needs a list of finished objects, store them + // for further processing + map::iterator miter = pending_objs.find(id); + if (miter != pending_objs.end()) { + completion_objs[id] = miter->second; + pending_objs.erase(miter); + } + + cond.Signal(); +} + +bool BucketIndexAioManager::wait_for_completions(int valid_ret_code, + int *num_completions, int *ret_code, map *objs) { + lock.Lock(); + if (pendings.empty() && completions.empty()) { + lock.Unlock(); + return false; + } + + if (completions.empty()) { + // Wait for AIO completion + cond.Wait(lock); + } + + // Clear the completed AIOs + map::iterator iter = completions.begin(); + for (; iter != completions.end(); ++iter) { + int r = iter->second->get_return_value(); + if (objs && r == 0) { /* update list of successfully completed objs */ + map::iterator liter = completion_objs.find(iter->first); + if (liter != completion_objs.end()) { + (*objs)[liter->first] = liter->second; + } + } + if (ret_code && (r < 0 && r != valid_ret_code)) + (*ret_code) = r; + iter->second->release(); + } + if (num_completions) + (*num_completions) = completions.size(); + completions.clear(); + lock.Unlock(); + + return true; +} + void cls_rgw_bucket_init(ObjectWriteOperation& o) { bufferlist in; o.exec("rgw", "bucket_init_index", in); } -void cls_rgw_bucket_set_tag_timeout(ObjectWriteOperation& o, uint64_t tag_timeout) -{ +static bool issue_bucket_index_init_op(librados::IoCtx& io_ctx, + const string& oid, BucketIndexAioManager *manager) { + bufferlist in; + librados::ObjectWriteOperation op; + op.create(true); + op.exec("rgw", "bucket_init_index", in); + return manager->aio_operate(io_ctx, oid, &op); +} + +static bool issue_bucket_set_tag_timeout_op(librados::IoCtx& io_ctx, + const string& oid, uint64_t timeout, BucketIndexAioManager *manager) { bufferlist in; struct rgw_cls_tag_timeout_op call; - call.tag_timeout = tag_timeout; + call.tag_timeout = timeout; ::encode(call, in); - o.exec("rgw", "bucket_set_tag_timeout", in); + ObjectWriteOperation op; + op.exec("rgw", "bucket_set_tag_timeout", in); + return manager->aio_operate(io_ctx, oid, &op); +} + +int CLSRGWIssueBucketIndexInit::issue_op(int shard_id, const string& oid) +{ + return issue_bucket_index_init_op(io_ctx, oid, &manager); +} + +void CLSRGWIssueBucketIndexInit::cleanup() +{ + // Do best effort removal + for (map::iterator citer = objs_container.begin(); citer != iter; ++citer) { + io_ctx.remove(citer->second); + } +} + +int CLSRGWIssueSetTagTimeout::issue_op(int shard_id, const string& oid) +{ + return issue_bucket_set_tag_timeout_op(io_ctx, oid, tag_timeout, &manager); } void cls_rgw_bucket_prepare_op(ObjectWriteOperation& o, RGWModifyOp op, string& tag, - string& name, string& locator, bool log_op) + const cls_rgw_obj_key& key, const string& locator, bool log_op, + 
uint16_t bilog_flags) { struct rgw_cls_obj_prepare_op call; call.op = op; call.tag = tag; - call.name = name; + call.key = key; call.locator = locator; call.log_op = log_op; + call.bilog_flags = bilog_flags; bufferlist in; ::encode(call, in); o.exec("rgw", "bucket_prepare_op", in); } void cls_rgw_bucket_complete_op(ObjectWriteOperation& o, RGWModifyOp op, string& tag, - rgw_bucket_entry_ver& ver, string& name, rgw_bucket_dir_entry_meta& dir_meta, - list *remove_objs, bool log_op) + rgw_bucket_entry_ver& ver, + const cls_rgw_obj_key& key, + rgw_bucket_dir_entry_meta& dir_meta, + list *remove_objs, bool log_op, + uint16_t bilog_flags) { bufferlist in; struct rgw_cls_obj_complete_op call; call.op = op; call.tag = tag; - call.name = name; + call.key = key; call.ver = ver; call.meta = dir_meta; call.log_op = log_op; + call.bilog_flags = bilog_flags; if (remove_objs) call.remove_objs = *remove_objs; ::encode(call, in); o.exec("rgw", "bucket_complete_op", in); } - -int cls_rgw_list_op(IoCtx& io_ctx, string& oid, string& start_obj, - string& filter_prefix, uint32_t num_entries, - rgw_bucket_dir *dir, bool *is_truncated) -{ - bufferlist in, out; +static bool issue_bucket_list_op(librados::IoCtx& io_ctx, + const string& oid, const cls_rgw_obj_key& start_obj, const string& filter_prefix, + uint32_t num_entries, bool list_versions, BucketIndexAioManager *manager, + struct rgw_cls_list_ret *pdata) { + bufferlist in; struct rgw_cls_list_op call; call.start_obj = start_obj; call.filter_prefix = filter_prefix; call.num_entries = num_entries; + call.list_versions = list_versions; ::encode(call, in); - int r = io_ctx.exec(oid, "rgw", "bucket_list", in, out); + + librados::ObjectReadOperation op; + op.exec("rgw", "bucket_list", in, new ClsBucketIndexOpCtx(pdata, NULL)); + return manager->aio_operate(io_ctx, oid, &op); +} + +int CLSRGWIssueBucketList::issue_op(int shard_id, const string& oid) +{ + return issue_bucket_list_op(io_ctx, oid, start_obj, filter_prefix, num_entries, list_versions, &manager, &result[shard_id]); +} + +void cls_rgw_remove_obj(librados::ObjectWriteOperation& o, list& keep_attr_prefixes) +{ + bufferlist in; + struct rgw_cls_obj_remove_op call; + call.keep_attr_prefixes = keep_attr_prefixes; + ::encode(call, in); + o.exec("rgw", "obj_remove", in); +} + +void cls_rgw_obj_check_attrs_prefix(librados::ObjectOperation& o, const string& prefix, bool fail_if_exist) +{ + bufferlist in; + struct rgw_cls_obj_check_attrs_prefix call; + call.check_prefix = prefix; + call.fail_if_exist = fail_if_exist; + ::encode(call, in); + o.exec("rgw", "obj_check_attrs_prefix", in); +} + +int cls_rgw_bi_get(librados::IoCtx& io_ctx, const string oid, + BIIndexType index_type, cls_rgw_obj_key& key, + rgw_cls_bi_entry *entry) +{ + bufferlist in, out; + struct rgw_cls_bi_get_op call; + call.key = key; + call.type = index_type; + ::encode(call, in); + int r = io_ctx.exec(oid, "rgw", "bi_get", in, out); if (r < 0) return r; - struct rgw_cls_list_ret ret; + struct rgw_cls_bi_get_ret op_ret; + bufferlist::iterator iter = out.begin(); try { - bufferlist::iterator iter = out.begin(); - ::decode(ret, iter); + ::decode(op_ret, iter); } catch (buffer::error& err) { return -EIO; } - if (dir) - *dir = ret.dir; - if (is_truncated) - *is_truncated = ret.is_truncated; + *entry = op_ret.entry; - return r; + return 0; +} + +int cls_rgw_bi_put(librados::IoCtx& io_ctx, const string oid, rgw_cls_bi_entry& entry) +{ + bufferlist in, out; + struct rgw_cls_bi_put_op call; + call.entry = entry; + ::encode(call, in); + int r = 
io_ctx.exec(oid, "rgw", "bi_put", in, out); + if (r < 0) + return r; + + return 0; } -int cls_rgw_bucket_check_index_op(IoCtx& io_ctx, string& oid, - rgw_bucket_dir_header *existing_header, - rgw_bucket_dir_header *calculated_header) +int cls_rgw_bi_list(librados::IoCtx& io_ctx, const string oid, + const string& name, const string& marker, uint32_t max, + list *entries, bool *is_truncated) { bufferlist in, out; - int r = io_ctx.exec(oid, "rgw", "bucket_check_index", in, out); + struct rgw_cls_bi_list_op call; + call.name = name; + call.marker = marker; + call.max = max; + ::encode(call, in); + int r = io_ctx.exec(oid, "rgw", "bi_list", in, out); if (r < 0) return r; - struct rgw_cls_check_index_ret ret; + struct rgw_cls_bi_list_ret op_ret; + bufferlist::iterator iter = out.begin(); try { - bufferlist::iterator iter = out.begin(); - ::decode(ret, iter); + ::decode(op_ret, iter); } catch (buffer::error& err) { return -EIO; } - if (existing_header) - *existing_header = ret.existing_header; - if (calculated_header) - *calculated_header = ret.calculated_header; + entries->swap(op_ret.entries); + *is_truncated = op_ret.is_truncated; return 0; } -int cls_rgw_bucket_rebuild_index_op(IoCtx& io_ctx, string& oid) +int cls_rgw_bucket_link_olh(librados::IoCtx& io_ctx, const string& oid, const cls_rgw_obj_key& key, bufferlist& olh_tag, + bool delete_marker, const string& op_tag, struct rgw_bucket_dir_entry_meta *meta, + uint64_t olh_epoch, bool log_op) { bufferlist in, out; - int r = io_ctx.exec(oid, "rgw", "bucket_rebuild_index", in, out); + struct rgw_cls_link_olh_op call; + call.key = key; + call.olh_tag = string(olh_tag.c_str(), olh_tag.length()); + call.op_tag = op_tag; + call.delete_marker = delete_marker; + if (meta) { + call.meta = *meta; + } + call.olh_epoch = olh_epoch; + call.log_op = log_op; + ::encode(call, in); + int r = io_ctx.exec(oid, "rgw", "bucket_link_olh", in, out); if (r < 0) return r; return 0; } -void cls_rgw_encode_suggestion(char op, rgw_bucket_dir_entry& dirent, bufferlist& updates) +int cls_rgw_bucket_unlink_instance(librados::IoCtx& io_ctx, const string& oid, + const cls_rgw_obj_key& key, const string& op_tag, + uint64_t olh_epoch, bool log_op) { - updates.append(op); - ::encode(dirent, updates); -} + bufferlist in, out; + struct rgw_cls_unlink_instance_op call; + call.key = key; + call.op_tag = op_tag; + call.olh_epoch = olh_epoch; + call.log_op = log_op; + ::encode(call, in); + int r = io_ctx.exec(oid, "rgw", "bucket_unlink_instance", in, out); + if (r < 0) + return r; -void cls_rgw_suggest_changes(ObjectWriteOperation& o, bufferlist& updates) -{ - o.exec("rgw", "dir_suggest_changes", updates); + return 0; } -int cls_rgw_get_dir_header(IoCtx& io_ctx, string& oid, rgw_bucket_dir_header *header) +int cls_rgw_get_olh_log(IoCtx& io_ctx, string& oid, librados::ObjectReadOperation& op, const cls_rgw_obj_key& olh, uint64_t ver_marker, + const string& olh_tag, + map > *log, bool *is_truncated) { bufferlist in, out; - struct rgw_cls_list_op call; - call.num_entries = 0; + struct rgw_cls_read_olh_log_op call; + call.olh = olh; + call.ver_marker = ver_marker; + call.olh_tag = olh_tag; ::encode(call, in); - int r = io_ctx.exec(oid, "rgw", "bucket_list", in, out); - if (r < 0) + int op_ret; + op.exec("rgw", "bucket_read_olh_log", in, &out, &op_ret); + int r = io_ctx.operate(oid, &op, NULL); + if (r < 0) { return r; + } + if (op_ret < 0) { + return op_ret; + } - struct rgw_cls_list_ret ret; + struct rgw_cls_read_olh_log_ret ret; try { bufferlist::iterator iter = out.begin(); 
::decode(ret, iter); @@ -154,12 +354,125 @@ int cls_rgw_get_dir_header(IoCtx& io_ctx, string& oid, rgw_bucket_dir_header *he return -EIO; } - if (header) - *header = ret.dir.header; + if (log) { + *log = ret.log; + } + if (is_truncated) { + *is_truncated = ret.is_truncated; + } return r; } +void cls_rgw_trim_olh_log(librados::ObjectWriteOperation& op, const cls_rgw_obj_key& olh, uint64_t ver, const string& olh_tag) +{ + bufferlist in; + struct rgw_cls_trim_olh_log_op call; + call.olh = olh; + call.ver = ver; + call.olh_tag = olh_tag; + ::encode(call, in); + op.exec("rgw", "bucket_trim_olh_log", in); +} + +int cls_rgw_clear_olh(IoCtx& io_ctx, string& oid, const cls_rgw_obj_key& olh, const string& olh_tag) +{ + bufferlist in, out; + struct rgw_cls_bucket_clear_olh_op call; + call.key = olh; + call.olh_tag = olh_tag; + ::encode(call, in); + librados::ObjectWriteOperation op; + int op_ret; + op.exec("rgw", "bucket_clear_olh", in, &out, &op_ret); + int r = io_ctx.operate(oid, &op); + if (r < 0) { + return r; + } + return op_ret; +} + +static bool issue_bi_log_list_op(librados::IoCtx& io_ctx, const string& oid, int shard_id, + BucketIndexShardsManager& marker_mgr, uint32_t max, BucketIndexAioManager *manager, + struct cls_rgw_bi_log_list_ret *pdata) { + bufferlist in; + cls_rgw_bi_log_list_op call; + call.marker = marker_mgr.get(shard_id, ""); + call.max = max; + ::encode(call, in); + + librados::ObjectReadOperation op; + op.exec("rgw", "bi_log_list", in, new ClsBucketIndexOpCtx(pdata, NULL)); + return manager->aio_operate(io_ctx, oid, &op); +} + +int CLSRGWIssueBILogList::issue_op(int shard_id, const string& oid) +{ + return issue_bi_log_list_op(io_ctx, oid, shard_id, marker_mgr, max, &manager, &result[shard_id]); +} + +static bool issue_bi_log_trim(librados::IoCtx& io_ctx, const string& oid, int shard_id, + BucketIndexShardsManager& start_marker_mgr, + BucketIndexShardsManager& end_marker_mgr, BucketIndexAioManager *manager) { + bufferlist in; + cls_rgw_bi_log_trim_op call; + call.start_marker = start_marker_mgr.get(shard_id, ""); + call.end_marker = end_marker_mgr.get(shard_id, ""); + ::encode(call, in); + ObjectWriteOperation op; + op.exec("rgw", "bi_log_trim", in); + return manager->aio_operate(io_ctx, oid, &op); +} + +int CLSRGWIssueBILogTrim::issue_op(int shard_id, const string& oid) +{ + return issue_bi_log_trim(io_ctx, oid, shard_id, start_marker_mgr, end_marker_mgr, &manager); +} + +static bool issue_bucket_check_index_op(IoCtx& io_ctx, const string& oid, BucketIndexAioManager *manager, + struct rgw_cls_check_index_ret *pdata) { + bufferlist in; + librados::ObjectReadOperation op; + op.exec("rgw", "bucket_check_index", in, new ClsBucketIndexOpCtx( + pdata, NULL)); + return manager->aio_operate(io_ctx, oid, &op); +} + +int CLSRGWIssueBucketCheck::issue_op(int shard_id, const string& oid) +{ + return issue_bucket_check_index_op(io_ctx, oid, &manager, &result[shard_id]); +} + +static bool issue_bucket_rebuild_index_op(IoCtx& io_ctx, const string& oid, + BucketIndexAioManager *manager) { + bufferlist in; + librados::ObjectWriteOperation op; + op.exec("rgw", "bucket_rebuild_index", in); + return manager->aio_operate(io_ctx, oid, &op); +} + +int CLSRGWIssueBucketRebuild::issue_op(int shard_id, const string& oid) +{ + return issue_bucket_rebuild_index_op(io_ctx, oid, &manager); +} + +void cls_rgw_encode_suggestion(char op, rgw_bucket_dir_entry& dirent, bufferlist& updates) +{ + updates.append(op); + ::encode(dirent, updates); +} + +void cls_rgw_suggest_changes(ObjectWriteOperation& o, 
bufferlist& updates) +{ + o.exec("rgw", "dir_suggest_changes", updates); +} + +int CLSRGWIssueGetDirHeader::issue_op(int shard_id, const string& oid) +{ + cls_rgw_obj_key nokey; + return issue_bucket_list_op(io_ctx, oid, nokey, "", 0, false, &manager, &result[shard_id]); +} + class GetDirHeaderCompletion : public ObjectOperationCompletion { RGWGetDirHeader_CB *ret_ctx; public: @@ -198,56 +511,6 @@ int cls_rgw_get_dir_header_async(IoCtx& io_ctx, string& oid, RGWGetDirHeader_CB return 0; } -int cls_rgw_bi_log_list(IoCtx& io_ctx, string& oid, string& marker, uint32_t max, - list& entries, bool *truncated) -{ - bufferlist in, out; - cls_rgw_bi_log_list_op call; - call.marker = marker; - call.max = max; - ::encode(call, in); - int r = io_ctx.exec(oid, "rgw", "bi_log_list", in, out); - if (r < 0) - return r; - - cls_rgw_bi_log_list_ret ret; - try { - bufferlist::iterator iter = out.begin(); - ::decode(ret, iter); - } catch (buffer::error& err) { - return -EIO; - } - - entries = ret.entries; - - if (truncated) - *truncated = ret.truncated; - - return r; -} - -int cls_rgw_bi_log_trim(IoCtx& io_ctx, string& oid, string& start_marker, string& end_marker) -{ - do { - int r; - bufferlist in, out; - cls_rgw_bi_log_trim_op call; - call.start_marker = start_marker; - call.end_marker = end_marker; - ::encode(call, in); - r = io_ctx.exec(oid, "rgw", "bi_log_trim", in, out); - - if (r == -ENODATA) - break; - - if (r < 0) - return r; - - } while (1); - - return 0; -} - int cls_rgw_usage_log_read(IoCtx& io_ctx, string& oid, string& user, uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries, string& read_iter, map& usage, @@ -348,7 +611,7 @@ int cls_rgw_gc_list(IoCtx& io_ctx, string& oid, string& marker, uint32_t max, bo return -EIO; } - entries = ret.entries; + entries.swap(ret.entries); if (truncated) *truncated = ret.truncated; diff --git a/src/cls/rgw/cls_rgw_client.h b/src/cls/rgw/cls_rgw_client.h index c6b5b757fa843..ecec679192ed3 100644 --- a/src/cls/rgw/cls_rgw_client.h +++ b/src/cls/rgw/cls_rgw_client.h @@ -2,50 +2,454 @@ #define CEPH_CLS_RGW_CLIENT_H #include "include/types.h" +#include "include/str_list.h" #include "include/rados/librados.hpp" #include "cls_rgw_types.h" +#include "cls_rgw_ops.h" #include "common/RefCountedObj.h" +// Forward declaration +class BucketIndexAioManager; + +/* + * Bucket index AIO request argument; this is used to pass an argument + * to the callback. + */ +struct BucketIndexAioArg : public RefCountedObject { + BucketIndexAioArg(int _id, BucketIndexAioManager* _manager) : + id(_id), manager(_manager) {} + int id; + BucketIndexAioManager* manager; +}; + +/* + * This class manages AIO completions. This class is not completely thread-safe; + * methods like *get_next* are not thread-safe and are expected to be called from + * within one thread. + */ +class BucketIndexAioManager { +private: + map pendings; + map completions; + map pending_objs; + map completion_objs; + int next; + Mutex lock; + Cond cond; + /* + * Callback implementation for AIO request. + */ + static void bucket_index_op_completion_cb(void* cb, void* arg) { + BucketIndexAioArg* cb_arg = (BucketIndexAioArg*) arg; + cb_arg->manager->do_completion(cb_arg->id); + cb_arg->put(); + } + + /* + * Get next request ID. This method is not thread-safe. + * + * Return next request ID. + */ + int get_next() { return next++; } + + /* + * Add a new pending AIO completion instance. + * + * @param id - the request ID. + * @param completion - the AIO completion instance. + * @param oid - the object id associated with the object; if it is NULL, we don't + * track the object id per callback. + */ + void add_pending(int id, librados::AioCompletion* completion, const string& oid) { + pendings[id] = completion; + pending_objs[id] = oid; + } +public: + /* + * Create a new instance. + */ + BucketIndexAioManager() : next(0), lock("BucketIndexAioManager::lock") {} + + + /* + * Do completion for the given AIO request. + */ + void do_completion(int id); + + /* + * Wait for AIO completions. + * + * valid_ret_code - valid AIO return code. + * num_completions - number of completions. + * ret_code - return code of failed AIO. + * objs - a list of objects that have finished their AIO. + * + * Return false if there is no pending AIO, true otherwise. + */ + bool wait_for_completions(int valid_ret_code, int *num_completions, int *ret_code, + map *objs); + + /** + * Do aio read operation. + */ + bool aio_operate(librados::IoCtx& io_ctx, const string& oid, librados::ObjectReadOperation *op) { + Mutex::Locker l(lock); + BucketIndexAioArg *arg = new BucketIndexAioArg(get_next(), this); + librados::AioCompletion *c = librados::Rados::aio_create_completion((void*)arg, NULL, bucket_index_op_completion_cb); + int r = io_ctx.aio_operate(oid, c, (librados::ObjectReadOperation*)op, NULL); + if (r >= 0) { + add_pending(arg->id, c, oid); + } + return r; + } + + /** + * Do aio write operation. + */ + bool aio_operate(librados::IoCtx& io_ctx, const string& oid, librados::ObjectWriteOperation *op) { + Mutex::Locker l(lock); + BucketIndexAioArg *arg = new BucketIndexAioArg(get_next(), this); + librados::AioCompletion *c = librados::Rados::aio_create_completion((void*)arg, NULL, bucket_index_op_completion_cb); + int r = io_ctx.aio_operate(oid, c, (librados::ObjectWriteOperation*)op); + if (r >= 0) { + add_pending(arg->id, c, oid); + } + return r; + } +}; +
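For illustration only (an editor's sketch, not part of the patch): the manager is meant to be used in an issue-then-drain pattern, sending one operation per bucket index shard object and then waiting on the whole batch. The shard object names below are made up, and real callers go through the CLSRGWConcurrentIO helpers declared further down:

  // Sketch: initialize two hypothetical shard objects and drain completions.
  void init_two_shards(librados::IoCtx& io_ctx)
  {
    BucketIndexAioManager mgr;
    bufferlist in;
    librados::ObjectWriteOperation op0, op1;
    op0.exec("rgw", "bucket_init_index", in);
    op1.exec("rgw", "bucket_init_index", in);
    mgr.aio_operate(io_ctx, ".dir.X.0", &op0);
    mgr.aio_operate(io_ctx, ".dir.X.1", &op1);
    int num_completions = 0, ret_code = 0;
    // -EEXIST is passed as the tolerated return code, as index init does
    while (mgr.wait_for_completions(-EEXIST, &num_completions, &ret_code, NULL))
      ; // loop until no pending AIOs remain
  }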
class RGWGetDirHeader_CB : public RefCountedObject { public: virtual ~RGWGetDirHeader_CB() {} virtual void handle_response(int r, rgw_bucket_dir_header& header) = 0; }; +class BucketIndexShardsManager { +private: + // Per-shard setting manager, for example, markers. + map value_by_shards; +public: + const static string KEY_VALUE_SEPARATOR; + const static string SHARDS_SEPARATOR; + + void add(int shard, const string& value) { + value_by_shards[shard] = value; + } + + const string& get(int shard, const string& default_value) { + map::iterator iter = value_by_shards.find(shard); + return (iter == value_by_shards.end() ? default_value : iter->second); + } + + map& get() { + return value_by_shards; + } + + bool empty() { + return value_by_shards.empty(); + } + + void to_string(string *out) const { + if (!out) { + return; + } + out->clear(); + map::const_iterator iter = value_by_shards.begin(); + for (; iter != value_by_shards.end(); ++iter) { + if (out->length()) { + // Not the first item, append a separator first + out->append(SHARDS_SEPARATOR); + } + char buf[16]; + snprintf(buf, sizeof(buf), "%d", iter->first); + out->append(buf); + out->append(KEY_VALUE_SEPARATOR); + out->append(iter->second); + } + } + + static bool is_shards_marker(const string& marker) { + return marker.find(KEY_VALUE_SEPARATOR) != string::npos; + } + + /* + * convert from string. The string can take one of two forms: + * + * 1. Single shard, no shard id specified, e.g. 000001.23.1 + * + * in this case, if the passed shard_id >= 0, use that shard id; otherwise assume that it's a + * bucket with no shards. + * + * 2. One or more shards, shard id specified for each shard, e.g., 0#00002.12,1#00003.23.2 + * + */ + int from_string(const string& composed_marker, int shard_id) { + value_by_shards.clear(); + vector shards; + get_str_vec(composed_marker, SHARDS_SEPARATOR.c_str(), shards); + if (shards.size() > 1 && shard_id >= 0) { + return -EINVAL; + } + vector::const_iterator iter = shards.begin(); + for (; iter != shards.end(); ++iter) { + size_t pos = iter->find(KEY_VALUE_SEPARATOR); + if (pos == string::npos) { + if (!value_by_shards.empty()) { + return -EINVAL; + } + if (shard_id < 0) { + add(0, *iter); + } else { + add(shard_id, *iter); + } + return 0; + } + string shard_str = iter->substr(0, pos); + string err; + int shard = (int)strict_strtol(shard_str.c_str(), 10, &err); + if (!err.empty()) { + return -EINVAL; + } + add(shard, iter->substr(pos + 1)); + } + return 0; + } +}; +
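A short round-trip of the composed marker format described above (an editor's example with made-up marker values, not part of the patch):

  void marker_round_trip()
  {
    BucketIndexShardsManager mgr;
    mgr.add(0, "00002.12");
    mgr.add(1, "00003.23.2");
    string composed;
    mgr.to_string(&composed); // yields "0#00002.12,1#00003.23.2"
    BucketIndexShardsManager parsed;
    int r = parsed.from_string(composed, -1); // -1: shard ids come from the string
    assert(r == 0);
    // parsed.get(1, "") == "00003.23.2"; a plain marker such as "000001.23.1"
    // would instead be stored under shard 0 (or under the shard_id passed in).
  }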
/* bucket index */ void cls_rgw_bucket_init(librados::ObjectWriteOperation& o); -void cls_rgw_bucket_set_tag_timeout(librados::ObjectWriteOperation& o, uint64_t tag_timeout); +class CLSRGWConcurrentIO { +protected: + librados::IoCtx& io_ctx; + map& objs_container; + map::iterator iter; + uint32_t max_aio; + BucketIndexAioManager manager; + + virtual int issue_op(int shard_id, const string& oid) = 0; + + virtual void cleanup() {} + virtual int valid_ret_code() { return 0; } + // Return true if multiple rounds of OPs might be needed; this happens when + // an OP needs to be re-sent until a certain code is returned. + virtual bool need_multiple_rounds() { return false; } + // Add a new object to the end of the container. + virtual void add_object(int shard, const string& oid) {} + virtual void reset_container(map& objs) {} + +public: + CLSRGWConcurrentIO(librados::IoCtx& ioc, map& _objs_container, + uint32_t _max_aio) : io_ctx(ioc), objs_container(_objs_container), max_aio(_max_aio) {} + virtual ~CLSRGWConcurrentIO() {} + + int operator()() { + int ret = 0; + iter = objs_container.begin(); + for (; iter != objs_container.end() && max_aio-- > 0; ++iter) { + ret = issue_op(iter->first, iter->second); + if (ret < 0) + break; + } + + int num_completions, r = 0; + map objs; + map *pobjs = (need_multiple_rounds() ?
&objs : NULL); + while (manager.wait_for_completions(valid_ret_code(), &num_completions, &r, pobjs)) { + if (r >= 0 && ret >= 0) { + for(int i = 0; i < num_completions && iter != objs_container.end(); ++i, ++iter) { + int issue_ret = issue_op(iter->first, iter->second); + if(issue_ret < 0) { + ret = issue_ret; + break; + } + } + } else if (ret >= 0) { + ret = r; + } + if (need_multiple_rounds() && iter == objs_container.end() && !objs.empty()) { + // For those objects which need another round, use them to reset + // the container + reset_container(objs); + } + } + + if (ret < 0) { + cleanup(); + } + return ret; + } +}; + +class CLSRGWIssueBucketIndexInit : public CLSRGWConcurrentIO { +protected: + int issue_op(int shard_id, const string& oid); + int valid_ret_code() { return -EEXIST; } + void cleanup(); +public: + CLSRGWIssueBucketIndexInit(librados::IoCtx& ioc, map& _bucket_objs, + uint32_t _max_aio) : + CLSRGWConcurrentIO(ioc, _bucket_objs, _max_aio) {} +}; + +class CLSRGWIssueSetTagTimeout : public CLSRGWConcurrentIO { + uint64_t tag_timeout; +protected: + int issue_op(int shard_id, const string& oid); +public: + CLSRGWIssueSetTagTimeout(librados::IoCtx& ioc, map& _bucket_objs, + uint32_t _max_aio, uint64_t _tag_timeout) : + CLSRGWConcurrentIO(ioc, _bucket_objs, _max_aio), tag_timeout(_tag_timeout) {} +}; void cls_rgw_bucket_prepare_op(librados::ObjectWriteOperation& o, RGWModifyOp op, string& tag, - string& name, string& locator, bool log_op); + const cls_rgw_obj_key& key, const string& locator, bool log_op, + uint16_t bilog_op); void cls_rgw_bucket_complete_op(librados::ObjectWriteOperation& o, RGWModifyOp op, string& tag, - rgw_bucket_entry_ver& ver, string& name, rgw_bucket_dir_entry_meta& dir_meta, - list *remove_objs, bool log_op); - -int cls_rgw_list_op(librados::IoCtx& io_ctx, string& oid, string& start_obj, - string& filter_prefix, uint32_t num_entries, - rgw_bucket_dir *dir, bool *is_truncated); - -int cls_rgw_bucket_check_index_op(librados::IoCtx& io_ctx, string& oid, - rgw_bucket_dir_header *existing_header, - rgw_bucket_dir_header *calculated_header); -int cls_rgw_bucket_rebuild_index_op(librados::IoCtx& io_ctx, string& oid); - -int cls_rgw_get_dir_header(librados::IoCtx& io_ctx, string& oid, rgw_bucket_dir_header *header); + rgw_bucket_entry_ver& ver, + const cls_rgw_obj_key& key, + rgw_bucket_dir_entry_meta& dir_meta, + list *remove_objs, bool log_op, + uint16_t bilog_op); + +void cls_rgw_remove_obj(librados::ObjectWriteOperation& o, list& keep_attr_prefixes); +void cls_rgw_obj_check_attrs_prefix(librados::ObjectOperation& o, const string& prefix, bool fail_if_exist); + +int cls_rgw_bi_get(librados::IoCtx& io_ctx, const string oid, + BIIndexType index_type, cls_rgw_obj_key& key, + rgw_cls_bi_entry *entry); +int cls_rgw_bi_put(librados::IoCtx& io_ctx, const string oid, rgw_cls_bi_entry& entry); +int cls_rgw_bi_list(librados::IoCtx& io_ctx, const string oid, + const string& name, const string& marker, uint32_t max, + list *entries, bool *is_truncated); + + +int cls_rgw_bucket_link_olh(librados::IoCtx& io_ctx, const string& oid, const cls_rgw_obj_key& key, bufferlist& olh_tag, + bool delete_marker, const string& op_tag, struct rgw_bucket_dir_entry_meta *meta, + uint64_t olh_epoch, bool log_op); +int cls_rgw_bucket_unlink_instance(librados::IoCtx& io_ctx, const string& oid, const cls_rgw_obj_key& key, const string& op_tag, + uint64_t olh_epoch, bool log_op); +int cls_rgw_get_olh_log(librados::IoCtx& io_ctx, string& oid, librados::ObjectReadOperation& op, const 
cls_rgw_obj_key& olh, uint64_t ver_marker, + const string& olh_tag, + map > *log, bool *is_truncated); +void cls_rgw_trim_olh_log(librados::ObjectWriteOperation& op, const cls_rgw_obj_key& olh, uint64_t ver, const string& olh_tag); +int cls_rgw_clear_olh(librados::IoCtx& io_ctx, string& oid, const cls_rgw_obj_key& olh, const string& olh_tag); + +/** + * List the bucket with the starting object and filter prefix. + * NOTE: this method does a listing request for each bucket index shard identified by + * the keys of the *list_results* map, which means the map should be populated + * by the caller with each bucket index object id. + * + * io_ctx - IO context for rados. + * start_obj - marker for the listing. + * filter_prefix - filter prefix. + * num_entries - number of entries to request for each object (note the total + * number of entries returned depends on the number of shards). + * list_results - the list results keyed by bucket index object id. + * max_aio - the maximum number of concurrent AIOs (for throttling). + * + * Return 0 on success, a failure code otherwise. +*/ + +class CLSRGWIssueBucketList : public CLSRGWConcurrentIO { + cls_rgw_obj_key start_obj; + string filter_prefix; + uint32_t num_entries; + bool list_versions; + map& result; +protected: + int issue_op(int shard_id, const string& oid); +public: + CLSRGWIssueBucketList(librados::IoCtx& io_ctx, const cls_rgw_obj_key& _start_obj, + const string& _filter_prefix, uint32_t _num_entries, + bool _list_versions, + map& oids, + map& list_results, + uint32_t max_aio) : + CLSRGWConcurrentIO(io_ctx, oids, max_aio), + start_obj(_start_obj), filter_prefix(_filter_prefix), num_entries(_num_entries), list_versions(_list_versions), result(list_results) {} +}; + +class CLSRGWIssueBILogList : public CLSRGWConcurrentIO { + map& result; + BucketIndexShardsManager& marker_mgr; + uint32_t max; +protected: + int issue_op(int shard_id, const string& oid); +public: + CLSRGWIssueBILogList(librados::IoCtx& io_ctx, BucketIndexShardsManager& _marker_mgr, uint32_t _max, + map& oids, + map& bi_log_lists, uint32_t max_aio) : + CLSRGWConcurrentIO(io_ctx, oids, max_aio), result(bi_log_lists), + marker_mgr(_marker_mgr), max(_max) {} +}; + +class CLSRGWIssueBILogTrim : public CLSRGWConcurrentIO { + BucketIndexShardsManager& start_marker_mgr; + BucketIndexShardsManager& end_marker_mgr; +protected: + int issue_op(int shard_id, const string& oid); + // Trim until -ENODATA is returned. + int valid_ret_code() { return -ENODATA; } + bool need_multiple_rounds() { return true; } + void add_object(int shard, const string& oid) { objs_container[shard] = oid; } + void reset_container(map& objs) { + objs_container.swap(objs); + iter = objs_container.begin(); + objs.clear(); + } +public: + CLSRGWIssueBILogTrim(librados::IoCtx& io_ctx, BucketIndexShardsManager& _start_marker_mgr, + BucketIndexShardsManager& _end_marker_mgr, map& _bucket_objs, uint32_t max_aio) : + CLSRGWConcurrentIO(io_ctx, _bucket_objs, max_aio), + start_marker_mgr(_start_marker_mgr), end_marker_mgr(_end_marker_mgr) {} +}; +
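For illustration only (an editor's sketch, not part of the patch): each of these functor classes is invoked by constructing it over a shard-id-to-object-id map and calling operator(). The shard object ids, entry counts, and the spelled-out map template arguments below are assumptions for the example:

  // Sketch: list a hypothetical two-shard bucket index.
  int list_two_shards(librados::IoCtx& io_ctx)
  {
    map<int, string> oids;
    oids[0] = ".dir.default.1234.1.0";
    oids[1] = ".dir.default.1234.1.1";
    map<int, rgw_cls_list_ret> list_results;
    cls_rgw_obj_key start_key; // empty key: start from the beginning
    return CLSRGWIssueBucketList(io_ctx, start_key, "" /* filter_prefix */,
                                 100 /* entries per shard */, false /* list_versions */,
                                 oids, list_results, 8 /* max_aio */)();
  }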
+/** + * Check the bucket index. + * + * io_ctx - IO context for rados. + * bucket_objs_ret - check result for all shards. + * max_aio - the maximum number of concurrent AIOs (for throttling). + * + * Return 0 on success, a failure code otherwise. + */ +class CLSRGWIssueBucketCheck : public CLSRGWConcurrentIO { + map& result; +protected: + int issue_op(int shard_id, const string& oid); +public: + CLSRGWIssueBucketCheck(librados::IoCtx& ioc, map& oids, map& bucket_objs_ret, + uint32_t _max_aio) : + CLSRGWConcurrentIO(ioc, oids, _max_aio), result(bucket_objs_ret) {} +}; + +class CLSRGWIssueBucketRebuild : public CLSRGWConcurrentIO { +protected: + int issue_op(int shard_id, const string& oid); +public: + CLSRGWIssueBucketRebuild(librados::IoCtx& io_ctx, map& bucket_objs, + uint32_t max_aio) : CLSRGWConcurrentIO(io_ctx, bucket_objs, max_aio) {} +}; + +class CLSRGWIssueGetDirHeader : public CLSRGWConcurrentIO { + map& result; +protected: + int issue_op(int shard_id, const string& oid); +public: + CLSRGWIssueGetDirHeader(librados::IoCtx& io_ctx, map& oids, map& dir_headers, + uint32_t max_aio) : + CLSRGWConcurrentIO(io_ctx, oids, max_aio), result(dir_headers) {} +}; + int cls_rgw_get_dir_header_async(librados::IoCtx& io_ctx, string& oid, RGWGetDirHeader_CB *ctx); void cls_rgw_encode_suggestion(char op, rgw_bucket_dir_entry& dirent, bufferlist& updates); void cls_rgw_suggest_changes(librados::ObjectWriteOperation& o, bufferlist& updates); -/* bucket index log */ - -int cls_rgw_bi_log_list(librados::IoCtx& io_ctx, string& oid, string& marker, uint32_t max, - list& entries, bool *truncated); -int cls_rgw_bi_log_trim(librados::IoCtx& io_ctx, string& oid, string& start_marker, string& end_marker); - /* usage logging */ int cls_rgw_usage_log_read(librados::IoCtx& io_ctx, string& oid, string& user, uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries, diff --git a/src/cls/rgw/cls_rgw_ops.cc b/src/cls/rgw/cls_rgw_ops.cc index 01c0666d9ebe9..23beb4ee429e7 100644 --- a/src/cls/rgw/cls_rgw_ops.cc +++ b/src/cls/rgw/cls_rgw_ops.cc @@ -92,7 +92,7 @@ void rgw_cls_obj_prepare_op::generate_test_instances(list<rgw_cls_obj_prepare_op*>& o) op->op = CLS_RGW_OP_ADD; - op->name = "name"; + op->key.name = "name"; op->tag = "tag"; op->locator = "locator"; o.push_back(op); @@ -102,16 +102,18 @@ void rgw_cls_obj_prepare_op::generate_test_instances(list<rgw_cls_obj_prepare_op*>& o) void rgw_cls_obj_prepare_op::dump(Formatter *f) const { f->dump_int("op", op); - f->dump_string("name", name); + f->dump_string("name", key.name); f->dump_string("tag", tag); f->dump_string("locator", locator); + f->dump_bool("log_op", log_op); + f->dump_int("bilog_flags", bilog_flags); } void rgw_cls_obj_complete_op::generate_test_instances(list& o) { rgw_cls_obj_complete_op *op = new rgw_cls_obj_complete_op; op->op = CLS_RGW_OP_DEL; - op->name = "name"; + op->key.name = "name"; op->locator = "locator"; op->ver.pool = 2; op->ver.epoch = 100; @@ -130,7 +132,8 @@ void rgw_cls_obj_complete_op::generate_test_instances(list<rgw_cls_obj_complete_op*>& o) void rgw_cls_obj_complete_op::dump(Formatter *f) const { f->dump_int("op", (int)op); - f->dump_string("name", name); + f->dump_string("name", key.name); + f->dump_string("instance", key.instance); f->dump_string("locator", locator); f->open_object_section("ver"); ver.dump(f); @@ -139,12 +142,142 @@ void rgw_cls_obj_complete_op::dump(Formatter *f) const meta.dump(f); f->close_section(); f->dump_string("tag", tag); + f->dump_bool("log_op", log_op); + f->dump_int("bilog_flags", bilog_flags); +} + +void rgw_cls_link_olh_op::generate_test_instances(list& o) +{ + rgw_cls_link_olh_op *op = new rgw_cls_link_olh_op; + op->key.name = "name"; + op->olh_tag = "olh_tag"; + op->delete_marker = true; + op->op_tag = "op_tag"; + op->olh_epoch = 123; + list l; + rgw_bucket_dir_entry_meta::generate_test_instances(l); + list::iterator iter = l.begin(); + op->meta = *(*iter); + op->log_op = true; + + o.push_back(op); + + o.push_back(new
rgw_cls_link_olh_op); +} + +void rgw_cls_link_olh_op::dump(Formatter *f) const +{ + ::encode_json("key", key, f); + ::encode_json("olh_tag", olh_tag, f); + ::encode_json("delete_marker", delete_marker, f); + ::encode_json("op_tag", op_tag, f); + ::encode_json("meta", meta, f); + ::encode_json("olh_epoch", olh_epoch, f); + ::encode_json("log_op", log_op, f); + ::encode_json("bilog_flags", (uint32_t)bilog_flags, f); +} + +void rgw_cls_unlink_instance_op::generate_test_instances(list& o) +{ + rgw_cls_unlink_instance_op *op = new rgw_cls_unlink_instance_op; + op->key.name = "name"; + op->op_tag = "op_tag"; + op->olh_epoch = 124; + op->log_op = true; + + o.push_back(op); + + o.push_back(new rgw_cls_unlink_instance_op); +} + +void rgw_cls_unlink_instance_op::dump(Formatter *f) const +{ + ::encode_json("key", key, f); + ::encode_json("op_tag", op_tag, f); + ::encode_json("olh_epoch", olh_epoch, f); + ::encode_json("log_op", log_op, f); + ::encode_json("bilog_flags", (uint32_t)bilog_flags, f); +} + +void rgw_cls_read_olh_log_op::generate_test_instances(list& o) +{ + rgw_cls_read_olh_log_op *op = new rgw_cls_read_olh_log_op; + op->olh.name = "name"; + op->ver_marker = 123; + op->olh_tag = "olh_tag"; + + o.push_back(op); + + o.push_back(new rgw_cls_read_olh_log_op); +} + +void rgw_cls_read_olh_log_op::dump(Formatter *f) const +{ + ::encode_json("olh", olh, f); + ::encode_json("ver_marker", ver_marker, f); + ::encode_json("olh_tag", olh_tag, f); +} + +void rgw_cls_read_olh_log_ret::generate_test_instances(list& o) +{ + rgw_cls_read_olh_log_ret *r = new rgw_cls_read_olh_log_ret; + r->is_truncated = true; + list l; + rgw_bucket_olh_log_entry::generate_test_instances(l); + list::iterator iter = l.begin(); + r->log[1].push_back(*(*iter)); + + o.push_back(r); + + o.push_back(new rgw_cls_read_olh_log_ret); +} + +void rgw_cls_read_olh_log_ret::dump(Formatter *f) const +{ + ::encode_json("log", log, f); + ::encode_json("is_truncated", is_truncated, f); +} + +void rgw_cls_trim_olh_log_op::generate_test_instances(list& o) +{ + rgw_cls_trim_olh_log_op *op = new rgw_cls_trim_olh_log_op; + op->olh.name = "olh.name"; + op->ver = 100; + op->olh_tag = "olh_tag"; + + o.push_back(op); + + o.push_back(new rgw_cls_trim_olh_log_op); +} + +void rgw_cls_trim_olh_log_op::dump(Formatter *f) const +{ + ::encode_json("olh", olh, f); + ::encode_json("ver", ver, f); + ::encode_json("olh_tag", olh_tag, f); +} + +void rgw_cls_bucket_clear_olh_op::generate_test_instances(list& o) +{ + + rgw_cls_bucket_clear_olh_op *op = new rgw_cls_bucket_clear_olh_op; + op->key.name = "key.name"; + op->olh_tag = "olh_tag"; + + o.push_back(op); + o.push_back(new rgw_cls_bucket_clear_olh_op); +} + +void rgw_cls_bucket_clear_olh_op::dump(Formatter *f) const +{ + ::encode_json("key", key, f); + ::encode_json("olh_tag", olh_tag, f); } void rgw_cls_list_op::generate_test_instances(list& o) { rgw_cls_list_op *op = new rgw_cls_list_op; - op->start_obj = "start_obj"; + op->start_obj.name = "start_obj"; op->num_entries = 100; op->filter_prefix = "filter_prefix"; o.push_back(op); @@ -153,7 +286,7 @@ void rgw_cls_list_op::generate_test_instances(list& o) void rgw_cls_list_op::dump(Formatter *f) const { - f->dump_string("start_obj", start_obj); + f->dump_string("start_obj", start_obj.name); f->dump_unsigned("num_entries", num_entries); } @@ -185,6 +318,27 @@ void rgw_cls_list_ret::dump(Formatter *f) const f->dump_int("is_truncated", (int)is_truncated); } +void rgw_cls_check_index_ret::generate_test_instances(list& o) +{ + list h; + 
rgw_bucket_dir_header::generate_test_instances(h); + rgw_cls_check_index_ret *r = new rgw_cls_check_index_ret; + r->existing_header = *(h.front()); + r->calculated_header = *(h.front()); + o.push_back(r); + + for (list::iterator iter = h.begin(); iter != h.end(); ++iter) { + delete *iter; + } + o.push_back(new rgw_cls_check_index_ret); +} + +void rgw_cls_check_index_ret::dump(Formatter *f) const +{ + ::encode_json("existing_header", existing_header, f); + ::encode_json("calculated_header", calculated_header, f); +} + void cls_rgw_bi_log_list_op::dump(Formatter *f) const { f->dump_string("marker", marker); diff --git a/src/cls/rgw/cls_rgw_ops.h b/src/cls/rgw/cls_rgw_ops.h index e5db06010dc24..0a0686fbccb16 100644 --- a/src/cls/rgw/cls_rgw_ops.h +++ b/src/cls/rgw/cls_rgw_ops.h @@ -30,29 +30,33 @@ WRITE_CLASS_ENCODER(rgw_cls_tag_timeout_op) struct rgw_cls_obj_prepare_op { RGWModifyOp op; - string name; + cls_rgw_obj_key key; string tag; string locator; bool log_op; + uint16_t bilog_flags; - rgw_cls_obj_prepare_op() : op(CLS_RGW_OP_UNKNOWN), log_op(false) {} + rgw_cls_obj_prepare_op() : op(CLS_RGW_OP_UNKNOWN), log_op(false), bilog_flags(0) {} void encode(bufferlist &bl) const { - ENCODE_START(4, 3, bl); + ENCODE_START(6, 5, bl); uint8_t c = (uint8_t)op; ::encode(c, bl); - ::encode(name, bl); ::encode(tag, bl); ::encode(locator, bl); ::encode(log_op, bl); + ::encode(key, bl); + ::encode(bilog_flags, bl); ENCODE_FINISH(bl); } void decode(bufferlist::iterator &bl) { - DECODE_START_LEGACY_COMPAT_LEN(4, 3, 3, bl); + DECODE_START_LEGACY_COMPAT_LEN(6, 3, 3, bl); uint8_t c; ::decode(c, bl); op = (RGWModifyOp)c; - ::decode(name, bl); + if (struct_v < 5) { + ::decode(key.name, bl); + } ::decode(tag, bl); if (struct_v >= 2) { ::decode(locator, bl); @@ -60,6 +64,12 @@ struct rgw_cls_obj_prepare_op if (struct_v >= 4) { ::decode(log_op, bl); } + if (struct_v >= 5) { + ::decode(key, bl); + } + if (struct_v >= 6) { + ::decode(bilog_flags, bl); + } DECODE_FINISH(bl); } void dump(Formatter *f) const; @@ -70,22 +80,22 @@ WRITE_CLASS_ENCODER(rgw_cls_obj_prepare_op) struct rgw_cls_obj_complete_op { RGWModifyOp op; - string name; + cls_rgw_obj_key key; string locator; rgw_bucket_entry_ver ver; struct rgw_bucket_dir_entry_meta meta; string tag; bool log_op; + uint16_t bilog_flags; - list remove_objs; + list remove_objs; - rgw_cls_obj_complete_op() : op(CLS_RGW_OP_ADD), log_op(false) {} + rgw_cls_obj_complete_op() : op(CLS_RGW_OP_ADD), log_op(false), bilog_flags(0) {} void encode(bufferlist &bl) const { - ENCODE_START(6, 3, bl); + ENCODE_START(8, 7, bl); uint8_t c = (uint8_t)op; ::encode(c, bl); - ::encode(name, bl); ::encode(ver.epoch, bl); ::encode(meta, bl); ::encode(tag, bl); @@ -93,21 +103,35 @@ struct rgw_cls_obj_complete_op ::encode(remove_objs, bl); ::encode(ver, bl); ::encode(log_op, bl); + ::encode(key, bl); + ::encode(bilog_flags, bl); ENCODE_FINISH(bl); } void decode(bufferlist::iterator &bl) { - DECODE_START_LEGACY_COMPAT_LEN(6, 3, 3, bl); + DECODE_START_LEGACY_COMPAT_LEN(8, 3, 3, bl); uint8_t c; ::decode(c, bl); op = (RGWModifyOp)c; - ::decode(name, bl); + if (struct_v < 7) { + ::decode(key.name, bl); + } ::decode(ver.epoch, bl); ::decode(meta, bl); ::decode(tag, bl); if (struct_v >= 2) { ::decode(locator, bl); } - if (struct_v >= 4) { + if (struct_v >= 4 && struct_v < 7) { + list old_remove_objs; + ::decode(old_remove_objs, bl); + + for (list::iterator iter = old_remove_objs.begin(); + iter != old_remove_objs.end(); ++iter) { + cls_rgw_obj_key k; + k.name = *iter; + remove_objs.push_back(k); + } + } 
else { ::decode(remove_objs, bl); } if (struct_v >= 5) { @@ -118,6 +142,12 @@ struct rgw_cls_obj_complete_op if (struct_v >= 6) { ::decode(log_op, bl); } + if (struct_v >= 7) { + ::decode(key, bl); + } + if (struct_v >= 8) { + ::decode(bilog_flags, bl); + } DECODE_FINISH(bl); } void dump(Formatter *f) const; @@ -125,27 +155,216 @@ struct rgw_cls_obj_complete_op }; WRITE_CLASS_ENCODER(rgw_cls_obj_complete_op) +struct rgw_cls_link_olh_op { + cls_rgw_obj_key key; + string olh_tag; + bool delete_marker; + string op_tag; + struct rgw_bucket_dir_entry_meta meta; + uint64_t olh_epoch; + bool log_op; + uint16_t bilog_flags; + + rgw_cls_link_olh_op() : delete_marker(false), olh_epoch(0), log_op(false), bilog_flags(0) {} + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + ::encode(key, bl); + ::encode(olh_tag, bl); + ::encode(delete_marker, bl); + ::encode(op_tag, bl); + ::encode(meta, bl); + ::encode(olh_epoch, bl); + ::encode(log_op, bl); + ::encode(bilog_flags, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::iterator& bl) { + DECODE_START(1, bl); + ::decode(key, bl); + ::decode(olh_tag, bl); + ::decode(delete_marker, bl); + ::decode(op_tag, bl); + ::decode(meta, bl); + ::decode(olh_epoch, bl); + ::decode(log_op, bl); + ::decode(bilog_flags, bl); + DECODE_FINISH(bl); + } + + static void generate_test_instances(list& o); + void dump(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(rgw_cls_link_olh_op) + +struct rgw_cls_unlink_instance_op { + cls_rgw_obj_key key; + string op_tag; + uint64_t olh_epoch; + bool log_op; + uint16_t bilog_flags; + + rgw_cls_unlink_instance_op() : olh_epoch(0), log_op(false), bilog_flags(0) {} + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + ::encode(key, bl); + ::encode(op_tag, bl); + ::encode(olh_epoch, bl); + ::encode(log_op, bl); + ::encode(bilog_flags, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::iterator& bl) { + DECODE_START(1, bl); + ::decode(key, bl); + ::decode(op_tag, bl); + ::decode(olh_epoch, bl); + ::decode(log_op, bl); + ::decode(bilog_flags, bl); + DECODE_FINISH(bl); + } + + static void generate_test_instances(list& o); + void dump(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(rgw_cls_unlink_instance_op) + +struct rgw_cls_read_olh_log_op +{ + cls_rgw_obj_key olh; + uint64_t ver_marker; + string olh_tag; + + rgw_cls_read_olh_log_op() : ver_marker(0) {} + + void encode(bufferlist &bl) const { + ENCODE_START(1, 1, bl); + ::encode(olh, bl); + ::encode(ver_marker, bl); + ::encode(olh_tag, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::iterator &bl) { + DECODE_START(1, bl); + ::decode(olh, bl); + ::decode(ver_marker, bl); + ::decode(olh_tag, bl); + DECODE_FINISH(bl); + } + static void generate_test_instances(list& o); + void dump(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(rgw_cls_read_olh_log_op) + + +struct rgw_cls_read_olh_log_ret +{ + map > log; + bool is_truncated; + + rgw_cls_read_olh_log_ret() : is_truncated(false) {} + + void encode(bufferlist &bl) const { + ENCODE_START(1, 1, bl); + ::encode(log, bl); + ::encode(is_truncated, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::iterator &bl) { + DECODE_START(1, bl); + ::decode(log, bl); + ::decode(is_truncated, bl); + DECODE_FINISH(bl); + } + static void generate_test_instances(list& o); + void dump(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(rgw_cls_read_olh_log_ret) + +struct rgw_cls_trim_olh_log_op +{ + cls_rgw_obj_key olh; + uint64_t ver; + string olh_tag; + + rgw_cls_trim_olh_log_op() : ver(0) {} + + void encode(bufferlist &bl) 
const { + ENCODE_START(1, 1, bl); + ::encode(olh, bl); + ::encode(ver, bl); + ::encode(olh_tag, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::iterator &bl) { + DECODE_START(1, bl); + ::decode(olh, bl); + ::decode(ver, bl); + ::decode(olh_tag, bl); + DECODE_FINISH(bl); + } + static void generate_test_instances(list& o); + void dump(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(rgw_cls_trim_olh_log_op) + +struct rgw_cls_bucket_clear_olh_op { + cls_rgw_obj_key key; + string olh_tag; + + rgw_cls_bucket_clear_olh_op() {} + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + ::encode(key, bl); + ::encode(olh_tag, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::iterator& bl) { + DECODE_START(1, bl); + ::decode(key, bl); + ::decode(olh_tag, bl); + DECODE_FINISH(bl); + } + + static void generate_test_instances(list& o); + void dump(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(rgw_cls_bucket_clear_olh_op) + struct rgw_cls_list_op { - string start_obj; + cls_rgw_obj_key start_obj; uint32_t num_entries; string filter_prefix; + bool list_versions; - rgw_cls_list_op() : num_entries(0) {} + rgw_cls_list_op() : num_entries(0), list_versions(false) {} void encode(bufferlist &bl) const { - ENCODE_START(3, 2, bl); - ::encode(start_obj, bl); + ENCODE_START(5, 4, bl); ::encode(num_entries, bl); ::encode(filter_prefix, bl); + ::encode(start_obj, bl); + ::encode(list_versions, bl); ENCODE_FINISH(bl); } void decode(bufferlist::iterator &bl) { - DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl); - ::decode(start_obj, bl); + DECODE_START_LEGACY_COMPAT_LEN(5, 2, 2, bl); + if (struct_v < 4) { + ::decode(start_obj.name, bl); + } ::decode(num_entries, bl); if (struct_v >= 3) ::decode(filter_prefix, bl); + if (struct_v >= 4) + ::decode(start_obj, bl); + if (struct_v >= 5) + ::decode(list_versions, bl); DECODE_FINISH(bl); } void dump(Formatter *f) const; @@ -197,10 +416,49 @@ struct rgw_cls_check_index_ret DECODE_FINISH(bl); } void dump(Formatter *f) const; - static void generate_test_instances(list& o); + static void generate_test_instances(list& o); }; WRITE_CLASS_ENCODER(rgw_cls_check_index_ret) +struct rgw_cls_obj_remove_op { + list keep_attr_prefixes; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + ::encode(keep_attr_prefixes, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::iterator& bl) { + DECODE_START(1, bl); + ::decode(keep_attr_prefixes, bl); + DECODE_FINISH(bl); + } +}; +WRITE_CLASS_ENCODER(rgw_cls_obj_remove_op) + +struct rgw_cls_obj_check_attrs_prefix { + string check_prefix; + bool fail_if_exist; + + rgw_cls_obj_check_attrs_prefix() : fail_if_exist(false) {} + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + ::encode(check_prefix, bl); + ::encode(fail_if_exist, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::iterator& bl) { + DECODE_START(1, bl); + ::decode(check_prefix, bl); + ::decode(fail_if_exist, bl); + DECODE_FINISH(bl); + } +}; +WRITE_CLASS_ENCODER(rgw_cls_obj_check_attrs_prefix) + struct rgw_cls_usage_log_add_op { rgw_usage_log_info info; @@ -218,6 +476,115 @@ struct rgw_cls_usage_log_add_op { }; WRITE_CLASS_ENCODER(rgw_cls_usage_log_add_op) +struct rgw_cls_bi_get_op { + cls_rgw_obj_key key; + BIIndexType type; /* namespace: plain, instance, olh */ + + rgw_cls_bi_get_op() : type(PlainIdx) {} + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + ::encode(key, bl); + ::encode((uint8_t)type, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::iterator& bl) { + DECODE_START(1, bl); + ::decode(key, 
bl); + uint8_t c; + ::decode(c, bl); + type = (BIIndexType)c; + DECODE_FINISH(bl); + } +}; +WRITE_CLASS_ENCODER(rgw_cls_bi_get_op) + +struct rgw_cls_bi_get_ret { + rgw_cls_bi_entry entry; + + rgw_cls_bi_get_ret() {} + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + ::encode(entry, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::iterator& bl) { + DECODE_START(1, bl); + ::decode(entry, bl); + DECODE_FINISH(bl); + } +}; +WRITE_CLASS_ENCODER(rgw_cls_bi_get_ret) + +struct rgw_cls_bi_put_op { + rgw_cls_bi_entry entry; + + rgw_cls_bi_put_op() {} + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + ::encode(entry, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::iterator& bl) { + DECODE_START(1, bl); + ::decode(entry, bl); + DECODE_FINISH(bl); + } +}; +WRITE_CLASS_ENCODER(rgw_cls_bi_put_op) + +struct rgw_cls_bi_list_op { + uint32_t max; + string name; + string marker; + + rgw_cls_bi_list_op() : max(0) {} + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + ::encode(max, bl); + ::encode(name, bl); + ::encode(marker, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::iterator& bl) { + DECODE_START(1, bl); + ::decode(max, bl); + ::decode(name, bl); + ::decode(marker, bl); + DECODE_FINISH(bl); + } +}; +WRITE_CLASS_ENCODER(rgw_cls_bi_list_op) + +struct rgw_cls_bi_list_ret { + list entries; + bool is_truncated; + + rgw_cls_bi_list_ret() : is_truncated(false) {} + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + ::encode(entries, bl); + ::encode(is_truncated, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::iterator& bl) { + DECODE_START(1, bl); + ::decode(entries, bl); + ::decode(is_truncated, bl); + DECODE_FINISH(bl); + } +}; +WRITE_CLASS_ENCODER(rgw_cls_bi_list_ret) + struct rgw_cls_usage_log_read_op { uint64_t start_epoch; uint64_t end_epoch; diff --git a/src/cls/rgw/cls_rgw_types.cc b/src/cls/rgw/cls_rgw_types.cc index f1c50bec546f0..faec1a304012a 100644 --- a/src/cls/rgw/cls_rgw_types.cc +++ b/src/cls/rgw/cls_rgw_types.cc @@ -1,6 +1,7 @@ #include "cls/rgw/cls_rgw_types.h" #include "common/Formatter.h" +#include "common/ceph_json.h" void rgw_bucket_pending_info::generate_test_instances(list& o) @@ -14,9 +15,23 @@ void rgw_bucket_pending_info::generate_test_instances(listdump_int("state", (int)state); - f->dump_stream("timestamp") << timestamp; - f->dump_int("op", (int)op); + encode_json("state", (int)state, f); + encode_json("timestamp", timestamp, f); + encode_json("op", (int)op, f); +} + +void rgw_bucket_pending_info::decode_json(JSONObj *obj) { + int val; + JSONDecoder::decode_json("state", val, obj); + state = (RGWPendingState)val; + JSONDecoder::decode_json("timestamp", timestamp, obj); + JSONDecoder::decode_json("op", val, obj); + op = (uint8_t)val; +} + +void cls_rgw_obj_key::decode_json(JSONObj *obj) { + JSONDecoder::decode_json("name", name, obj); + JSONDecoder::decode_json("instance", instance, obj); } void rgw_bucket_dir_entry_meta::generate_test_instances(list& o) @@ -34,13 +49,27 @@ void rgw_bucket_dir_entry_meta::generate_test_instances(listdump_int("category", category); - f->dump_unsigned("size", size); - f->dump_stream("mtime") << mtime; - f->dump_string("etag", etag); - f->dump_string("owner", owner); - f->dump_string("owner_display_name", owner_display_name); - f->dump_string("content_type", content_type); + encode_json("category", (int)category, f); + encode_json("size", size, f); + encode_json("mtime", mtime, f); + encode_json("etag", etag, f); + encode_json("owner", owner, f); + 
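The rgw_cls_bi_list_op / rgw_cls_bi_list_ret pair above follows RGW's usual marker-based pagination: the caller resends the last index key it saw as `marker` and keeps calling while `is_truncated` is set. A minimal standalone sketch of the calling side, where `bi_list_call` is a hypothetical stand-in for the actual class-method invocation:

    #include <stdint.h>
    #include <list>
    #include <string>

    struct bi_entry { std::string idx; };
    struct bi_list_result { std::list<bi_entry> entries; bool is_truncated; };

    // hypothetical stand-in for invoking the "bi_list" class method
    bi_list_result bi_list_call(const std::string& marker, uint32_t max);

    void list_whole_index() {
      std::string marker;                       // empty marker = start of index
      bool truncated = true;
      while (truncated) {
        bi_list_result r = bi_list_call(marker, 1000);
        for (std::list<bi_entry>::iterator i = r.entries.begin();
             i != r.entries.end(); ++i) {
          marker = i->idx;                      // resume point for the next call
        }
        truncated = r.is_truncated;
        if (r.entries.empty())
          break;                                // defensive: no progress, stop
      }
    }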
encode_json("owner_display_name", owner_display_name, f); + encode_json("content_type", content_type, f); + encode_json("accounted_size", accounted_size, f); +} + +void rgw_bucket_dir_entry_meta::decode_json(JSONObj *obj) { + int val; + JSONDecoder::decode_json("category", val, obj); + category = (uint8_t)val; + JSONDecoder::decode_json("size", size, obj); + JSONDecoder::decode_json("mtime", mtime, obj); + JSONDecoder::decode_json("etag", etag, obj); + JSONDecoder::decode_json("owner", owner, obj); + JSONDecoder::decode_json("owner_display_name", owner_display_name, obj); + JSONDecoder::decode_json("content_type", content_type, obj); + JSONDecoder::decode_json("accounted_size", accounted_size, obj); } void rgw_bucket_dir_entry::generate_test_instances(list& o) @@ -52,7 +81,7 @@ void rgw_bucket_dir_entry::generate_test_instances(list& for (iter = l.begin(); iter != l.end(); ++iter) { rgw_bucket_dir_entry_meta *m = *iter; rgw_bucket_dir_entry *e = new rgw_bucket_dir_entry; - e->name = "name"; + e->key.name = "name"; e->ver.pool = 1; e->ver.epoch = 1234; e->locator = "locator"; @@ -69,8 +98,13 @@ void rgw_bucket_dir_entry::generate_test_instances(list& void rgw_bucket_entry_ver::dump(Formatter *f) const { - f->dump_int("pool", pool); - f->dump_unsigned("epoch", epoch); + encode_json("pool", pool, f); + encode_json("epoch", epoch, f); +} + +void rgw_bucket_entry_ver::decode_json(JSONObj *obj) { + JSONDecoder::decode_json("pool", pool, obj); + JSONDecoder::decode_json("epoch", epoch, obj); } void rgw_bucket_entry_ver::generate_test_instances(list& ls) @@ -84,28 +118,196 @@ void rgw_bucket_entry_ver::generate_test_instances(list& void rgw_bucket_dir_entry::dump(Formatter *f) const { - f->dump_string("name", name); - f->open_object_section("ver"); - ver.dump(f); - f->close_section(); - f->dump_string("locator", locator); - f->dump_int("exists", (int)exists); - f->open_object_section("meta"); - meta.dump(f); - f->close_section(); - f->dump_string("tag", tag); + encode_json("name", key.name, f); + encode_json("instance", key.instance , f); + encode_json("ver", ver , f); + encode_json("locator", locator , f); + encode_json("exists", exists , f); + encode_json("meta", meta , f); + encode_json("tag", tag , f); + encode_json("flags", (int)flags , f); + encode_json("pending_map", pending_map, f); + encode_json("versioned_epoch", versioned_epoch , f); +} - map::const_iterator iter = pending_map.begin(); - f->open_array_section("pending_map"); - for (; iter != pending_map.end(); ++iter) { - f->dump_string("tag", iter->first); - f->open_object_section("info"); - iter->second.dump(f); - f->close_section(); +void rgw_bucket_dir_entry::decode_json(JSONObj *obj) { + JSONDecoder::decode_json("name", key.name, obj); + JSONDecoder::decode_json("instance", key.instance , obj); + JSONDecoder::decode_json("ver", ver , obj); + JSONDecoder::decode_json("locator", locator , obj); + JSONDecoder::decode_json("exists", exists , obj); + JSONDecoder::decode_json("meta", meta , obj); + JSONDecoder::decode_json("tag", tag , obj); + int val; + JSONDecoder::decode_json("flags", val , obj); + flags = (uint16_t)val; + JSONDecoder::decode_json("pending_map", pending_map, obj); + JSONDecoder::decode_json("versioned_epoch", versioned_epoch, obj); +} + +static void dump_bi_entry(bufferlist bl, BIIndexType index_type, Formatter *formatter) +{ + bufferlist::iterator iter = bl.begin(); + switch (index_type) { + case PlainIdx: + case InstanceIdx: + { + rgw_bucket_dir_entry entry; + ::decode(entry, iter); + encode_json("entry", entry, 
formatter); + } + break; + case OLHIdx: + { + rgw_bucket_olh_entry entry; + ::decode(entry, iter); + encode_json("entry", entry, formatter); + } + break; + default: + break; + } +} + +void rgw_cls_bi_entry::decode_json(JSONObj *obj, cls_rgw_obj_key *effective_key) { + JSONDecoder::decode_json("idx", idx, obj); + string s; + JSONDecoder::decode_json("type", s, obj); + if (s == "plain") { + type = PlainIdx; + } else if (s == "instance") { + type = InstanceIdx; + } else if (s == "olh") { + type = OLHIdx; + } else { + type = InvalidIdx; + } + switch (type) { + case PlainIdx: + case InstanceIdx: + { + rgw_bucket_dir_entry entry; + JSONDecoder::decode_json("entry", entry, obj); + ::encode(entry, data); + + if (effective_key) { + *effective_key = entry.key; + } + } + break; + case OLHIdx: + { + rgw_bucket_olh_entry entry; + JSONDecoder::decode_json("entry", entry, obj); + ::encode(entry, data); + + if (effective_key) { + *effective_key = entry.key; + } + } + break; + default: + break; } - f->close_section(); } +void rgw_cls_bi_entry::dump(Formatter *f) const +{ + string type_str; + switch (type) { + case PlainIdx: + type_str = "plain"; + break; + case InstanceIdx: + type_str = "instance"; + break; + case OLHIdx: + type_str = "olh"; + break; + default: + type_str = "invalid"; + } + encode_json("type", type_str, f); + encode_json("idx", idx, f); + dump_bi_entry(data, type, f); +} + +void rgw_bucket_olh_entry::dump(Formatter *f) const +{ + encode_json("key", key, f); + encode_json("delete_marker", delete_marker, f); + encode_json("epoch", epoch, f); + encode_json("pending_log", pending_log, f); + encode_json("tag", tag, f); + encode_json("exists", exists, f); + encode_json("pending_removal", pending_removal, f); +} + +void rgw_bucket_olh_entry::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("key", key, obj); + JSONDecoder::decode_json("delete_marker", delete_marker, obj); + JSONDecoder::decode_json("epoch", epoch, obj); + JSONDecoder::decode_json("pending_log", pending_log, obj); + JSONDecoder::decode_json("tag", tag, obj); + JSONDecoder::decode_json("exists", exists, obj); + JSONDecoder::decode_json("pending_removal", pending_removal, obj); +} + +void rgw_bucket_olh_log_entry::generate_test_instances(list& o) +{ + rgw_bucket_olh_log_entry *entry = new rgw_bucket_olh_log_entry; + entry->epoch = 1234; + entry->op = CLS_RGW_OLH_OP_LINK_OLH; + entry->op_tag = "op_tag"; + entry->key.name = "key.name"; + entry->key.instance = "key.instance"; + entry->delete_marker = true; + o.push_back(entry); + o.push_back(new rgw_bucket_olh_log_entry); +} + +void rgw_bucket_olh_log_entry::dump(Formatter *f) const +{ + encode_json("epoch", epoch, f); + const char *op_str; + switch (op) { + case CLS_RGW_OLH_OP_LINK_OLH: + op_str = "link_olh"; + break; + case CLS_RGW_OLH_OP_UNLINK_OLH: + op_str = "unlink_olh"; + break; + case CLS_RGW_OLH_OP_REMOVE_INSTANCE: + op_str = "remove_instance"; + break; + default: + op_str = "unknown"; + } + encode_json("op", op_str, f); + encode_json("op_tag", op_tag, f); + encode_json("key", key, f); + encode_json("delete_marker", delete_marker, f); +} + +void rgw_bucket_olh_log_entry::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("epoch", epoch, obj); + string op_str; + JSONDecoder::decode_json("op", op_str, obj); + if (op_str == "link_olh") { + op = CLS_RGW_OLH_OP_LINK_OLH; + } else if (op_str == "unlink_olh") { + op = CLS_RGW_OLH_OP_UNLINK_OLH; + } else if (op_str == "remove_instance") { + op = CLS_RGW_OLH_OP_REMOVE_INSTANCE; + } else { + op = CLS_RGW_OLH_OP_UNKNOWN; + } + 
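The dump/decode_json pairs above keep enums symmetric by serializing them as stable strings rather than raw integers, and by mapping anything unrecognized back to the UNKNOWN value so an older decoder tolerates logs written by newer code. The same convention in a standalone form (illustrative names, not the Ceph API):

    #include <string>

    enum Op { OP_UNKNOWN = 0, OP_LINK = 1, OP_UNLINK = 2, OP_REMOVE = 3 };

    const char *op_to_str(Op op) {
      switch (op) {
      case OP_LINK:   return "link_olh";
      case OP_UNLINK: return "unlink_olh";
      case OP_REMOVE: return "remove_instance";
      default:        return "unknown";        // never fails on a new op value
      }
    }

    Op str_to_op(const std::string& s) {
      if (s == "link_olh")        return OP_LINK;
      if (s == "unlink_olh")      return OP_UNLINK;
      if (s == "remove_instance") return OP_REMOVE;
      return OP_UNKNOWN;                       // forward compatible on decode
    }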
JSONDecoder::decode_json("op_tag", op_tag, obj); + JSONDecoder::decode_json("key", key, obj); + JSONDecoder::decode_json("delete_marker", delete_marker, obj); +} void rgw_bi_log_entry::dump(Formatter *f) const { f->dump_string("op_id", id); @@ -123,12 +325,22 @@ void rgw_bi_log_entry::dump(Formatter *f) const case CLS_RGW_OP_UNKNOWN: f->dump_string("op", "unknown"); break; + case CLS_RGW_OP_LINK_OLH: + f->dump_string("op", "link_olh"); + break; + case CLS_RGW_OP_LINK_OLH_DM: + f->dump_string("op", "link_olh_del"); + break; + case CLS_RGW_OP_UNLINK_INSTANCE: + f->dump_string("op", "unlink_instance"); + break; default: f->dump_string("op", "invalid"); break; } f->dump_string("object", object); + f->dump_string("instance", instance); switch (state) { case CLS_RGW_STATE_PENDING_MODIFY: @@ -147,6 +359,7 @@ void rgw_bi_log_entry::dump(Formatter *f) const f->open_object_section("ver"); ver.dump(f); f->close_section(); + f->dump_bool("versioned", (bilog_flags & RGW_BILOG_FLAG_VERSIONED_OP) != 0); } void rgw_bi_log_entry::generate_test_instances(list& ls) @@ -230,7 +443,7 @@ void rgw_bucket_dir::generate_test_instances(list& o) list::iterator eiter; for (eiter = el.begin(); eiter != el.end(); ++eiter) { rgw_bucket_dir_entry *e = *eiter; - d->m[e->name] = *e; + d->m[e->key.name] = *e; delete e; } @@ -251,7 +464,7 @@ void rgw_bucket_dir::dump(Formatter *f) const map::const_iterator iter = m.begin(); f->open_array_section("map"); for (; iter != m.end(); ++iter) { - f->dump_string("obj", iter->first); + f->dump_string("key", iter->first); f->open_object_section("dir_entry"); iter->second.dump(f); f->close_section(); diff --git a/src/cls/rgw/cls_rgw_types.h b/src/cls/rgw/cls_rgw_types.h index 1ed56ba0da895..dfa9286c0008e 100644 --- a/src/cls/rgw/cls_rgw_types.h +++ b/src/cls/rgw/cls_rgw_types.h @@ -11,6 +11,8 @@ #define CEPH_RGW_UPDATE 'u' #define CEPH_RGW_TAG_TIMEOUT 60*60*24 +class JSONObj; + namespace ceph { class Formatter; } @@ -25,6 +27,13 @@ enum RGWModifyOp { CLS_RGW_OP_DEL = 1, CLS_RGW_OP_CANCEL = 2, CLS_RGW_OP_UNKNOWN = 3, + CLS_RGW_OP_LINK_OLH = 4, + CLS_RGW_OP_LINK_OLH_DM = 5, /* creation of delete marker */ + CLS_RGW_OP_UNLINK_INSTANCE = 6, +}; + +enum RGWBILogFlags { + RGW_BILOG_FLAG_VERSIONED_OP = 0x1, }; struct rgw_bucket_pending_info { @@ -52,6 +61,7 @@ struct rgw_bucket_pending_info { DECODE_FINISH(bl); } void dump(Formatter *f) const; + void decode_json(JSONObj *obj); static void generate_test_instances(list& o); }; WRITE_CLASS_ENCODER(rgw_bucket_pending_info) @@ -64,12 +74,13 @@ struct rgw_bucket_dir_entry_meta { string owner; string owner_display_name; string content_type; + uint64_t accounted_size; rgw_bucket_dir_entry_meta() : - category(0), size(0) { mtime.set_from_double(0); } + category(0), size(0), accounted_size(0) { mtime.set_from_double(0); } void encode(bufferlist &bl) const { - ENCODE_START(3, 3, bl); + ENCODE_START(4, 3, bl); ::encode(category, bl); ::encode(size, bl); ::encode(mtime, bl); @@ -77,10 +88,11 @@ struct rgw_bucket_dir_entry_meta { ::encode(owner, bl); ::encode(owner_display_name, bl); ::encode(content_type, bl); + ::encode(accounted_size, bl); ENCODE_FINISH(bl); } void decode(bufferlist::iterator &bl) { - DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl); + DECODE_START_LEGACY_COMPAT_LEN(4, 3, 3, bl); ::decode(category, bl); ::decode(size, bl); ::decode(mtime, bl); @@ -89,9 +101,14 @@ struct rgw_bucket_dir_entry_meta { ::decode(owner_display_name, bl); if (struct_v >= 2) ::decode(content_type, bl); + if (struct_v >= 4) + ::decode(accounted_size, bl); + else + 
accounted_size = size; DECODE_FINISH(bl); } void dump(Formatter *f) const; + void decode_json(JSONObj *obj); static void generate_test_instances(list& o); }; WRITE_CLASS_ENCODER(rgw_bucket_dir_entry_meta) @@ -189,27 +206,80 @@ struct rgw_bucket_entry_ver { DECODE_FINISH(bl); } void dump(Formatter *f) const; + void decode_json(JSONObj *obj); static void generate_test_instances(list& o); }; WRITE_CLASS_ENCODER(rgw_bucket_entry_ver) +struct cls_rgw_obj_key { + string name; + string instance; + + cls_rgw_obj_key() {} + cls_rgw_obj_key(const string &_name) : name(_name) {} + cls_rgw_obj_key(const string& n, const string& i) : name(n), instance(i) {} + + bool operator==(const cls_rgw_obj_key& k) const { + return (name.compare(k.name) == 0) && + (instance.compare(k.instance) == 0); + } + bool operator<(const cls_rgw_obj_key& k) const { + int r = name.compare(k.name); + if (r == 0) { + r = instance.compare(k.instance); + } + return (r < 0); + } + void encode(bufferlist &bl) const { + ENCODE_START(1, 1, bl); + ::encode(name, bl); + ::encode(instance, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::iterator &bl) { + DECODE_START(1, bl); + ::decode(name, bl); + ::decode(instance, bl); + DECODE_FINISH(bl); + } + void dump(Formatter *f) const { + f->dump_string("name", name); + f->dump_string("instance", instance); + } + void decode_json(JSONObj *obj); + static void generate_test_instances(list& ls) { + ls.push_back(new cls_rgw_obj_key); + ls.push_back(new cls_rgw_obj_key); + ls.back()->name = "name"; + ls.back()->instance = "instance"; + } +}; +WRITE_CLASS_ENCODER(cls_rgw_obj_key) + + +#define RGW_BUCKET_DIRENT_FLAG_VER 0x1 /* a versioned object instance */ +#define RGW_BUCKET_DIRENT_FLAG_CURRENT 0x2 /* the last object instance of a versioned object */ +#define RGW_BUCKET_DIRENT_FLAG_DELETE_MARKER 0x4 /* delete marker */ +#define RGW_BUCKET_DIRENT_FLAG_VER_MARKER 0x8 /* object is versioned, a placeholder for the plain entry */ struct rgw_bucket_dir_entry { - std::string name; + cls_rgw_obj_key key; rgw_bucket_entry_ver ver; std::string locator; bool exists; struct rgw_bucket_dir_entry_meta meta; - map pending_map; + multimap pending_map; uint64_t index_ver; string tag; + uint16_t flags; + uint64_t versioned_epoch; rgw_bucket_dir_entry() : - exists(false), index_ver(0) {} + exists(false), index_ver(0), flags(0), versioned_epoch(0) {} void encode(bufferlist &bl) const { - ENCODE_START(5, 3, bl); - ::encode(name, bl); + ENCODE_START(8, 3, bl); + ::encode(key.name, bl); ::encode(ver.epoch, bl); ::encode(exists, bl); ::encode(meta, bl); @@ -218,11 +288,14 @@ struct rgw_bucket_dir_entry { ::encode(ver, bl); ::encode_packed_val(index_ver, bl); ::encode(tag, bl); + ::encode(key.instance, bl); + ::encode(flags, bl); + ::encode(versioned_epoch, bl); ENCODE_FINISH(bl); } void decode(bufferlist::iterator &bl) { - DECODE_START_LEGACY_COMPAT_LEN(5, 3, 3, bl); - ::decode(name, bl); + DECODE_START_LEGACY_COMPAT_LEN(6, 3, 3, bl); + ::decode(key.name, bl); ::decode(ver.epoch, bl); ::decode(exists, bl); ::decode(meta, bl); @@ -239,27 +312,169 @@ struct rgw_bucket_dir_entry { ::decode_packed_val(index_ver, bl); ::decode(tag, bl); } + if (struct_v >= 6) { + ::decode(key.instance, bl); + } + if (struct_v >= 7) { + ::decode(flags, bl); + } + if (struct_v >= 8) { + ::decode(versioned_epoch, bl); + } DECODE_FINISH(bl); } + + bool is_current() { + int test_flags = RGW_BUCKET_DIRENT_FLAG_VER | RGW_BUCKET_DIRENT_FLAG_CURRENT; + return (flags & RGW_BUCKET_DIRENT_FLAG_VER) == 0 || + (flags & test_flags) == test_flags; + } 
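The four RGW_BUCKET_DIRENT_FLAG_* bits and the predicates built on them are easiest to read side by side. A standalone restatement of the logic (same semantics as the methods above): a plain, never-versioned entry has no VER bit and is always current; a versioned instance is current only when it also carries CURRENT; and a current delete marker makes the object invisible to plain listings.

    #include <stdint.h>

    const uint16_t FLAG_VER           = 0x1;  // entry is a versioned object instance
    const uint16_t FLAG_CURRENT       = 0x2;  // it is the latest instance
    const uint16_t FLAG_DELETE_MARKER = 0x4;  // instance is a delete marker

    bool is_current(uint16_t flags) {
      const uint16_t test = FLAG_VER | FLAG_CURRENT;
      return (flags & FLAG_VER) == 0 ||       // plain entries are always current
             (flags & test) == test;          // versioned: needs both bits set
    }

    bool is_visible(uint16_t flags) {
      return is_current(flags) && (flags & FLAG_DELETE_MARKER) == 0;
    }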
+ bool is_delete_marker() { return (flags & RGW_BUCKET_DIRENT_FLAG_DELETE_MARKER) != 0; } + bool is_visible() { + return is_current() && !is_delete_marker(); + } + bool is_valid() { return (flags & RGW_BUCKET_DIRENT_FLAG_VER_MARKER) == 0; } + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); static void generate_test_instances(list& o); }; WRITE_CLASS_ENCODER(rgw_bucket_dir_entry) +enum BIIndexType { + InvalidIdx = 0, + PlainIdx = 1, + InstanceIdx = 2, + OLHIdx = 3, +}; + +struct rgw_cls_bi_entry { + BIIndexType type; + string idx; + bufferlist data; + + rgw_cls_bi_entry() : type(InvalidIdx) {} + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + ::encode((uint8_t)type, bl); + ::encode(idx, bl); + ::encode(data, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::iterator& bl) { + DECODE_START(1, bl); + uint8_t c; + ::decode(c, bl); + type = (BIIndexType)c; + ::decode(idx, bl); + ::decode(data, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; + void decode_json(JSONObj *obj, cls_rgw_obj_key *effective_key = NULL); +}; +WRITE_CLASS_ENCODER(rgw_cls_bi_entry) + +enum OLHLogOp { + CLS_RGW_OLH_OP_UNKNOWN = 0, + CLS_RGW_OLH_OP_LINK_OLH = 1, + CLS_RGW_OLH_OP_UNLINK_OLH = 2, /* object does not exist */ + CLS_RGW_OLH_OP_REMOVE_INSTANCE = 3, +}; + +struct rgw_bucket_olh_log_entry { + uint64_t epoch; + OLHLogOp op; + string op_tag; + cls_rgw_obj_key key; + bool delete_marker; + + rgw_bucket_olh_log_entry() : epoch(0), op(CLS_RGW_OLH_OP_UNKNOWN), delete_marker(false) {} + + + void encode(bufferlist &bl) const { + ENCODE_START(1, 1, bl); + ::encode(epoch, bl); + ::encode((__u8)op, bl); + ::encode(op_tag, bl); + ::encode(key, bl); + ::encode(delete_marker, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::iterator &bl) { + DECODE_START(1, bl); + ::decode(epoch, bl); + uint8_t c; + ::decode(c, bl); + op = (OLHLogOp)c; + ::decode(op_tag, bl); + ::decode(key, bl); + ::decode(delete_marker, bl); + DECODE_FINISH(bl); + } + static void generate_test_instances(list& o); + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); +}; +WRITE_CLASS_ENCODER(rgw_bucket_olh_log_entry) + +struct rgw_bucket_olh_entry { + cls_rgw_obj_key key; + bool delete_marker; + uint64_t epoch; + map > pending_log; + string tag; + bool exists; + bool pending_removal; + + rgw_bucket_olh_entry() : delete_marker(false), epoch(0), exists(false), pending_removal(false) {} + + void encode(bufferlist &bl) const { + ENCODE_START(1, 1, bl); + ::encode(key, bl); + ::encode(delete_marker, bl); + ::encode(epoch, bl); + ::encode(pending_log, bl); + ::encode(tag, bl); + ::encode(exists, bl); + ::encode(pending_removal, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::iterator &bl) { + DECODE_START(1, bl); + ::decode(key, bl); + ::decode(delete_marker, bl); + ::decode(epoch, bl); + ::decode(pending_log, bl); + ::decode(tag, bl); + ::decode(exists, bl); + ::decode(pending_removal, bl); + DECODE_FINISH(bl); + } + void dump(Formatter *f) const; + void decode_json(JSONObj *obj); +}; +WRITE_CLASS_ENCODER(rgw_bucket_olh_entry) + struct rgw_bi_log_entry { string id; string object; + string instance; utime_t timestamp; rgw_bucket_entry_ver ver; RGWModifyOp op; RGWPendingState state; uint64_t index_ver; string tag; + uint16_t bilog_flags; - rgw_bi_log_entry() : op(CLS_RGW_OP_UNKNOWN), state(CLS_RGW_STATE_PENDING_MODIFY), index_ver(0) {} + rgw_bi_log_entry() : op(CLS_RGW_OP_UNKNOWN), state(CLS_RGW_STATE_PENDING_MODIFY), index_ver(0), bilog_flags(0) {} void encode(bufferlist &bl) const 
{ - ENCODE_START(1, 1, bl); + ENCODE_START(2, 1, bl); ::encode(id, bl); ::encode(object, bl); ::encode(timestamp, bl); @@ -270,10 +485,12 @@ struct rgw_bi_log_entry { c = (uint8_t)state; ::encode(c, bl); encode_packed_val(index_ver, bl); + ::encode(instance, bl); + ::encode(bilog_flags, bl); ENCODE_FINISH(bl); } void decode(bufferlist::iterator &bl) { - DECODE_START(1, bl); + DECODE_START(2, bl); ::decode(id, bl); ::decode(object, bl); ::decode(timestamp, bl); @@ -285,6 +502,10 @@ struct rgw_bi_log_entry { ::decode(c, bl); state = (RGWPendingState)c; decode_packed_val(index_ver, bl); + if (struct_v >= 2) { + ::decode(instance, bl); + ::decode(bilog_flags, bl); + } DECODE_FINISH(bl); } void dump(Formatter *f) const; @@ -548,39 +769,44 @@ enum cls_rgw_gc_op { struct cls_rgw_obj { string pool; - string oid; - string key; + cls_rgw_obj_key key; + string loc; cls_rgw_obj() {} - cls_rgw_obj(string& _p, string& _o) : pool(_p), oid(_o) {} + cls_rgw_obj(string& _p, cls_rgw_obj_key& _k) : pool(_p), key(_k) {} void encode(bufferlist& bl) const { - ENCODE_START(1, 1, bl); + ENCODE_START(2, 1, bl); ::encode(pool, bl); - ::encode(oid, bl); + ::encode(key.name, bl); + ::encode(loc, bl); ::encode(key, bl); ENCODE_FINISH(bl); } void decode(bufferlist::iterator& bl) { - DECODE_START(1, bl); + DECODE_START(2, bl); ::decode(pool, bl); - ::decode(oid, bl); - ::decode(key, bl); + ::decode(key.name, bl); + ::decode(loc, bl); + if (struct_v >= 2) { + ::decode(key, bl); + } DECODE_FINISH(bl); } void dump(Formatter *f) const { f->dump_string("pool", pool); - f->dump_string("oid", oid); - f->dump_string("key", key); + f->dump_string("oid", key.name); + f->dump_string("key", loc); + f->dump_string("instance", key.instance); } static void generate_test_instances(list& ls) { ls.push_back(new cls_rgw_obj); ls.push_back(new cls_rgw_obj); ls.back()->pool = "mypool"; - ls.back()->oid = "myoid"; - ls.back()->key = "mykey"; + ls.back()->key.name = "myoid"; + ls.back()->loc = "mykey"; } }; WRITE_CLASS_ENCODER(cls_rgw_obj) @@ -590,11 +816,11 @@ struct cls_rgw_obj_chain { cls_rgw_obj_chain() {} - void push_obj(string& pool, string& oid, string& key) { + void push_obj(string& pool, cls_rgw_obj_key& key, string& loc) { cls_rgw_obj obj; obj.pool = pool; - obj.oid = oid; obj.key = key; + obj.loc = loc; objs.push_back(obj); } diff --git a/src/cls/statelog/cls_statelog_ops.h b/src/cls/statelog/cls_statelog_ops.h index 725fa863df7df..2f21440f87e30 100644 --- a/src/cls/statelog/cls_statelog_ops.h +++ b/src/cls/statelog/cls_statelog_ops.h @@ -34,7 +34,7 @@ struct cls_statelog_list_op { int max_entries; /* upperbound to returned num of entries might return less than that and still be truncated */ - cls_statelog_list_op() {} + cls_statelog_list_op() : max_entries(0) {} void encode(bufferlist& bl) const { ENCODE_START(1, 1, bl); @@ -119,7 +119,7 @@ struct cls_statelog_check_state_op { string object; uint32_t state; - cls_statelog_check_state_op() {} + cls_statelog_check_state_op() : state(0) {} void encode(bufferlist& bl) const { ENCODE_START(1, 1, bl); diff --git a/src/cls/statelog/cls_statelog_types.h b/src/cls/statelog/cls_statelog_types.h index f812c7b321cdc..72147de0e9d12 100644 --- a/src/cls/statelog/cls_statelog_types.h +++ b/src/cls/statelog/cls_statelog_types.h @@ -16,7 +16,7 @@ struct cls_statelog_entry { bufferlist data; uint32_t state; /* user defined state */ - cls_statelog_entry() {} + cls_statelog_entry() : state(0) {} void encode(bufferlist& bl) const { ENCODE_START(1, 1, bl); diff --git a/src/cls/user/cls_user.cc 
b/src/cls/user/cls_user.cc index 003a8347889fc..3a91112941762 100644 --- a/src/cls/user/cls_user.cc +++ b/src/cls/user/cls_user.cc @@ -293,7 +293,7 @@ static int cls_user_list_buckets(cls_method_context_t hctx, bufferlist *in, buff #define MAX_ENTRIES 1000 size_t max_entries = op.max_entries; - if (!max_entries || max_entries > MAX_ENTRIES) + if (max_entries > MAX_ENTRIES) max_entries = MAX_ENTRIES; string match_prefix; diff --git a/src/common/ConfUtils.cc b/src/common/ConfUtils.cc index 5efde8d4ae005..1ae5df5408895 100644 --- a/src/common/ConfUtils.cc +++ b/src/common/ConfUtils.cc @@ -24,6 +24,7 @@ #include #include #include +#include #include "include/buffer.h" #include "common/errno.h" diff --git a/src/common/ContextCompletion.cc b/src/common/ContextCompletion.cc new file mode 100644 index 0000000000000..d5ff9f2204c43 --- /dev/null +++ b/src/common/ContextCompletion.cc @@ -0,0 +1,49 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#include "common/ContextCompletion.h" + +namespace ceph +{ + +ContextCompletion::ContextCompletion(Context *ctx, bool ignore_enoent) + : m_lock("ceph::ContextCompletion::m_lock"), m_ctx(ctx), + m_ignore_enoent(ignore_enoent), m_ret(0), m_building(true), m_current_ops(0) +{ +} + +void ContextCompletion::finish_adding_requests() { + bool complete; + { + Mutex::Locker l(m_lock); + m_building = false; + complete = (m_current_ops == 0); + } + if (complete) { + m_ctx->complete(m_ret); + delete this; + } +} + +void ContextCompletion::start_op() { + Mutex::Locker l(m_lock); + ++m_current_ops; +} + +void ContextCompletion::finish_op(int r) { + bool complete; + { + Mutex::Locker l(m_lock); + if (r < 0 && m_ret == 0 && (!m_ignore_enoent || r != -ENOENT)) { + m_ret = r; + } + + --m_current_ops; + complete = (m_current_ops == 0 && !m_building); + } + if (complete) { + m_ctx->complete(m_ret); + delete this; + } +} + +} // namespace ceph diff --git a/src/common/ContextCompletion.h b/src/common/ContextCompletion.h new file mode 100644 index 0000000000000..2cf5d55cb0c05 --- /dev/null +++ b/src/common/ContextCompletion.h @@ -0,0 +1,47 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#ifndef CEPH_ASYNC_COMPLETION_H +#define CEPH_ASYNC_COMPLETION_H + +#include "include/int_types.h" +#include "include/Context.h" + +namespace ceph { + +class ContextCompletion { +public: + ContextCompletion(Context *ctx, bool ignore_enoent); + + void finish_adding_requests(); + + void start_op(); + void finish_op(int r); + +private: + Mutex m_lock; + Context *m_ctx; + bool m_ignore_enoent; + int m_ret; + bool m_building; + uint64_t m_current_ops; +}; + +class C_ContextCompletion : public Context { +public: + C_ContextCompletion(ContextCompletion &context_completion) + : m_context_completion(context_completion) + { + m_context_completion.start_op(); + } + + virtual void finish(int r) { + m_context_completion.finish_op(r); + } + +private: + ContextCompletion &m_context_completion; +}; + +} // namespace ceph + +#endif // CEPH_ASYNC_COMPLETION_H diff --git a/src/common/Cycles.cc b/src/common/Cycles.cc index c20f8a5ec9d12..b0b687e49d46b 100644 --- a/src/common/Cycles.cc +++ b/src/common/Cycles.cc @@ -52,12 +52,16 @@ void Cycles::init() if (cycles_per_sec != 0) return; + // Skip initialization if rdtsc is not implemented + if (rdtsc() == 0) + return; + // Compute the frequency of the fine-grained CPU timer: to do this, // take parallel time readings using both rdtsc and gettimeofday.
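The ContextCompletion class added above implements a gather pattern: callers register sub-operations while m_building is true, and the wrapped Context fires exactly once, after finish_adding_requests() has been called and the op count drains to zero. A hedged usage sketch against the API as added, where `issue_async_op` is a hypothetical stand-in for whatever asynchronous call accepts the callback:

    #include "common/ContextCompletion.h"
    #include "include/Context.h"

    void issue_async_op(Context *on_done);  // hypothetical async primitive

    // Fan out `n` sub-operations; `on_finish` runs once, after the last one
    // completes, with the first recorded non-ENOENT error (if any) as r.
    void fan_out(Context *on_finish, int n) {
      ceph::ContextCompletion *comp =
        new ceph::ContextCompletion(on_finish, true /* ignore_enoent */);
      for (int i = 0; i < n; ++i) {
        // each wrapper calls start_op() in its ctor and finish_op(r) on finish
        issue_async_op(new ceph::C_ContextCompletion(*comp));
      }
      comp->finish_adding_requests();  // comp deletes itself when ops drain
    }

Note that a sub-operation may complete before finish_adding_requests() runs; the m_building flag keeps the final Context from firing early in that case.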
// After 10ms have elapsed, take the ratio between these readings. struct timeval start_time, stop_time; - uint64_t start_cycles, stop_cycles, micros; + uint64_t micros; double old_cycles; // There is one tricky aspect, which is that we could get interrupted @@ -70,12 +74,12 @@ void Cycles::init() if (gettimeofday(&start_time, NULL) != 0) { assert(0 == "couldn't read clock"); } - start_cycles = rdtsc(); + uint64_t start_cycles = rdtsc(); while (1) { if (gettimeofday(&stop_time, NULL) != 0) { assert(0 == "couldn't read clock"); } - stop_cycles = rdtsc(); + uint64_t stop_cycles = rdtsc(); micros = (stop_time.tv_usec - start_time.tv_usec) + (stop_time.tv_sec - start_time.tv_sec)*1000000; if (micros > 10000) { diff --git a/src/common/Cycles.h b/src/common/Cycles.h index 6ddcb7a74941e..bb47d5cb029c5 100644 --- a/src/common/Cycles.h +++ b/src/common/Cycles.h @@ -32,7 +32,6 @@ #ifndef CEPH_CYCLES_H #define CEPH_CYCLES_H - /** * This class provides static methods that read the fine-grain CPU * cycle counter and translate between cycle-level times and absolute @@ -47,9 +46,42 @@ class Cycles { * (accessed via the RDTSC instruction). */ static __inline __attribute__((always_inline)) uint64_t rdtsc() { +#if defined(__i386__) + int64_t ret; + __asm__ volatile ("rdtsc" : "=A" (ret) ); + return ret; +#elif defined(__x86_64__) || defined(__amd64__) uint32_t lo, hi; __asm__ __volatile__("rdtsc" : "=a" (lo), "=d" (hi)); return (((uint64_t)hi << 32) | lo); +#elif defined(__aarch64__) + // + // arch/arm64/include/asm/arch_timer.h + // + // static inline u64 arch_counter_get_cntvct(void) + // { + // u64 cval; + // + // isb(); + // asm volatile("mrs %0, cntvct_el0" : "=r" (cval)); + // + // return cval; + // } + // + // https://github.com/cloudius-systems/osv/blob/master/arch/aarch64/arm-clock.cc + uint64_t cntvct; + asm volatile ("isb; mrs %0, cntvct_el0; isb; " : "=r" (cntvct) :: "memory"); + return cntvct; +#elif defined(__powerpc__) || defined (__powerpc64__) + // Based on: + // https://github.com/randombit/botan/blob/net.randombit.botan/src/lib/entropy/hres_timer/hres_timer.cpp + uint32_t lo = 0, hi = 0; + asm volatile("mftbu %0; mftb %1" : "=r" (hi), "=r" (lo)); + return (((uint64_t)hi << 32) | lo); +#else +#warning No high-precision counter available for your OS/arch + return 0; +#endif } static double per_second(); diff --git a/src/common/Finisher.cc b/src/common/Finisher.cc index f3f4107dcf171..7ebbe05622bcb 100644 --- a/src/common/Finisher.cc +++ b/src/common/Finisher.cc @@ -20,9 +20,11 @@ void Finisher::stop() ldout(cct, 10) << __func__ << dendl; finisher_lock.Lock(); finisher_stop = true; + // we don't have any new work to do, but we want the worker to wake up anyway + // to process the stop condition. finisher_cond.Signal(); finisher_lock.Unlock(); - finisher_thread.join(); + finisher_thread.join(); // wait until the worker exits completely ldout(cct, 10) << __func__ << " finish" << dendl; } @@ -43,7 +45,10 @@ void *Finisher::finisher_thread_entry() ldout(cct, 10) << "finisher_thread start" << dendl; while (!finisher_stop) { + /// Every time we are woken up, we process the queue until it is empty. while (!finisher_queue.empty()) { + // To reduce lock contention, we swap out the queue to process. + // This way other threads can submit new contexts to complete while we are working. 
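The Cycles::init() calibration above pairs rdtsc() with gettimeofday() across a window of at least 10 ms and takes the ratio of the two deltas. Factored out, the arithmetic is just the following (illustrative helpers, not part of the class):

    #include <stdint.h>

    // TSC frequency: cycles elapsed divided by seconds elapsed.
    double calibrated_cycles_per_sec(uint64_t start_cycles, uint64_t stop_cycles,
                                     uint64_t micros) {
      return (double)(stop_cycles - start_cycles) / (micros / 1e6);
    }

    // With the counter stubbed out (rdtsc() returns 0 on unsupported arches,
    // per the new #else branch in Cycles.h), init() bails early and callers
    // must treat the frequency as unavailable.
    double cycles_to_seconds(uint64_t cycles, double cycles_per_sec) {
      return cycles_per_sec > 0 ? cycles / cycles_per_sec : 0.0;
    }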
vector ls; list > ls_rval; ls.swap(finisher_queue); @@ -52,12 +57,17 @@ void *Finisher::finisher_thread_entry() finisher_lock.Unlock(); ldout(cct, 10) << "finisher_thread doing " << ls << dendl; + // Now actually process the contexts. for (vector::iterator p = ls.begin(); p != ls.end(); ++p) { if (*p) { (*p)->complete(0); } else { + // When an item is NULL in the finisher_queue, it means + // we should instead process an item from finisher_queue_rval, + // which has a parameter for complete() other than zero. + // This preserves the order while saving some storage. assert(!ls_rval.empty()); Context *c = ls_rval.front().first; c->complete(ls_rval.front().second); @@ -80,6 +90,8 @@ void *Finisher::finisher_thread_entry() ldout(cct, 10) << "finisher_thread sleeping" << dendl; finisher_cond.Wait(finisher_lock); } + // If we are exiting, we signal the thread waiting in stop(), + // otherwise it would never unblock finisher_empty_cond.Signal(); ldout(cct, 10) << "finisher_thread stop" << dendl; diff --git a/src/common/Finisher.h b/src/common/Finisher.h index 76a3944ac9e3e..8767445990316 100644 --- a/src/common/Finisher.h +++ b/src/common/Finisher.h @@ -23,19 +23,36 @@ class CephContext; +/// Finisher queue length performance counter ID. enum { l_finisher_first = 997082, l_finisher_queue_len, l_finisher_last }; +/** @brief Asynchronous cleanup class. + * Finisher asynchronously completes Contexts, which are simple classes + * representing callbacks, in a dedicated worker thread. Enqueuing + * contexts to complete is thread-safe. + */ class Finisher { CephContext *cct; - Mutex finisher_lock; - Cond finisher_cond, finisher_empty_cond; - bool finisher_stop, finisher_running; + Mutex finisher_lock; ///< Protects access to queues and finisher_running. + Cond finisher_cond; ///< Signaled when there is something to process. + Cond finisher_empty_cond; ///< Signaled when the finisher has nothing more to process. + bool finisher_stop; ///< Set when the finisher should stop. + bool finisher_running; ///< True when the finisher is currently executing contexts. + /// Queue for contexts for which complete(0) will be called. + /// NULLs in this queue indicate that an item from finisher_queue_rval + /// should be completed in that place instead. vector finisher_queue; + + /// Queue for contexts for which the complete function will be called + /// with a parameter other than 0. list > finisher_queue_rval; + + /// Performance counter for the finisher's queue length. + /// Only active for named finishers. PerfCounters *logger; void *finisher_thread_entry(); @@ -47,22 +64,27 @@ class Finisher { } finisher_thread; public: + /// Add a context to complete, optionally specifying a parameter for the complete function. 
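The NULL-sentinel scheme the new comments describe can be shown in isolation: the main queue dictates completion order, and each NULL slot pulls the next (context, return value) pair off the side queue. A standalone sketch with function pointers standing in for Context objects:

    #include <cstddef>
    #include <list>
    #include <utility>
    #include <vector>

    typedef void (*completion_fn)(int);

    void drain(std::vector<completion_fn>& queue,
               std::list<std::pair<completion_fn, int> >& queue_rval) {
      for (size_t i = 0; i < queue.size(); ++i) {
        if (queue[i]) {
          queue[i](0);                   // common case: complete with r == 0
        } else {
          // NULL marks "take the next entry with an explicit return value"
          queue_rval.front().first(queue_rval.front().second);
          queue_rval.pop_front();
        }
      }
      queue.clear();
      queue_rval.clear();
    }

Keeping one flat vector for the hot path and a side list for the rare nonzero-rval case preserves completion order without paying for a pair per entry.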
void queue(Context *c, int r = 0) { finisher_lock.Lock(); + if (finisher_queue.empty()) { + finisher_cond.Signal(); + } if (r) { finisher_queue_rval.push_back(pair(c, r)); finisher_queue.push_back(NULL); } else finisher_queue.push_back(c); - finisher_cond.Signal(); if (logger) logger->inc(l_finisher_queue_len); finisher_lock.Unlock(); } void queue(vector& ls) { finisher_lock.Lock(); + if (finisher_queue.empty()) { + finisher_cond.Signal(); + } finisher_queue.insert(finisher_queue.end(), ls.begin(), ls.end()); - finisher_cond.Signal(); if (logger) logger->inc(l_finisher_queue_len, ls.size()); finisher_lock.Unlock(); @@ -70,8 +92,10 @@ class Finisher { } void queue(deque& ls) { finisher_lock.Lock(); + if (finisher_queue.empty()) { + finisher_cond.Signal(); + } finisher_queue.insert(finisher_queue.end(), ls.begin(), ls.end()); - finisher_cond.Signal(); if (logger) logger->inc(l_finisher_queue_len, ls.size()); finisher_lock.Unlock(); @@ -79,24 +103,41 @@ class Finisher { } void queue(list& ls) { finisher_lock.Lock(); + if (finisher_queue.empty()) { + finisher_cond.Signal(); + } finisher_queue.insert(finisher_queue.end(), ls.begin(), ls.end()); - finisher_cond.Signal(); if (logger) logger->inc(l_finisher_queue_len, ls.size()); finisher_lock.Unlock(); ls.clear(); } - + + /// Start the worker thread. void start(); + + /** @brief Stop the worker thread. + * + * Does not wait until all outstanding contexts are completed. + * To ensure that everything finishes, you should first shut down + * all sources that can add contexts to this finisher and call + * wait_for_empty() before calling stop(). */ void stop(); + /** @brief Blocks until the finisher has nothing left to process. + * This function will also return when a concurrent call to stop() + * finishes, but this class should never be used in this way. */ void wait_for_empty(); + /// Construct an anonymous Finisher. + /// Anonymous finishers do not log their queue length. Finisher(CephContext *cct_) : cct(cct_), finisher_lock("Finisher::finisher_lock"), finisher_stop(false), finisher_running(false), logger(0), finisher_thread(this) {} + + /// Construct a named Finisher that logs its queue length. Finisher(CephContext *cct_, string name) : cct(cct_), finisher_lock("Finisher::finisher_lock"), finisher_stop(false), finisher_running(false), @@ -118,6 +159,7 @@ class Finisher { } }; +/// Context that is completed asynchronously on the supplied finisher. class C_OnFinisher : public Context { Context *con; Finisher *fin; diff --git a/src/common/Formatter.cc b/src/common/Formatter.cc index b9ad5bff75cf3..7c166ef09865f 100644 --- a/src/common/Formatter.cc +++ b/src/common/Formatter.cc @@ -26,6 +26,11 @@ #include #include #include +#include +#include +#include + + // ----------------------- namespace ceph { @@ -54,31 +59,34 @@ FormatterAttrs::FormatterAttrs(const char *attr, ...) 
va_end(ap); } -Formatter::Formatter() -{ -} - -Formatter::~Formatter() -{ -} - -Formatter * -new_formatter(const std::string &type) -{ - std::string mytype = type; - if (mytype == "") - mytype = "json-pretty"; - - if (mytype == "json") - return new JSONFormatter(false); - else if (mytype == "json-pretty") - return new JSONFormatter(true); - else if (mytype == "xml") - return new XMLFormatter(false); - else if (mytype == "xml-pretty") - return new XMLFormatter(true); - else - return (Formatter *)NULL; +Formatter::Formatter() { } + +Formatter::~Formatter() { } + +Formatter *Formatter::create(const std::string &type, + const std::string& default_type, + const std::string& fallback) +{ + std::string mytype = type; + if (mytype == "") + mytype = default_type; + + if (mytype == "json") + return new JSONFormatter(false); + else if (mytype == "json-pretty") + return new JSONFormatter(true); + else if (mytype == "xml") + return new XMLFormatter(false); + else if (mytype == "xml-pretty") + return new XMLFormatter(true); + else if (mytype == "table") + return new TableFormatter(); + else if (mytype == "table-kv") + return new TableFormatter(true); + else if (fallback != "") + return create(fallback, "", ""); + else + return (Formatter *) NULL; } void Formatter::dump_format(const char *name, const char *fmt, ...) @@ -89,7 +97,7 @@ void Formatter::dump_format(const char *name, const char *fmt, ...) va_end(ap); } -void Formatter:: dump_format_ns(const char *name, const char *ns, const char *fmt, ...) +void Formatter::dump_format_ns(const char *name, const char *ns, const char *fmt, ...) { va_list ap; va_start(ap, fmt); @@ -98,7 +106,6 @@ void Formatter:: dump_format_ns(const char *name, const char *ns, const char *fm } - void Formatter::dump_format_unquoted(const char *name, const char *fmt, ...) { va_list ap; @@ -108,8 +115,9 @@ void Formatter::dump_format_unquoted(const char *name, const char *fmt, ...) 
} // ----------------------- + JSONFormatter::JSONFormatter(bool p) - : m_pretty(p), m_is_pending_string(false) +: m_pretty(p), m_is_pending_string(false) { reset(); } @@ -118,6 +126,8 @@ void JSONFormatter::flush(std::ostream& os) { finish_pending_string(); os << m_ss.str(); + if (m_pretty) + os << "\n"; m_ss.clear(); m_ss.str(""); } @@ -136,27 +146,26 @@ void JSONFormatter::print_comma(json_formatter_stack_entry_d& entry) if (entry.size) { if (m_pretty) { m_ss << ",\n"; - for (unsigned i=1; i < m_stack.size(); i++) - m_ss << " "; + for (unsigned i = 1; i < m_stack.size(); i++) + m_ss << " "; } else { m_ss << ","; } - } else if (entry.is_array && m_pretty) { + } else if (m_pretty) { m_ss << "\n"; - for (unsigned i=1; i < m_stack.size(); i++) + for (unsigned i = 1; i < m_stack.size(); i++) m_ss << " "; } if (m_pretty && entry.is_array) m_ss << " "; } -void JSONFormatter::print_quoted_string(const char *s) +void JSONFormatter::print_quoted_string(const std::string& s) { - int len = escape_json_attr_len(s); - char *escaped = new char[len]; - escape_json_attr(s, escaped); + int len = escape_json_attr_len(s.c_str(), s.size()); + char escaped[len]; + escape_json_attr(s.c_str(), s.size(), escaped); m_ss << '\"' << escaped << '\"'; - delete[] escaped; } void JSONFormatter::print_name(const char *name) @@ -168,10 +177,7 @@ void JSONFormatter::print_name(const char *name) print_comma(entry); if (!entry.is_array) { if (m_pretty) { - if (entry.size) - m_ss << " "; - else - m_ss << " "; + m_ss << " "; } m_ss << "\"" << name << "\""; if (m_pretty) @@ -225,6 +231,11 @@ void JSONFormatter::close_section() finish_pending_string(); struct json_formatter_stack_entry_d& entry = m_stack.back(); + if (m_pretty && entry.size) { + m_ss << "\n"; + for (unsigned i = 1; i < m_stack.size(); i++) + m_ss << " "; + } m_ss << (entry.is_array ? 
']' : '}'); m_stack.pop_back(); } @@ -232,7 +243,7 @@ void JSONFormatter::close_section() void JSONFormatter::finish_pending_string() { if (m_is_pending_string) { - print_quoted_string(m_pending_string.str().c_str()); + print_quoted_string(m_pending_string.str()); m_pending_string.str(std::string()); m_is_pending_string = false; } @@ -258,10 +269,10 @@ void JSONFormatter::dump_float(const char *name, double d) m_ss << foo; } -void JSONFormatter::dump_string(const char *name, std::string s) +void JSONFormatter::dump_string(const char *name, const std::string& s) { print_name(name); - print_quoted_string(s.c_str()); + print_quoted_string(s); } std::ostream& JSONFormatter::dump_stream(const char *name) @@ -278,9 +289,9 @@ void JSONFormatter::dump_format_va(const char *name, const char *ns, bool quoted print_name(name); if (quoted) { - print_quoted_string(buf); + print_quoted_string(std::string(buf)); } else { - m_ss << buf; + m_ss << std::string(buf); } } @@ -294,11 +305,11 @@ void JSONFormatter::write_raw_data(const char *data) m_ss << data; } -const char *XMLFormatter::XML_1_DTD = +const char *XMLFormatter::XML_1_DTD = ""; XMLFormatter::XMLFormatter(bool pretty) - : m_pretty(pretty) +: m_pretty(pretty) { reset(); } @@ -307,6 +318,8 @@ void XMLFormatter::flush(std::ostream& os) { finish_pending_string(); os << m_ss.str(); + if (m_pretty) + os << "\n"; m_ss.clear(); m_ss.str(""); } @@ -391,7 +404,7 @@ void XMLFormatter::dump_float(const char *name, double d) m_ss << "\n"; } -void XMLFormatter::dump_string(const char *name, std::string s) +void XMLFormatter::dump_string(const char *name, const std::string& s) { std::string e(name); print_spaces(); @@ -400,7 +413,7 @@ void XMLFormatter::dump_string(const char *name, std::string s) m_ss << "\n"; } -void XMLFormatter::dump_string_with_attrs(const char *name, std::string s, const FormatterAttrs& attrs) +void XMLFormatter::dump_string_with_attrs(const char *name, const std::string& s, const FormatterAttrs& attrs) { std::string e(name); std::string attrs_str; @@ -427,7 +440,7 @@ void XMLFormatter::dump_format_va(const char* name, const char *ns, bool quoted, std::string e(name); print_spaces(); if (ns) { - m_ss << "<" << e << " xmlns=\"" << ns << "\">" << buf << ""; + m_ss << "<" << e << " xmlns=\"" << ns << "\">" << buf << ""; } else { m_ss << "<" << e << ">" << escape_xml_str(buf) << ""; } @@ -470,8 +483,7 @@ void XMLFormatter::open_section_in_ns(const char *name, const char *ns, const Fo if (ns) { m_ss << "<" << name << attrs_str << " xmlns=\"" << ns << "\">"; - } - else { + } else { m_ss << "<" << name << attrs_str << ">"; } if (m_pretty) @@ -483,7 +495,7 @@ void XMLFormatter::finish_pending_string() { if (!m_pending_string_name.empty()) { m_ss << escape_xml_str(m_pending_string.str().c_str()) - << ""; + << ""; m_pending_string_name.clear(); m_pending_string.str(std::string()); if (m_pretty) { @@ -509,4 +521,344 @@ std::string XMLFormatter::escape_xml_str(const char *str) return std::string(&escaped[0]); } +TableFormatter::TableFormatter(bool keyval) : m_keyval(keyval) +{ + reset(); +} + +void TableFormatter::flush(std::ostream& os) +{ + finish_pending_string(); + std::vector column_size = m_column_size; + std::vector column_name = m_column_name; + + std::set need_header_set; + + // auto-sizing columns + for (size_t i = 0; i < m_vec.size(); i++) { + for (size_t j = 0; j < m_vec[i].size(); j++) { + column_size.resize(m_vec[i].size()); + column_name.resize(m_vec[i].size()); + if (i > 0) { + if (m_vec[i - 1][j] != m_vec[i][j]) { + // changing row 
labels require to show the header + need_header_set.insert(i); + column_name[i] = m_vec[i][j].first; + } + } else { + column_name[i] = m_vec[i][j].first; + } + + if (m_vec[i][j].second.length() > column_size[j]) + column_size[j] = m_vec[i][j].second.length(); + if (m_vec[i][j].first.length() > column_size[j]) + column_size[j] = m_vec[i][j].first.length(); + } + } + + bool need_header = false; + if ((column_size.size() == m_column_size.size())) { + for (size_t i = 0; i < column_size.size(); i++) { + if (column_size[i] != m_column_size[i]) { + need_header = true; + break; + } + } + } else { + need_header = true; + } + + if (need_header) { + // first row always needs a header if there wasn't one before + need_header_set.insert(0); + } + + m_column_size = column_size; + for (size_t i = 0; i < m_vec.size(); i++) { + if (i == 0) { + if (need_header_set.count(i)) { + // print the header + if (!m_keyval) { + os << "+"; + for (size_t j = 0; j < m_vec[i].size(); j++) { + for (size_t v = 0; v < m_column_size[j] + 3; v++) + os << "-"; + os << "+"; + } + os << "\n"; + os << "|"; + + for (size_t j = 0; j < m_vec[i].size(); j++) { + os << " "; + std::stringstream fs; + fs << boost::format("%%-%is") % (m_column_size[j] + 2); + os << boost::format(fs.str()) % m_vec[i][j].first; + os << "|"; + } + os << "\n"; + os << "+"; + for (size_t j = 0; j < m_vec[i].size(); j++) { + for (size_t v = 0; v < m_column_size[j] + 3; v++) + os << "-"; + os << "+"; + } + os << "\n"; + } + } + } + // print body + if (!m_keyval) + os << "|"; + for (size_t j = 0; j < m_vec[i].size(); j++) { + if (!m_keyval) + os << " "; + std::stringstream fs; + + if (m_keyval) { + os << "key::"; + os << m_vec[i][j].first; + os << "="; + os << "\""; + os << m_vec[i][j].second; + os << "\" "; + } else { + fs << boost::format("%%-%is") % (m_column_size[j] + 2); + os << boost::format(fs.str()) % m_vec[i][j].second; + os << "|"; + } + } + + os << "\n"; + if (!m_keyval) { + if (i == (m_vec.size() - 1)) { + // print trailer + os << "+"; + for (size_t j = 0; j < m_vec[i].size(); j++) { + for (size_t v = 0; v < m_column_size[j] + 3; v++) + os << "-"; + os << "+"; + } + os << "\n"; + } + } + m_vec[i].clear(); + } + m_vec.clear(); +} + +void TableFormatter::reset() +{ + m_ss.clear(); + m_ss.str(""); + m_section_cnt.clear(); + m_column_size.clear(); + m_section_open = 0; +} + +void TableFormatter::open_object_section(const char *name) +{ + open_section_in_ns(name, NULL, NULL); +} + +void TableFormatter::open_object_section_with_attrs(const char *name, const FormatterAttrs& attrs) +{ + open_section_in_ns(name, NULL, NULL); +} + +void TableFormatter::open_object_section_in_ns(const char *name, const char *ns) +{ + open_section_in_ns(name, NULL, NULL); +} + +void TableFormatter::open_array_section(const char *name) +{ + open_section_in_ns(name, NULL, NULL); +} + +void TableFormatter::open_array_section_with_attrs(const char *name, const FormatterAttrs& attrs) +{ + open_section_in_ns(name, NULL, NULL); +} + +void TableFormatter::open_array_section_in_ns(const char *name, const char *ns) +{ + open_section_in_ns(name, NULL, NULL); +} + +void TableFormatter::open_section_in_ns(const char *name, const char *ns, const FormatterAttrs *attrs) +{ + m_section.push_back(name); + m_section_open++; +} + +void TableFormatter::close_section() +{ + // + m_section_open--; + if (m_section.size()) { + m_section_cnt[m_section.back()] = 0; + m_section.pop_back(); + } +} + +size_t TableFormatter::m_vec_index(const char *name) +{ + std::string key(name); + + size_t i = 
m_vec.size(); + if (i) + i--; + + // make sure there are vectors to push back key/val pairs + if (!m_vec.size()) + m_vec.resize(1); + + if (m_vec.size()) { + if (m_vec[i].size()) { + if (m_vec[i][0].first == key) { + // start a new column if a key is repeated + m_vec.resize(m_vec.size() + 1); + i++; + } + } + } + + return i; +} + +std::string TableFormatter::get_section_name(const char* name) +{ + std::string t_name = name; + for (size_t i = 0; i < m_section.size(); i++) { + t_name.insert(0, ":"); + t_name.insert(0, m_section[i]); + } + if (m_section_open) { + std::stringstream lss; + lss << t_name; + lss << "["; + lss << m_section_cnt[t_name]++; + lss << "]"; + return lss.str(); + } else { + return t_name; + } +} + +void TableFormatter::dump_unsigned(const char *name, uint64_t u) +{ + finish_pending_string(); + size_t i = m_vec_index(name); + m_ss << u; + m_vec[i].push_back(std::make_pair(get_section_name(name), m_ss.str())); + m_ss.clear(); + m_ss.str(""); +} + +void TableFormatter::dump_int(const char *name, int64_t u) +{ + finish_pending_string(); + size_t i = m_vec_index(name); + m_ss << u; + m_vec[i].push_back(std::make_pair(get_section_name(name), m_ss.str())); + m_ss.clear(); + m_ss.str(""); +} + +void TableFormatter::dump_float(const char *name, double d) +{ + finish_pending_string(); + size_t i = m_vec_index(name); + m_ss << d; + + m_vec[i].push_back(std::make_pair(get_section_name(name), m_ss.str())); + m_ss.clear(); + m_ss.str(""); +} + +void TableFormatter::dump_string(const char *name, const std::string& s) +{ + finish_pending_string(); + size_t i = m_vec_index(name); + m_ss << s; + + m_vec[i].push_back(std::make_pair(get_section_name(name), m_ss.str())); + m_ss.clear(); + m_ss.str(""); +} + +void TableFormatter::dump_string_with_attrs(const char *name, const std::string& s, const FormatterAttrs& attrs) +{ + finish_pending_string(); + size_t i = m_vec_index(name); + + std::string attrs_str; + get_attrs_str(&attrs, attrs_str); + m_ss << attrs_str << s; + + m_vec[i].push_back(std::make_pair(get_section_name(name), m_ss.str())); + m_ss.clear(); + m_ss.str(""); +} + +void TableFormatter::dump_format_va(const char* name, const char *ns, bool quoted, const char *fmt, va_list ap) +{ + finish_pending_string(); + char buf[LARGE_SIZE]; + vsnprintf(buf, LARGE_SIZE, fmt, ap); + + size_t i = m_vec_index(name); + if (ns) { + m_ss << ns << "." 
<< buf; + } else + m_ss << buf; + + m_vec[i].push_back(std::make_pair(get_section_name(name), m_ss.str())); + m_ss.clear(); + m_ss.str(""); +} + +std::ostream& TableFormatter::dump_stream(const char *name) +{ + finish_pending_string(); + // we don't support this + m_pending_name = name; + return m_ss; } + +int TableFormatter::get_len() const +{ + // we don't know the size until flush is called + return 0; +} + +void TableFormatter::write_raw_data(const char *data) { + // not supported +} + +void TableFormatter::get_attrs_str(const FormatterAttrs *attrs, std::string& attrs_str) +{ + std::stringstream attrs_ss; + + for (std::list >::const_iterator iter = attrs->attrs.begin(); + iter != attrs->attrs.end(); ++iter) { + std::pair p = *iter; + attrs_ss << " " << p.first << "=" << "\"" << p.second << "\""; + } + + attrs_str = attrs_ss.str(); +} + +void TableFormatter::finish_pending_string() +{ + if (m_pending_name.length()) { + std::string ss = m_ss.str(); + m_ss.clear(); + m_ss.str(""); + std::string pending_name = m_pending_name; + m_pending_name = ""; + dump_string(pending_name.c_str(), ss); + } +} +} + diff --git a/src/common/Formatter.h b/src/common/Formatter.h index 5b76c3a5c4e22..3ce5996ce22ba 100644 --- a/src/common/Formatter.h +++ b/src/common/Formatter.h @@ -1,150 +1,211 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab #ifndef CEPH_FORMATTER_H #define CEPH_FORMATTER_H #include "include/int_types.h" #include -#include +#include #include -#include +#include #include #include #include +#include #include "include/buffer.h" namespace ceph { + struct FormatterAttrs { + std::list< std::pair > attrs; -struct FormatterAttrs { - std::list< std::pair > attrs; - - FormatterAttrs(const char *attr, ...); -}; - -class Formatter { - public: - Formatter(); - virtual ~Formatter(); - - virtual void flush(std::ostream& os) = 0; - void flush(bufferlist &bl) { - std::stringstream os; - flush(os); - bl.append(os.str()); - } - virtual void reset() = 0; - - virtual void open_array_section(const char *name) = 0; - virtual void open_array_section_in_ns(const char *name, const char *ns) = 0; - virtual void open_object_section(const char *name) = 0; - virtual void open_object_section_in_ns(const char *name, const char *ns) = 0; - virtual void close_section() = 0; - virtual void dump_unsigned(const char *name, uint64_t u) = 0; - virtual void dump_int(const char *name, int64_t s) = 0; - virtual void dump_float(const char *name, double d) = 0; - virtual void dump_string(const char *name, std::string s) = 0; - virtual void dump_bool(const char *name, bool b) { - dump_format_unquoted(name, "%s", (b ? 
"true" : "false")); - } - virtual std::ostream& dump_stream(const char *name) = 0; - virtual void dump_format_va(const char *name, const char *ns, bool quoted, const char *fmt, va_list ap) = 0; - virtual void dump_format(const char *name, const char *fmt, ...); - virtual void dump_format_ns(const char *name, const char *ns, const char *fmt, ...); - virtual void dump_format_unquoted(const char *name, const char *fmt, ...); - virtual int get_len() const = 0; - virtual void write_raw_data(const char *data) = 0; - - /* with attrs */ - virtual void open_array_section_with_attrs(const char *name, const FormatterAttrs& attrs) { - open_array_section(name); - } - virtual void open_object_section_with_attrs(const char *name, const FormatterAttrs& attrs) { - open_object_section(name); - } - virtual void dump_string_with_attrs(const char *name, std::string s, const FormatterAttrs& attrs) { - dump_string(name, s); - } -}; - -Formatter *new_formatter(const std::string &type); - -class JSONFormatter : public Formatter { - public: - JSONFormatter(bool p=false); - - void flush(std::ostream& os); - void reset(); - virtual void open_array_section(const char *name); - void open_array_section_in_ns(const char *name, const char *ns); - void open_object_section(const char *name); - void open_object_section_in_ns(const char *name, const char *ns); - void close_section(); - void dump_unsigned(const char *name, uint64_t u); - void dump_int(const char *name, int64_t u); - void dump_float(const char *name, double d); - void dump_string(const char *name, std::string s); - std::ostream& dump_stream(const char *name); - void dump_format_va(const char *name, const char *ns, bool quoted, const char *fmt, va_list ap); - int get_len() const; - void write_raw_data(const char *data); - - private: - struct json_formatter_stack_entry_d { - int size; - bool is_array; - json_formatter_stack_entry_d() : size(0), is_array(false) {} + FormatterAttrs(const char *attr, ...); }; - - bool m_pretty; - void open_section(const char *name, bool is_array); - void print_quoted_string(const char *s); - void print_name(const char *name); - void print_comma(json_formatter_stack_entry_d& entry); - void finish_pending_string(); - - std::stringstream m_ss, m_pending_string; - std::list m_stack; - bool m_is_pending_string; -}; - -class XMLFormatter : public Formatter { - public: - static const char *XML_1_DTD; - XMLFormatter(bool pretty = false); - - void flush(std::ostream& os); - void reset(); - void open_array_section(const char *name); - void open_array_section_in_ns(const char *name, const char *ns); - void open_object_section(const char *name); - void open_object_section_in_ns(const char *name, const char *ns); - void close_section(); - void dump_unsigned(const char *name, uint64_t u); - void dump_int(const char *name, int64_t u); - void dump_float(const char *name, double d); - void dump_string(const char *name, std::string s); - std::ostream& dump_stream(const char *name); - void dump_format_va(const char *name, const char *ns, bool quoted, const char *fmt, va_list ap); - int get_len() const; - void write_raw_data(const char *data); - - /* with attrs */ - void open_array_section_with_attrs(const char *name, const FormatterAttrs& attrs); - void open_object_section_with_attrs(const char *name, const FormatterAttrs& attrs); - void dump_string_with_attrs(const char *name, std::string s, const FormatterAttrs& attrs); - private: - void open_section_in_ns(const char *name, const char *ns, const FormatterAttrs *attrs); - void 
finish_pending_string(); - void print_spaces(); - static std::string escape_xml_str(const char *str); - void get_attrs_str(const FormatterAttrs *attrs, std::string& attrs_str); - - std::stringstream m_ss, m_pending_string; - std::deque m_sections; - bool m_pretty; - std::string m_pending_string_name; -}; + + class Formatter { + public: + static Formatter *create(const std::string& type, + const std::string& default_type, + const std::string& fallback); + static Formatter *create(const std::string& type, + const std::string& default_type) { + return create(type, default_type, ""); + } + static Formatter *create(const std::string& type) { + return create(type, "json-pretty", ""); + } + + Formatter(); + virtual ~Formatter(); + + virtual void flush(std::ostream& os) = 0; + void flush(bufferlist &bl) + { + std::stringstream os; + flush(os); + bl.append(os.str()); + } + virtual void reset() = 0; + + virtual void open_array_section(const char *name) = 0; + virtual void open_array_section_in_ns(const char *name, const char *ns) = 0; + virtual void open_object_section(const char *name) = 0; + virtual void open_object_section_in_ns(const char *name, const char *ns) = 0; + virtual void close_section() = 0; + virtual void dump_unsigned(const char *name, uint64_t u) = 0; + virtual void dump_int(const char *name, int64_t s) = 0; + virtual void dump_float(const char *name, double d) = 0; + virtual void dump_string(const char *name, const std::string& s) = 0; + virtual void dump_bool(const char *name, bool b) + { + dump_format_unquoted(name, "%s", (b ? "true" : "false")); + } + virtual std::ostream& dump_stream(const char *name) = 0; + virtual void dump_format_va(const char *name, const char *ns, bool quoted, const char *fmt, va_list ap) = 0; + virtual void dump_format(const char *name, const char *fmt, ...); + virtual void dump_format_ns(const char *name, const char *ns, const char *fmt, ...); + virtual void dump_format_unquoted(const char *name, const char *fmt, ...); + virtual int get_len() const = 0; + virtual void write_raw_data(const char *data) = 0; + /* with attrs */ + virtual void open_array_section_with_attrs(const char *name, const FormatterAttrs& attrs) + { + open_array_section(name); + } + virtual void open_object_section_with_attrs(const char *name, const FormatterAttrs& attrs) + { + open_object_section(name); + } + virtual void dump_string_with_attrs(const char *name, const std::string& s, const FormatterAttrs& attrs) + { + dump_string(name, s); + } + }; + + class JSONFormatter : public Formatter { + public: + JSONFormatter(bool p = false); + + void flush(std::ostream& os); + void reset(); + virtual void open_array_section(const char *name); + void open_array_section_in_ns(const char *name, const char *ns); + void open_object_section(const char *name); + void open_object_section_in_ns(const char *name, const char *ns); + void close_section(); + void dump_unsigned(const char *name, uint64_t u); + void dump_int(const char *name, int64_t u); + void dump_float(const char *name, double d); + void dump_string(const char *name, const std::string& s); + std::ostream& dump_stream(const char *name); + void dump_format_va(const char *name, const char *ns, bool quoted, const char *fmt, va_list ap); + int get_len() const; + void write_raw_data(const char *data); + + private: + + struct json_formatter_stack_entry_d { + int size; + bool is_array; + json_formatter_stack_entry_d() : size(0), is_array(false) { } + }; + + bool m_pretty; + void open_section(const char *name, bool is_array); + void 
print_quoted_string(const std::string& s); + void print_name(const char *name); + void print_comma(json_formatter_stack_entry_d& entry); + void finish_pending_string(); + + std::stringstream m_ss, m_pending_string; + std::list m_stack; + bool m_is_pending_string; + }; + + class XMLFormatter : public Formatter { + public: + static const char *XML_1_DTD; + XMLFormatter(bool pretty = false); + + void flush(std::ostream& os); + void reset(); + void open_array_section(const char *name); + void open_array_section_in_ns(const char *name, const char *ns); + void open_object_section(const char *name); + void open_object_section_in_ns(const char *name, const char *ns); + void close_section(); + void dump_unsigned(const char *name, uint64_t u); + void dump_int(const char *name, int64_t u); + void dump_float(const char *name, double d); + void dump_string(const char *name, const std::string& s); + std::ostream& dump_stream(const char *name); + void dump_format_va(const char *name, const char *ns, bool quoted, const char *fmt, va_list ap); + int get_len() const; + void write_raw_data(const char *data); + + /* with attrs */ + void open_array_section_with_attrs(const char *name, const FormatterAttrs& attrs); + void open_object_section_with_attrs(const char *name, const FormatterAttrs& attrs); + void dump_string_with_attrs(const char *name, const std::string& s, const FormatterAttrs& attrs); + private: + void open_section_in_ns(const char *name, const char *ns, const FormatterAttrs *attrs); + void finish_pending_string(); + void print_spaces(); + static std::string escape_xml_str(const char *str); + void get_attrs_str(const FormatterAttrs *attrs, std::string& attrs_str); + + std::stringstream m_ss, m_pending_string; + std::deque m_sections; + bool m_pretty; + std::string m_pending_string_name; + }; + + class TableFormatter : public Formatter { + public: + TableFormatter(bool keyval = false); + + void flush(std::ostream& os); + void reset(); + virtual void open_array_section(const char *name); + void open_array_section_in_ns(const char *name, const char *ns); + void open_object_section(const char *name); + void open_object_section_in_ns(const char *name, const char *ns); + + void open_array_section_with_attrs(const char *name, const FormatterAttrs& attrs); + void open_object_section_with_attrs(const char *name, const FormatterAttrs& attrs); + + void close_section(); + void dump_unsigned(const char *name, uint64_t u); + void dump_int(const char *name, int64_t u); + void dump_float(const char *name, double d); + void dump_string(const char *name, const std::string& s); + void dump_format_va(const char *name, const char *ns, bool quoted, const char *fmt, va_list ap); + void dump_string_with_attrs(const char *name, const std::string& s, const FormatterAttrs& attrs); + std::ostream& dump_stream(const char *name); + + int get_len() const; + void write_raw_data(const char *data); + void get_attrs_str(const FormatterAttrs *attrs, std::string& attrs_str); + + private: + void open_section_in_ns(const char *name, const char *ns, const FormatterAttrs *attrs); + std::vector< std::vector > > m_vec; + std::stringstream m_ss; + size_t m_vec_index(const char* name); + std::string get_section_name(const char* name); + void finish_pending_string(); + std::string m_pending_name; + bool m_keyval; + + int m_section_open; + std::vector< std::string > m_section; + std::map m_section_cnt; + std::vector m_column_size; + std::vector< std::string > m_column_name; + }; + } #endif diff --git a/src/common/HeartbeatMap.cc 
b/src/common/HeartbeatMap.cc index 9787f737cef50..2c58276196ab4 100644 --- a/src/common/HeartbeatMap.cc +++ b/src/common/HeartbeatMap.cc @@ -41,7 +41,7 @@ HeartbeatMap::~HeartbeatMap() assert(m_workers.empty()); } -heartbeat_handle_d *HeartbeatMap::add_worker(string name) +heartbeat_handle_d *HeartbeatMap::add_worker(const string& name) { m_rwlock.get_write(); ldout(m_cct, 10) << "add_worker '" << name << "'" << dendl; @@ -52,7 +52,7 @@ heartbeat_handle_d *HeartbeatMap::add_worker(string name) return h; } -void HeartbeatMap::remove_worker(heartbeat_handle_d *h) +void HeartbeatMap::remove_worker(const heartbeat_handle_d *h) { m_rwlock.get_write(); ldout(m_cct, 10) << "remove_worker '" << h->name << "'" << dendl; @@ -61,7 +61,7 @@ void HeartbeatMap::remove_worker(heartbeat_handle_d *h) delete h; } -bool HeartbeatMap::_check(heartbeat_handle_d *h, const char *who, time_t now) +bool HeartbeatMap::_check(const heartbeat_handle_d *h, const char *who, time_t now) { bool healthy = true; time_t was; diff --git a/src/common/HeartbeatMap.h b/src/common/HeartbeatMap.h index a4aee48a19137..5513e186c2ad3 100644 --- a/src/common/HeartbeatMap.h +++ b/src/common/HeartbeatMap.h @@ -41,7 +41,7 @@ namespace ceph { */ struct heartbeat_handle_d { - std::string name; + const std::string name; atomic_t timeout, suicide_timeout; time_t grace, suicide_grace; std::list::iterator list_item; @@ -54,8 +54,8 @@ struct heartbeat_handle_d { class HeartbeatMap { public: // register/unregister - heartbeat_handle_d *add_worker(std::string name); - void remove_worker(heartbeat_handle_d *h); + heartbeat_handle_d *add_worker(const std::string& name); + void remove_worker(const heartbeat_handle_d *h); // reset the timeout so that it expects another touch within grace amount of time void reset_timeout(heartbeat_handle_d *h, time_t grace, time_t suicide_grace); @@ -77,7 +77,7 @@ class HeartbeatMap { time_t m_inject_unhealthy_until; std::list m_workers; - bool _check(heartbeat_handle_d *h, const char *who, time_t now); + bool _check(const heartbeat_handle_d *h, const char *who, time_t now); }; } diff --git a/src/common/Initialize.h b/src/common/Initialize.h index 273a8710b8cf5..35414d64be8f8 100644 --- a/src/common/Initialize.h +++ b/src/common/Initialize.h @@ -59,8 +59,8 @@ class Initialize { /** * This form of constructor causes its function argument to be invoked * when the object is constructed. When used with a static Initialize - * object, this will cause #func to run before main() runs, so that - * #func can perform once-only initialization. + * object, this will cause \p func to run before main() runs, so that + * \p func can perform once-only initialization. 
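+ *
+ * A minimal illustrative use (registerHooks here is a hypothetical
+ * once-only setup function, not part of this header):
+ *
+ *   static Initialize _initHooks(registerHooks);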
* * \param func * This function is invoked with no arguments when the object is diff --git a/src/common/LogClient.cc b/src/common/LogClient.cc index 85350e24ac563..756297b7bbf0d 100644 --- a/src/common/LogClient.cc +++ b/src/common/LogClient.cc @@ -49,28 +49,28 @@ int parse_log_client_options(CephContext *cct, ostringstream oss; int r = get_conf_str_map_helper(cct->_conf->clog_to_monitors, oss, - &log_to_monitors, CLOG_CHANNEL_DEFAULT); + &log_to_monitors, CLOG_CONFIG_DEFAULT_KEY); if (r < 0) { lderr(cct) << __func__ << " error parsing 'clog_to_monitors'" << dendl; return r; } r = get_conf_str_map_helper(cct->_conf->clog_to_syslog, oss, - &log_to_syslog, CLOG_CHANNEL_DEFAULT); + &log_to_syslog, CLOG_CONFIG_DEFAULT_KEY); if (r < 0) { lderr(cct) << __func__ << " error parsing 'clog_to_syslog'" << dendl; return r; } r = get_conf_str_map_helper(cct->_conf->clog_to_syslog_facility, oss, - &log_channels, CLOG_CHANNEL_DEFAULT); + &log_channels, CLOG_CONFIG_DEFAULT_KEY); if (r < 0) { lderr(cct) << __func__ << " error parsing 'clog_to_syslog_facility'" << dendl; return r; } r = get_conf_str_map_helper(cct->_conf->clog_to_syslog_level, oss, - &log_prios, CLOG_CHANNEL_DEFAULT); + &log_prios, CLOG_CONFIG_DEFAULT_KEY); if (r < 0) { lderr(cct) << __func__ << " error parsing 'clog_to_syslog_level'" << dendl; return r; @@ -132,13 +132,19 @@ void LogChannel::update_config(map &log_to_monitors, map &log_channels, map &log_prios) { + ldout(cct, 20) << __func__ << " log_to_monitors " << log_to_monitors + << " log_to_syslog " << log_to_syslog + << " log_channels " << log_channels + << " log_prios " << log_prios + << dendl; bool to_monitors = (get_str_map_key(log_to_monitors, log_channel, - &CLOG_CHANNEL_DEFAULT) == "true"); + &CLOG_CONFIG_DEFAULT_KEY) == "true"); bool to_syslog = (get_str_map_key(log_to_syslog, log_channel, - &CLOG_CHANNEL_DEFAULT) == "true"); + &CLOG_CONFIG_DEFAULT_KEY) == "true"); string syslog_facility = get_str_map_key(log_channels, log_channel, - &CLOG_CHANNEL_DEFAULT); - string prio = get_str_map_key(log_prios, log_channel, &CLOG_CHANNEL_DEFAULT); + &CLOG_CONFIG_DEFAULT_KEY); + string prio = get_str_map_key(log_prios, log_channel, + &CLOG_CONFIG_DEFAULT_KEY); set_log_to_monitors(to_monitors); set_log_to_syslog(to_syslog); diff --git a/src/common/LogEntry.h b/src/common/LogEntry.h index 63badffbfb785..71d3acd5feffc 100644 --- a/src/common/LogEntry.h +++ b/src/common/LogEntry.h @@ -34,10 +34,14 @@ typedef enum { } clog_type; static const std::string CLOG_CHANNEL_NONE = "none"; -static const std::string CLOG_CHANNEL_DEFAULT = "default"; +static const std::string CLOG_CHANNEL_DEFAULT = "cluster"; static const std::string CLOG_CHANNEL_CLUSTER = "cluster"; static const std::string CLOG_CHANNEL_AUDIT = "audit"; +// this is the key name used in the config options for the default, e.g. 
+// default=true foo=false bar=false +static const std::string CLOG_CONFIG_DEFAULT_KEY = "default"; + /* * Given a clog log_type, return the equivalent syslog priority */ diff --git a/src/common/Makefile.am b/src/common/Makefile.am index 28881945357c1..2d1e2d3ed3877 100644 --- a/src/common/Makefile.am +++ b/src/common/Makefile.am @@ -33,7 +33,6 @@ libcommon_internal_la_SOURCES = \ common/str_map.cc \ common/errno.cc \ common/RefCountedObj.cc \ - common/blkdev.cc \ common/common_init.cc \ common/pipe.c \ common/ceph_argparse.cc \ @@ -73,7 +72,16 @@ libcommon_internal_la_SOURCES = \ common/linux_version.c \ common/module.c \ common/Readahead.cc \ - common/Cycles.cc + common/Cycles.cc \ + common/ContextCompletion.cc + +libcommon_internal_la_SOURCES += \ + common/blkdev.cc + +if ENABLE_XIO +libcommon_internal_la_SOURCES += \ + common/address_helper.cc +endif # these should go out of libcommon_internal libcommon_internal_la_SOURCES += \ @@ -91,13 +99,6 @@ libcommon_internal_la_SOURCES += \ LIBCOMMON_DEPS += libcommon_internal.la noinst_LTLIBRARIES += libcommon_internal.la -libcommon_api_la_SOURCES = \ - common/buffer.cc -if LINUX -libcommon_api_la_CXXFLAGS = -fvisibility=hidden -fvisibility-inlines-hidden -endif # LINUX -noinst_LTLIBRARIES += libcommon_api.la - # inject crc in common libcommon_crc_la_SOURCES = \ common/sctp_crc32.c \ @@ -112,11 +113,19 @@ endif LIBCOMMON_DEPS += libcommon_crc.la noinst_LTLIBRARIES += libcommon_crc.la +if HAVE_ARMV8_CRC +libcommon_crc_aarch64_la_SOURCES = common/crc32c_aarch64.c +libcommon_crc_aarch64_la_CFLAGS = $(AM_CFLAGS) $(ARM_CRC_FLAGS) +LIBCOMMON_DEPS += libcommon_crc_aarch64.la +noinst_LTLIBRARIES += libcommon_crc_aarch64.la +endif + noinst_HEADERS += \ common/bloom_filter.hpp \ common/sctp_crc32.h \ common/crc32c_intel_baseline.h \ - common/crc32c_intel_fast.h + common/crc32c_intel_fast.h \ + common/crc32c_aarch64.h # important; libmsg before libauth! 
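A note on the CLOG_CONFIG_DEFAULT_KEY split above: "cluster" is now the default channel a message is tagged with, while "default" survives only as the key recognized inside option values such as clog_to_monitors = "default=true audit=false". A minimal sketch of the intended fallback lookup, using a plain std::map where the real code calls get_str_map_key() with CLOG_CONFIG_DEFAULT_KEY:

    #include <map>
    #include <string>

    // Sketch only: resolve a channel's setting, falling back to the
    // "default" (CLOG_CONFIG_DEFAULT_KEY) entry when the channel has no
    // explicit entry of its own.
    static std::string channel_setting(const std::map<std::string, std::string>& opts,
                                       const std::string& channel)
    {
      std::map<std::string, std::string>::const_iterator i = opts.find(channel);
      if (i == opts.end())
        i = opts.find("default"); // CLOG_CONFIG_DEFAULT_KEY
      return i == opts.end() ? std::string() : i->second;
    }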
@@ -126,11 +135,11 @@ LIBCOMMON_DEPS += \ $(LIBCRUSH) $(LIBJSON_SPIRIT) $(LIBLOG) $(LIBARCH) if LINUX -LIBCOMMON_DEPS += -lrt +LIBCOMMON_DEPS += -lrt -lblkid -luuid endif # LINUX -libcommon_la_SOURCES = -libcommon_la_LIBADD = $(LIBCOMMON_DEPS) libcommon_api.la +libcommon_la_SOURCES = common/buffer.cc +libcommon_la_LIBADD = $(LIBCOMMON_DEPS) noinst_LTLIBRARIES += libcommon.la noinst_HEADERS += \ @@ -180,6 +189,7 @@ noinst_HEADERS += \ common/map_cacher.hpp \ common/MemoryModel.h \ common/Mutex.h \ + common/QueueRing.h \ common/PrebufferedStreambuf.h \ common/RWLock.h \ common/Semaphore.h \ @@ -220,7 +230,17 @@ noinst_HEADERS += \ common/Continuation.h \ common/Readahead.h \ common/Cycles.h \ - common/Initialize.h + common/Initialize.h \ + common/ContextCompletion.h \ + common/bit_vector.hpp \ + common/SubProcess.h \ + common/valgrind.h + +if ENABLE_XIO +noinst_HEADERS += \ + common/address_helper.h +endif + libsecret_la_SOURCES = common/secret.c libsecret_la_LIBADD = $(KEYUTILS_LIB) diff --git a/src/common/Mutex.cc b/src/common/Mutex.cc index a0c1202183dcd..cedba098128cb 100644 --- a/src/common/Mutex.cc +++ b/src/common/Mutex.cc @@ -17,19 +17,20 @@ #include "common/perf_counters.h" #include "common/ceph_context.h" #include "common/config.h" +#include "include/stringify.h" #include "include/utime.h" #include "common/Clock.h" -Mutex::Mutex(const char *n, bool r, bool ld, +Mutex::Mutex(const std::string &n, bool r, bool ld, bool bt, CephContext *cct) : - name(n), id(-1), recursive(r), lockdep(ld), backtrace(bt), - nlock(0), locked_by(0), cct(cct), logger(0) + name(n), id(-1), recursive(r), lockdep(ld), backtrace(bt), nlock(0), + locked_by(0), cct(cct), logger(0) { if (cct) { PerfCountersBuilder b(cct, string("mutex-") + name, l_mutex_first, l_mutex_last); - b.add_time_avg(l_mutex_wait, "wait"); + b.add_time_avg(l_mutex_wait, "wait", "Average time of mutex in locked state"); logger = b.create_perf_counters(); cct->get_perfcounters_collection()->add(logger); logger->set(l_mutex_wait, 0); @@ -42,7 +43,7 @@ Mutex::Mutex(const char *n, bool r, bool ld, pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE); pthread_mutex_init(&_m,&attr); pthread_mutexattr_destroy(&attr); - if (g_lockdep) + if (lockdep && g_lockdep) _register(); } else if (lockdep) { @@ -55,6 +56,7 @@ Mutex::Mutex(const char *n, bool r, bool ld, pthread_mutexattr_init(&attr); pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ERRORCHECK); pthread_mutex_init(&_m, &attr); + pthread_mutexattr_destroy(&attr); if (g_lockdep) _register(); } @@ -74,6 +76,9 @@ Mutex::~Mutex() { cct->get_perfcounters_collection()->remove(logger); delete logger; } + if (lockdep && g_lockdep) { + lockdep_unregister(id); + } } void Mutex::Lock(bool no_lockdep) { diff --git a/src/common/Mutex.h b/src/common/Mutex.h index 758157536b708..6a4e6b385eead 100644 --- a/src/common/Mutex.h +++ b/src/common/Mutex.h @@ -33,7 +33,7 @@ enum { class Mutex { private: - const char *name; + std::string name; int id; bool recursive; bool lockdep; @@ -50,20 +50,20 @@ class Mutex { Mutex(const Mutex &M); void _register() { - id = lockdep_register(name); + id = lockdep_register(name.c_str()); } void _will_lock() { // about to lock - id = lockdep_will_lock(name, id); + id = lockdep_will_lock(name.c_str(), id, backtrace); } void _locked() { // just locked - id = lockdep_locked(name, id, backtrace); + id = lockdep_locked(name.c_str(), id, backtrace); } void _will_unlock() { // about to unlock - id = lockdep_will_unlock(name, id); + id = lockdep_will_unlock(name.c_str(), id); } public: - 
Mutex(const char *n, bool r = false, bool ld=true, bool bt=false, + Mutex(const std::string &n, bool r = false, bool ld=true, bool bt=false, CephContext *cct = 0); ~Mutex(); bool is_locked() const { diff --git a/src/common/Preforker.h b/src/common/Preforker.h index c28fd13ffaf91..ea64f6e764b6d 100644 --- a/src/common/Preforker.h +++ b/src/common/Preforker.h @@ -3,11 +3,20 @@ #ifndef CEPH_COMMON_PREFORKER_H #define CEPH_COMMON_PREFORKER_H +#include "acconfig.h" #include #include #include #include +#include #include +#ifdef WITH_LTTNG +#include +#endif +#include +#include + +#include "include/assert.h" #include "common/safe_io.h" #include "common/errno.h" @@ -24,6 +33,9 @@ class Preforker { pid_t childpid; bool forked; int fd[2]; // parent's, child's +#ifdef WITH_LTTNG + sigset_t sigset; +#endif public: Preforker() @@ -31,22 +43,35 @@ class Preforker { forked(false) {} - void prefork() { + int prefork(std::string &err) { assert(!forked); int r = socketpair(AF_UNIX, SOCK_STREAM, 0, fd); + std::ostringstream oss; if (r < 0) { - cerr << "[" << getpid() << "]: unable to create socketpair: " << cpp_strerror(errno) << std::endl; - exit(errno); + oss << "[" << getpid() << "]: unable to create socketpair: " << cpp_strerror(errno); + err = oss.str(); + return r; } +#ifdef WITH_LTTNG + ust_before_fork(&sigset); +#endif + forked = true; childpid = fork(); + if (childpid < 0) { + r = -errno; + oss << "[" << getpid() << "]: unable to fork: " << cpp_strerror(errno); + err = oss.str(); + return r; + } if (childpid == 0) { - ::close(fd[0]); + child_after_fork(); } else { - ::close(fd[1]); + parent_after_fork(); } + return 0; } bool is_child() { @@ -57,10 +82,11 @@ class Preforker { return childpid != 0; } - int parent_wait() { + int parent_wait(std::string &err_msg) { assert(forked); int r = -1; + std::ostringstream oss; int err = safe_read_exact(fd[0], &r, sizeof(r)); if (err == 0 && r == -1) { // daemonize @@ -69,12 +95,25 @@ class Preforker { ::close(2); r = 0; } else if (err) { - cerr << "[" << getpid() << "]: " << cpp_strerror(err) << std::endl; + oss << "[" << getpid() << "]: " << cpp_strerror(err); } else { // wait for child to exit - waitpid(childpid, NULL, 0); + int status; + err = waitpid(childpid, &status, 0); + if (err < 0) { + oss << "[" << getpid() << "]" << " waitpid error: " << cpp_strerror(err); + } else if (WIFSIGNALED(status)) { + oss << "[" << getpid() << "]" << " exited with a signal"; + } else if (!WIFEXITED(status)) { + oss << "[" << getpid() << "]" << " did not exit normally"; + } else { + err = WEXITSTATUS(status); + if (err != 0) + oss << "[" << getpid() << "]" << " returned exit_status " << cpp_strerror(err); + } } - return r; + err_msg = oss.str(); + return err; } int signal_exit(int r) { @@ -99,6 +138,20 @@ class Preforker { r += r2; // make the compiler shut up about the unused return code from ::write(2). 
} +private: + void child_after_fork() { +#ifdef WITH_LTTNG + ust_after_fork_child(&sigset); +#endif + ::close(fd[0]); + } + + void parent_after_fork() { +#ifdef WITH_LTTNG + ust_after_fork_parent(&sigset); +#endif + ::close(fd[1]); + } }; #endif diff --git a/src/common/PrioritizedQueue.h b/src/common/PrioritizedQueue.h index ee1dc9cd0e175..5ae94a5fbc28e 100644 --- a/src/common/PrioritizedQueue.h +++ b/src/common/PrioritizedQueue.h @@ -49,13 +49,14 @@ class PrioritizedQueue { int64_t max_tokens_per_subqueue; int64_t min_cost; + typedef std::list > ListPairs; template static unsigned filter_list_pairs( - list > *l, F f, - list *out) { + ListPairs *l, F f, + std::list *out) { unsigned ret = 0; if (out) { - for (typename list >::reverse_iterator i = l->rbegin(); + for (typename ListPairs::reverse_iterator i = l->rbegin(); i != l->rend(); ++i) { if (f(i->second)) { @@ -63,7 +64,7 @@ class PrioritizedQueue { } } } - for (typename list >::iterator i = l->begin(); + for (typename ListPairs::iterator i = l->begin(); i != l->end(); ) { if (f(i->second)) { @@ -78,10 +79,11 @@ class PrioritizedQueue { struct SubQueue { private: - map > > q; + typedef std::map Classes; + Classes q; unsigned tokens, max_tokens; int64_t size; - typename map > >::iterator cur; + typename Classes::iterator cur; public: SubQueue(const SubQueue &other) : q(other.q), @@ -114,18 +116,18 @@ class PrioritizedQueue { tokens = 0; } void enqueue(K cl, unsigned cost, T item) { - q[cl].push_back(make_pair(cost, item)); + q[cl].push_back(std::make_pair(cost, item)); if (cur == q.end()) cur = q.begin(); size++; } void enqueue_front(K cl, unsigned cost, T item) { - q[cl].push_front(make_pair(cost, item)); + q[cl].push_front(std::make_pair(cost, item)); if (cur == q.end()) cur = q.begin(); size++; } - pair front() const { + std::pair front() const { assert(!(q.empty())); assert(cur != q.end()); return cur->second.front(); @@ -150,8 +152,8 @@ class PrioritizedQueue { return q.empty(); } template - void remove_by_filter(F f, list *out) { - for (typename map > >::iterator i = q.begin(); + void remove_by_filter(F f, std::list *out) { + for (typename Classes::iterator i = q.begin(); i != q.end(); ) { size -= filter_list_pairs(&(i->second), f, out); @@ -166,15 +168,15 @@ class PrioritizedQueue { if (cur == q.end()) cur = q.begin(); } - void remove_by_class(K k, list *out) { - typename map > >::iterator i = q.find(k); + void remove_by_class(K k, std::list *out) { + typename Classes::iterator i = q.find(k); if (i == q.end()) return; size -= i->second.size(); if (i == cur) ++cur; if (out) { - for (typename list >::reverse_iterator j = + for (typename ListPairs::reverse_iterator j = i->second.rbegin(); j != i->second.rend(); ++j) { @@ -195,11 +197,13 @@ class PrioritizedQueue { f->dump_int("first_item_cost", front().first); } }; - map high_queue; - map queue; + + typedef std::map SubQueues; + SubQueues high_queue; + SubQueues queue; SubQueue *create_queue(unsigned priority) { - typename map::iterator p = queue.find(priority); + typename SubQueues::iterator p = queue.find(priority); if (p != queue.end()) return &p->second; total_priority += priority; @@ -218,7 +222,7 @@ class PrioritizedQueue { void distribute_tokens(unsigned cost) { if (total_priority == 0) return; - for (typename map::iterator i = queue.begin(); + for (typename SubQueues::iterator i = queue.begin(); i != queue.end(); ++i) { i->second.put_tokens(((i->first * cost) / total_priority) + 1); @@ -234,13 +238,13 @@ class PrioritizedQueue { unsigned length() const { unsigned total = 0; - 
for (typename map::const_iterator i = queue.begin(); + for (typename SubQueues::const_iterator i = queue.begin(); i != queue.end(); ++i) { assert(i->second.length()); total += i->second.length(); } - for (typename map::const_iterator i = high_queue.begin(); + for (typename SubQueues::const_iterator i = high_queue.begin(); i != high_queue.end(); ++i) { assert(i->second.length()); @@ -250,8 +254,8 @@ class PrioritizedQueue { } template - void remove_by_filter(F f, list *removed = 0) { - for (typename map::iterator i = queue.begin(); + void remove_by_filter(F f, std::list *removed = 0) { + for (typename SubQueues::iterator i = queue.begin(); i != queue.end(); ) { unsigned priority = i->first; @@ -264,7 +268,7 @@ class PrioritizedQueue { ++i; } } - for (typename map::iterator i = high_queue.begin(); + for (typename SubQueues::iterator i = high_queue.begin(); i != high_queue.end(); ) { i->second.remove_by_filter(f, removed); @@ -276,8 +280,8 @@ class PrioritizedQueue { } } - void remove_by_class(K k, list *out = 0) { - for (typename map::iterator i = queue.begin(); + void remove_by_class(K k, std::list *out = 0) { + for (typename SubQueues::iterator i = queue.begin(); i != queue.end(); ) { i->second.remove_by_class(k, out); @@ -289,7 +293,7 @@ class PrioritizedQueue { ++i; } } - for (typename map::iterator i = high_queue.begin(); + for (typename SubQueues::iterator i = high_queue.begin(); i != high_queue.end(); ) { i->second.remove_by_class(k, out); @@ -345,7 +349,7 @@ class PrioritizedQueue { // if there are multiple buckets/subqueues with sufficient tokens, // we behave like a strict priority queue among all subqueues that // are eligible to run. - for (typename map::iterator i = queue.begin(); + for (typename SubQueues::iterator i = queue.begin(); i != queue.end(); ++i) { assert(!(i->second.empty())); @@ -377,7 +381,7 @@ class PrioritizedQueue { f->dump_int("max_tokens_per_subqueue", max_tokens_per_subqueue); f->dump_int("min_cost", min_cost); f->open_array_section("high_queues"); - for (typename map::const_iterator p = high_queue.begin(); + for (typename SubQueues::const_iterator p = high_queue.begin(); p != high_queue.end(); ++p) { f->open_object_section("subqueue"); @@ -387,7 +391,7 @@ class PrioritizedQueue { } f->close_section(); f->open_array_section("queues"); - for (typename map::const_iterator p = queue.begin(); + for (typename SubQueues::const_iterator p = queue.begin(); p != queue.end(); ++p) { f->open_object_section("subqueue"); diff --git a/src/common/QueueRing.h b/src/common/QueueRing.h new file mode 100644 index 0000000000000..830f80f8442e0 --- /dev/null +++ b/src/common/QueueRing.h @@ -0,0 +1,61 @@ +#ifndef QUEUE_RING_H +#define QUEUE_RING_H + +#include +#include +#include "common/Mutex.h" +#include "common/Cond.h" + + + +template +class QueueRing { + struct QueueBucket { + Mutex lock; + Cond cond; + typename std::list entries; + + QueueBucket() : lock("QueueRing::QueueBucket::lock") {} + QueueBucket(const QueueBucket& rhs) : lock("QueueRing::QueueBucket::lock") { + entries = rhs.entries; + } + + void enqueue(const T& entry) { + lock.Lock(); + if (entries.empty()) { + cond.Signal(); + } + entries.push_back(entry); + lock.Unlock(); + } + + void dequeue(T *entry) { + lock.Lock(); + if (entries.empty()) { + cond.Wait(lock); + }; + assert(!entries.empty()); + *entry = entries.front(); + entries.pop_front(); + lock.Unlock(); + }; + }; + + std::vector buckets; + int num_buckets; + atomic_t cur_read_bucket; + atomic_t cur_write_bucket; +public: + QueueRing(int n) : buckets(n), 
num_buckets(n) { + } + + void enqueue(const T& entry) { + buckets[cur_write_bucket.inc() % num_buckets].enqueue(entry); + }; + + void dequeue(T *entry) { + buckets[cur_read_bucket.inc() % num_buckets].dequeue(entry); + } +}; + +#endif diff --git a/src/common/RWLock.h b/src/common/RWLock.h index 6f0ab8ed09437..47a8c87f500a2 100644 --- a/src/common/RWLock.h +++ b/src/common/RWLock.h @@ -18,6 +18,7 @@ #define CEPH_RWLock_Posix__H #include +#include #include #include "lockdep.h" #include "include/atomic.h" @@ -25,57 +26,70 @@ class RWLock { mutable pthread_rwlock_t L; - const char *name; + std::string name; mutable int id; mutable atomic_t nrlock, nwlock; + bool track; + + std::string unique_name(const char* name) const; public: RWLock(const RWLock& other); const RWLock& operator=(const RWLock& other); - RWLock(const char *n) : name(n), id(-1), nrlock(0), nwlock(0) { + RWLock(const std::string &n, bool track_lock=true) : name(n), id(-1), nrlock(0), nwlock(0), track(track_lock) { pthread_rwlock_init(&L, NULL); - if (g_lockdep) id = lockdep_register(name); + if (g_lockdep) id = lockdep_register(name.c_str()); } bool is_locked() const { + assert(track); return (nrlock.read() > 0) || (nwlock.read() > 0); } bool is_wlocked() const { + assert(track); return (nwlock.read() > 0); } virtual ~RWLock() { // The following check is racy but we are about to destroy // the object and we assume that there are no other users. - assert(!is_locked()); + if (track) + assert(!is_locked()); pthread_rwlock_destroy(&L); + if (g_lockdep) { + lockdep_unregister(id); + } } void unlock(bool lockdep=true) const { - if (nwlock.read() > 0) { - nwlock.dec(); - } else { - assert(nrlock.read() > 0); - nrlock.dec(); + if (track) { + if (nwlock.read() > 0) { + nwlock.dec(); + } else { + assert(nrlock.read() > 0); + nrlock.dec(); + } } - if (lockdep && g_lockdep) id = lockdep_will_unlock(name, id); + if (lockdep && g_lockdep) id = lockdep_will_unlock(name.c_str(), id); int r = pthread_rwlock_unlock(&L); assert(r == 0); } // read void get_read() const { - if (g_lockdep) id = lockdep_will_lock(name, id); + if (g_lockdep) id = lockdep_will_lock(name.c_str(), id); int r = pthread_rwlock_rdlock(&L); assert(r == 0); - if (g_lockdep) id = lockdep_locked(name, id); - nrlock.inc(); + if (g_lockdep) id = lockdep_locked(name.c_str(), id); + if (track) + nrlock.inc(); } bool try_get_read() const { if (pthread_rwlock_tryrdlock(&L) == 0) { - nrlock.inc(); - if (g_lockdep) id = lockdep_locked(name, id); + if (track) + nrlock.inc(); + if (g_lockdep) id = lockdep_locked(name.c_str(), id); return true; } return false; @@ -86,17 +100,19 @@ class RWLock // write void get_write(bool lockdep=true) { - if (lockdep && g_lockdep) id = lockdep_will_lock(name, id); + if (lockdep && g_lockdep) id = lockdep_will_lock(name.c_str(), id); int r = pthread_rwlock_wrlock(&L); assert(r == 0); - if (g_lockdep) id = lockdep_locked(name, id); - nwlock.inc(); + if (g_lockdep) id = lockdep_locked(name.c_str(), id); + if (track) + nwlock.inc(); } bool try_get_write(bool lockdep=true) { if (pthread_rwlock_trywrlock(&L) == 0) { - if (lockdep && g_lockdep) id = lockdep_locked(name, id); - nwlock.inc(); + if (lockdep && g_lockdep) id = lockdep_locked(name.c_str(), id); + if (track) + nwlock.inc(); return true; } return false; diff --git a/src/common/Readahead.cc b/src/common/Readahead.cc index a3f5bfc7e4e74..b1ee2e099c6f3 100644 --- a/src/common/Readahead.cc +++ b/src/common/Readahead.cc @@ -30,6 +30,10 @@ Readahead::extent_t Readahead::update(const vector& extents, uint64_t 
for (vector<extent_t>::const_iterator p = extents.begin(); p != extents.end(); ++p) {
     _observe_read(p->first, p->second);
   }
+  if (m_readahead_pos >= limit) {
+    m_lock.Unlock();
+    return extent_t(0, 0);
+  }
   pair<uint64_t, uint64_t> extent = _compute_readahead(limit);
   m_lock.Unlock();
   return extent;
@@ -38,6 +42,10 @@ Readahead::extent_t Readahead::update(const vector<extent_t>& extents, uint64_t
 Readahead::extent_t Readahead::update(uint64_t offset, uint64_t length, uint64_t limit) {
   m_lock.Lock();
   _observe_read(offset, length);
+  if (m_readahead_pos >= limit) {
+    m_lock.Unlock();
+    return extent_t(0, 0);
+  }
   extent_t extent = _compute_readahead(limit);
   m_lock.Unlock();
   return extent;
@@ -52,6 +60,7 @@ void Readahead::_observe_read(uint64_t offset, uint64_t length) {
     m_consec_read_bytes = 0;
     m_readahead_trigger_pos = 0;
     m_readahead_size = 0;
+    m_readahead_pos = 0;
   }
   m_last_pos = offset + length;
 }
@@ -70,6 +79,9 @@ Readahead::extent_t Readahead::_compute_readahead(uint64_t limit) {
   } else {
     // continuing readahead trigger
     m_readahead_size *= 2;
+    if (m_last_pos > m_readahead_pos) {
+      m_readahead_pos = m_last_pos;
+    }
   }
   m_readahead_size = MAX(m_readahead_size, m_readahead_min_bytes);
   m_readahead_size = MIN(m_readahead_size, m_readahead_max_bytes);
diff --git a/src/common/RefCountedObj.h b/src/common/RefCountedObj.h
index 729bbb9fd4b87..3755018f80114 100644
--- a/src/common/RefCountedObj.h
+++ b/src/common/RefCountedObj.h
@@ -52,7 +52,7 @@ struct RefCountedObject {
     cct = c;
   }
-  uint64_t get_nref() {
+  uint64_t get_nref() const {
     return nref.read();
   }
 };
diff --git a/src/common/SubProcess.h b/src/common/SubProcess.h
new file mode 100644
index 0000000000000..3d739849193d4
--- /dev/null
+++ b/src/common/SubProcess.h
@@ -0,0 +1,466 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph distributed storage system
+ *
+ * Copyright (C) 2015 Mirantis Inc
+ *
+ * Author: Mykola Golub
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#ifndef SUB_PROCESS_H
+#define SUB_PROCESS_H
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+
+/**
+ * SubProcess:
+ * A helper class to spawn a subprocess.
+ *
+ * Example:
+ *
+ *   SubProcess cat("cat", true, true);
+ *   if (cat.spawn() != 0) {
+ *     std::cerr << "cat failed: " << cat.err() << std::endl;
+ *     return false;
+ *   }
+ *   write_to_fd(cat.stdin(), "hello world!\n");
+ *   cat.close_stdin();
+ *   read_from_fd(cat.stdout(), buf);
+ *   if (cat.join() != 0) {
+ *     std::cerr << cat.err() << std::endl;
+ *     return false;
+ *   }
+ */
+
+class SubProcess {
+public:
+  SubProcess(const char *cmd, bool pipe_stdin = false, bool pipe_stdout = false,
+             bool pipe_stderr = false);
+  virtual ~SubProcess();
+
+  void add_cmd_args(const char *arg, ...);
+  void add_cmd_arg(const char *arg);
+
+  virtual int spawn(); // Returns 0 on success or -errno on failure.
+  virtual int join();  // Returns exit code (0 on success).
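+  // Lifecycle: after a successful spawn() the caller must eventually call
+  // join(); join() closes any pipe fds that are still open and reaps the
+  // child, and the destructor asserts that this has happened.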
+
+  bool is_spawned() const { return pid > 0; }
+
+  int stdin() const;
+  int stdout() const;
+  int stderr() const;
+
+  void close_stdin();
+  void close_stdout();
+  void close_stderr();
+
+  void kill(int signo = SIGTERM) const;
+
+  std::string err() const;
+
+protected:
+  bool is_child() const { return pid == 0; }
+  virtual void exec();
+
+private:
+  void close(int &fd);
+
+protected:
+  std::string cmd;
+  std::vector<std::string> cmd_args;
+  bool pipe_stdin;
+  bool pipe_stdout;
+  bool pipe_stderr;
+  int stdin_pipe_out_fd;
+  int stdout_pipe_in_fd;
+  int stderr_pipe_in_fd;
+  int pid;
+  std::ostringstream errstr;
+};
+
+class SubProcessTimed : public SubProcess {
+public:
+  SubProcessTimed(const char *cmd, bool pipe_stdin = false,
+                  bool pipe_stdout = false, bool pipe_stderr = false,
+                  int timeout = 0, int sigkill = SIGKILL);
+
+protected:
+  virtual void exec();
+
+private:
+  int timeout;
+  int sigkill;
+};
+
+SubProcess::SubProcess(const char *cmd_, bool stdin, bool stdout, bool stderr) :
+  cmd(cmd_),
+  cmd_args(),
+  pipe_stdin(stdin),
+  pipe_stdout(stdout),
+  pipe_stderr(stderr),
+  stdin_pipe_out_fd(-1),
+  stdout_pipe_in_fd(-1),
+  stderr_pipe_in_fd(-1),
+  pid(-1),
+  errstr() {
+}
+
+SubProcess::~SubProcess() {
+  assert(!is_spawned());
+  assert(stdin_pipe_out_fd == -1);
+  assert(stdout_pipe_in_fd == -1);
+  assert(stderr_pipe_in_fd == -1);
+}
+
+void SubProcess::add_cmd_args(const char *arg, ...) {
+  assert(!is_spawned());
+
+  va_list ap;
+  va_start(ap, arg);
+  const char *p = arg;
+  do {
+    add_cmd_arg(p);
+    p = va_arg(ap, const char*);
+  } while (p != NULL);
+  va_end(ap);
+}
+
+void SubProcess::add_cmd_arg(const char *arg) {
+  assert(!is_spawned());
+
+  cmd_args.push_back(arg);
+}
+
+int SubProcess::stdin() const {
+  assert(is_spawned());
+  assert(pipe_stdin);
+
+  return stdin_pipe_out_fd;
+}
+
+int SubProcess::stdout() const {
+  assert(is_spawned());
+  assert(pipe_stdout);
+
+  return stdout_pipe_in_fd;
+}
+
+int SubProcess::stderr() const {
+  assert(is_spawned());
+  assert(pipe_stderr);
+
+  return stderr_pipe_in_fd;
+}
+
+void SubProcess::close(int &fd) {
+  if (fd == -1)
+    return;
+
+  ::close(fd);
+  fd = -1;
+}
+
+void SubProcess::close_stdin() {
+  assert(is_spawned());
+  assert(pipe_stdin);
+
+  close(stdin_pipe_out_fd);
+}
+
+void SubProcess::close_stdout() {
+  assert(is_spawned());
+  assert(pipe_stdout);
+
+  close(stdout_pipe_in_fd);
+}
+
+void SubProcess::close_stderr() {
+  assert(is_spawned());
+  assert(pipe_stderr);
+
+  close(stderr_pipe_in_fd);
+}
+
+void SubProcess::kill(int signo) const {
+  assert(is_spawned());
+
+  int ret = ::kill(pid, signo);
+  assert(ret == 0);
+}
+
+std::string SubProcess::err() const {
+  // Return by value: returning errstr.str().c_str() would hand out a
+  // pointer into a temporary string that is destroyed immediately.
+  return errstr.str();
+}
+
+int SubProcess::spawn() {
+  assert(!is_spawned());
+  assert(stdin_pipe_out_fd == -1);
+  assert(stdout_pipe_in_fd == -1);
+  assert(stderr_pipe_in_fd == -1);
+
+  enum { IN = 0, OUT = 1 };
+
+  int ipipe[2], opipe[2], epipe[2];
+
+  ipipe[0] = ipipe[1] = opipe[0] = opipe[1] = epipe[0] = epipe[1] = -1;
+
+  int ret = 0;
+
+  if ((pipe_stdin && ::pipe(ipipe) == -1) ||
+      (pipe_stdout && ::pipe(opipe) == -1) ||
+      (pipe_stderr && ::pipe(epipe) == -1)) {
+    ret = -errno;
+    errstr << "pipe failed: " << cpp_strerror(errno);
+    goto fail;
+  }
+
+  pid = fork();
+
+  if (pid > 0) { // Parent
+    stdin_pipe_out_fd = ipipe[OUT]; close(ipipe[IN ]);
+    stdout_pipe_in_fd = opipe[IN ]; close(opipe[OUT]);
+    stderr_pipe_in_fd = epipe[IN ]; close(epipe[OUT]);
+    return 0;
+  }
+
+  if (pid == 0) { // Child
+    close(ipipe[OUT]);
+    close(opipe[IN ]);
+    close(epipe[IN ]);
+
+    if (ipipe[IN] != -1
&& ipipe[IN] != STDIN_FILENO) { + ::dup2(ipipe[IN], STDIN_FILENO); + close(ipipe[IN]); + } + if (opipe[OUT] != -1 && opipe[OUT] != STDOUT_FILENO) { + ::dup2(opipe[OUT], STDOUT_FILENO); + close(opipe[OUT]); + } + if (epipe[OUT] != -1 && epipe[OUT] != STDERR_FILENO) { + ::dup2(epipe[OUT], STDERR_FILENO); + close(epipe[OUT]); + } + + int maxfd = sysconf(_SC_OPEN_MAX); + if (maxfd == -1) + maxfd = 16384; + for (int fd = 0; fd <= maxfd; fd++) { + if (fd == STDIN_FILENO && pipe_stdin) + continue; + if (fd == STDOUT_FILENO && pipe_stdout) + continue; + if (fd == STDERR_FILENO && pipe_stderr) + continue; + ::close(fd); + } + + exec(); + assert(0); // Never reached + } + + ret = -errno; + errstr << "fork failed: " << cpp_strerror(errno); + +fail: + close(ipipe[0]); + close(ipipe[1]); + close(opipe[0]); + close(opipe[1]); + close(epipe[0]); + close(epipe[1]); + + return ret; +} + +void SubProcess::exec() { + assert(is_child()); + + std::vector args; + args.push_back(cmd.c_str()); + for (std::vector::iterator i = cmd_args.begin(); + i != cmd_args.end(); + i++) { + args.push_back(i->c_str()); + } + args.push_back(NULL); + + int ret = execvp(cmd.c_str(), (char * const *)&args[0]); + assert(ret == -1); + + std::ostringstream err; + err << cmd << ": exec failed: " << cpp_strerror(errno) << "\n"; + write(STDERR_FILENO, err.str().c_str(), err.str().size()); + _exit(EXIT_FAILURE); +} + +int SubProcess::join() { + assert(is_spawned()); + + close(stdin_pipe_out_fd); + close(stdout_pipe_in_fd); + close(stderr_pipe_in_fd); + + int status; + + while (waitpid(pid, &status, 0) == -1) + assert(errno == EINTR); + + pid = -1; + + if (WIFEXITED(status)) { + if (WEXITSTATUS(status) != EXIT_SUCCESS) + errstr << cmd << ": exit status: " << WEXITSTATUS(status); + return WEXITSTATUS(status); + } + if (WIFSIGNALED(status)) { + errstr << cmd << ": got signal: " << WTERMSIG(status); + return 128 + WTERMSIG(status); + } + errstr << cmd << ": waitpid: unknown status returned\n"; + return EXIT_FAILURE; +} + +SubProcessTimed::SubProcessTimed(const char *cmd, bool pipe_stdin, + bool pipe_stdout, bool pipe_stderr, + int timeout_, int sigkill_) : + SubProcess(cmd, pipe_stdin, pipe_stdout, pipe_stderr), + timeout(timeout_), + sigkill(sigkill_) { +} + +static bool timedout = false; // only used after fork +static void timeout_sighandler(int sig) { + timedout = true; +} +static void dummy_sighandler(int sig) {} + +void SubProcessTimed::exec() { + assert(is_child()); + + if (timeout <= 0) { + SubProcess::exec(); + assert(0); // Never reached + } + + sigset_t mask, oldmask; + std::ostringstream err; + int pid; + + // Restore default action for SIGTERM in case the parent process decided + // to ignore it. + if (signal(SIGTERM, SIG_DFL) == SIG_ERR) { + err << cmd << ": signal failed: " << cpp_strerror(errno) << "\n"; + goto fail_exit; + } + // Because SIGCHLD is ignored by default, setup dummy handler for it, + // so we can mask it. + if (signal(SIGCHLD, dummy_sighandler) == SIG_ERR) { + err << cmd << ": signal failed: " << cpp_strerror(errno) << "\n"; + goto fail_exit; + } + // Setup timeout handler. + if (signal(SIGALRM, timeout_sighandler) == SIG_ERR) { + err << cmd << ": signal failed: " << cpp_strerror(errno) << "\n"; + goto fail_exit; + } + // Block interesting signals. 
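+  // The blocked set is consumed with sigwait() in the loop below: SIGCHLD
+  // means the command exited and can be reaped, SIGALRM means the timeout
+  // expired (the child's process group is then killed with `sigkill`), and
+  // SIGINT/SIGTERM are forwarded to the child so the command can still be
+  // interrupted by the usual means.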
+ sigemptyset(&mask); + sigaddset(&mask, SIGINT); + sigaddset(&mask, SIGTERM); + sigaddset(&mask, SIGCHLD); + sigaddset(&mask, SIGALRM); + if (sigprocmask(SIG_SETMASK, &mask, &oldmask) == -1) { + err << cmd << ": sigprocmask failed: " << cpp_strerror(errno) << "\n"; + goto fail_exit; + } + + pid = fork(); + + if (pid == -1) { + err << cmd << ": fork failed: " << cpp_strerror(errno) << "\n"; + goto fail_exit; + } + + if (pid == 0) { // Child + // Restore old sigmask. + if (sigprocmask(SIG_SETMASK, &oldmask, NULL) == -1) { + err << cmd << ": sigprocmask failed: " << cpp_strerror(errno) << "\n"; + goto fail_exit; + } + (void)setpgid(0, 0); // Become process group leader. + SubProcess::exec(); + assert(0); // Never reached + } + + // Parent + (void)alarm(timeout); + + for (;;) { + int signo; + if (sigwait(&mask, &signo) == -1) { + err << cmd << ": sigwait failed: " << cpp_strerror(errno) << "\n"; + goto fail_exit; + } + switch (signo) { + case SIGCHLD: + int status; + if (waitpid(pid, &status, WNOHANG) == -1) { + err << cmd << ": waitpid failed: " << cpp_strerror(errno) << "\n"; + goto fail_exit; + } + write(STDERR_FILENO, err.str().c_str(), err.str().size()); + if (WIFEXITED(status)) + _exit(WEXITSTATUS(status)); + if (WIFSIGNALED(status)) + _exit(128 + WTERMSIG(status)); + err << cmd << ": unknown status returned\n"; + goto fail_exit; + case SIGINT: + case SIGTERM: + // Pass SIGINT and SIGTERM, which are usually used to terminate + // a process, to the child. + if (::kill(pid, signo) == -1) { + err << cmd << ": kill failed: " << cpp_strerror(errno) << "\n"; + goto fail_exit; + } + continue; + case SIGALRM: + err << cmd << ": timed out (" << timeout << " sec)\n"; + write(STDERR_FILENO, err.str().c_str(), err.str().size()); + if (::killpg(pid, sigkill) == -1) { + err << cmd << ": kill failed: " << cpp_strerror(errno) << "\n"; + goto fail_exit; + } + continue; + default: + err << cmd << ": sigwait: invalid signal: " << signo << "\n"; + goto fail_exit; + } + } + +fail_exit: + write(STDERR_FILENO, err.str().c_str(), err.str().size()); + _exit(EXIT_FAILURE); +} + +#endif diff --git a/src/common/TextTable.h b/src/common/TextTable.h index 282bf733ba5e5..d17b2652243d6 100644 --- a/src/common/TextTable.h +++ b/src/common/TextTable.h @@ -118,7 +118,6 @@ class TextTable { } // now store the rendered item with its proper width - oss << std::setw(width) << item; row[currow][curcol] = oss.str(); curcol++; diff --git a/src/common/Thread.cc b/src/common/Thread.cc index 584e97bbd8247..f9c6a690250c3 100644 --- a/src/common/Thread.cc +++ b/src/common/Thread.cc @@ -1,4 +1,4 @@ - // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab /* * Ceph - scalable distributed file system @@ -27,13 +27,34 @@ #include #include #include +#ifdef HAVE_SCHED +#include +#endif +static int _set_affinity(int id) +{ +#ifdef HAVE_SCHED + if (id >= 0 && id < CPU_SETSIZE) { + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + + CPU_SET(id, &cpuset); + + if (sched_setaffinity(0, sizeof(cpuset), &cpuset) < 0) + return -errno; + /* guaranteed to take effect immediately */ + sched_yield(); + } +#endif + return 0; +} Thread::Thread() : thread_id(0), pid(0), ioprio_class(-1), - ioprio_priority(-1) + ioprio_priority(-1), + cpuid(-1) { } @@ -58,10 +79,12 @@ void *Thread::entry_wrapper() pid, IOPRIO_PRIO_VALUE(ioprio_class, ioprio_priority)); } + if (pid && cpuid >= 0) + _set_affinity(cpuid); return entry(); } -const pthread_t 
&Thread::get_thread_id() +const pthread_t &Thread::get_thread_id() const { return thread_id; } @@ -71,7 +94,7 @@ bool Thread::is_started() const return thread_id != 0; } -bool Thread::am_self() +bool Thread::am_self() const { return (pthread_self() == thread_id); } @@ -87,11 +110,11 @@ int Thread::kill(int signal) int Thread::try_create(size_t stacksize) { pthread_attr_t *thread_attr = NULL; + pthread_attr_t thread_attr_loc; + stacksize &= CEPH_PAGE_MASK; // must be multiple of page if (stacksize) { - thread_attr = (pthread_attr_t*) malloc(sizeof(pthread_attr_t)); - if (!thread_attr) - return -ENOMEM; + thread_attr = &thread_attr_loc; pthread_attr_init(thread_attr); pthread_attr_setstacksize(thread_attr, stacksize); } @@ -113,8 +136,6 @@ int Thread::try_create(size_t stacksize) r = pthread_create(&thread_id, thread_attr, _entry_func, (void*)this); restore_sigset(&old_sigset); - if (thread_attr) - free(thread_attr); return r; } @@ -138,7 +159,14 @@ int Thread::join(void **prval) } int status = pthread_join(thread_id, prval); - assert(status == 0); + if (status != 0) { + char buf[256]; + snprintf(buf, sizeof(buf), "Thread::join(): pthread_join " + "failed with error %d\n", status); + dout_emergency(buf); + assert(status == 0); + } + thread_id = 0; return status; } @@ -159,3 +187,11 @@ int Thread::set_ioprio(int cls, int prio) IOPRIO_PRIO_VALUE(cls, prio)); return 0; } + +int Thread::set_affinity(int id) +{ + cpuid = id; + if (pid && ceph_gettid() == pid) + _set_affinity(id); + return 0; +} diff --git a/src/common/Thread.h b/src/common/Thread.h index 7889c91345677..e284bdafaaba8 100644 --- a/src/common/Thread.h +++ b/src/common/Thread.h @@ -24,11 +24,12 @@ class Thread { pthread_t thread_id; pid_t pid; int ioprio_class, ioprio_priority; + int cpuid; void *entry_wrapper(); public: - Thread(const Thread& other); + explicit Thread(const Thread& other); const Thread& operator=(const Thread& other); Thread(); @@ -41,16 +42,17 @@ class Thread { static void *_entry_func(void *arg); public: - const pthread_t &get_thread_id(); + const pthread_t &get_thread_id() const; pid_t get_pid() const { return pid; } bool is_started() const; - bool am_self(); + bool am_self() const; int kill(int signal); int try_create(size_t stacksize); void create(size_t stacksize = 0); int join(void **prval = 0); int detach(); int set_ioprio(int cls, int prio); + int set_affinity(int cpuid); }; #endif diff --git a/src/common/Throttle.cc b/src/common/Throttle.cc index 026d731e839b5..307c0ec9e4b85 100644 --- a/src/common/Throttle.cc +++ b/src/common/Throttle.cc @@ -29,9 +29,9 @@ enum { l_throttle_last, }; -Throttle::Throttle(CephContext *cct, std::string n, int64_t m, bool _use_perf) +Throttle::Throttle(CephContext *cct, const std::string& n, int64_t m, bool _use_perf) : cct(cct), name(n), logger(NULL), - max(m), + max(m), lock("Throttle::lock"), use_perf(_use_perf) { @@ -42,17 +42,17 @@ Throttle::Throttle(CephContext *cct, std::string n, int64_t m, bool _use_perf) if (cct->_conf->throttler_perf_counter) { PerfCountersBuilder b(cct, string("throttle-") + name, l_throttle_first, l_throttle_last); - b.add_u64_counter(l_throttle_val, "val"); - b.add_u64_counter(l_throttle_max, "max"); - b.add_u64_counter(l_throttle_get, "get"); - b.add_u64_counter(l_throttle_get_sum, "get_sum"); - b.add_u64_counter(l_throttle_get_or_fail_fail, "get_or_fail_fail"); - b.add_u64_counter(l_throttle_get_or_fail_success, "get_or_fail_success"); - b.add_u64_counter(l_throttle_take, "take"); - b.add_u64_counter(l_throttle_take_sum, "take_sum"); - 
b.add_u64_counter(l_throttle_put, "put");
-    b.add_u64_counter(l_throttle_put_sum, "put_sum");
-    b.add_time_avg(l_throttle_wait, "wait");
+    b.add_u64_counter(l_throttle_val, "val", "Currently available throttle");
+    b.add_u64_counter(l_throttle_max, "max", "Max value for throttle");
+    b.add_u64_counter(l_throttle_get, "get", "Gets");
+    b.add_u64_counter(l_throttle_get_sum, "get_sum", "Got data");
+    b.add_u64_counter(l_throttle_get_or_fail_fail, "get_or_fail_fail", "Get blocked during get_or_fail");
+    b.add_u64_counter(l_throttle_get_or_fail_success, "get_or_fail_success", "Successful get during get_or_fail");
+    b.add_u64_counter(l_throttle_take, "take", "Takes");
+    b.add_u64_counter(l_throttle_take_sum, "take_sum", "Taken data");
+    b.add_u64_counter(l_throttle_put, "put", "Puts");
+    b.add_u64_counter(l_throttle_put_sum, "put_sum", "Put data");
+    b.add_time_avg(l_throttle_wait, "wait", "Waiting latency");

     logger = b.create_perf_counters();
     cct->get_perfcounters_collection()->add(logger);
@@ -80,6 +80,8 @@ Throttle::~Throttle()
 void Throttle::_reset_max(int64_t m)
 {
   assert(lock.is_locked());
+  if ((int64_t)max.read() == m)
+    return;
   if (!cond.empty())
     cond.front()->SignalOne();
   if (logger)
@@ -124,7 +126,7 @@ bool Throttle::_wait(int64_t c)
 bool Throttle::wait(int64_t m)
 {
-  if (0 == max.read()) {
+  if (0 == max.read() && 0 == m) {
     return false;
   }
@@ -158,7 +160,7 @@ int64_t Throttle::take(int64_t c)
 bool Throttle::get(int64_t c, int64_t m)
 {
-  if (0 == max.read()) {
+  if (0 == max.read() && 0 == m) {
     return false;
   }
@@ -267,6 +269,12 @@ void SimpleThrottle::end_op(int r)
   m_cond.Signal();
 }

+bool SimpleThrottle::pending_error() const
+{
+  Mutex::Locker l(m_lock);
+  return (m_ret < 0);
+}
+
 int SimpleThrottle::wait_for_ret()
 {
   Mutex::Locker l(m_lock);
diff --git a/src/common/Throttle.h b/src/common/Throttle.h
index 6d039888c633e..4a3962beca3de 100644
--- a/src/common/Throttle.h
+++ b/src/common/Throttle.h
@@ -12,22 +12,30 @@ class CephContext;

 class PerfCounters;

+/**
+ * @class Throttle
+ * Throttles the maximum number of active requests.
+ *
+ * This class limits the number of slots that may be taken at any one time.
+ * Requests for additional slots are delayed until enough slots are put
+ * back, so that @p get_current() drops back below the limit as the pending
+ * requests are fulfilled.
+ */
 class Throttle {
   CephContext *cct;
-  std::string name;
+  const std::string name;
   PerfCounters *logger;
-  ceph::atomic_t count, max;
+  ceph::atomic_t count, max;
   Mutex lock;
   list<Cond*> cond;
-  bool use_perf;
-
+  const bool use_perf;
+
 public:
-  Throttle(CephContext *cct, std::string n, int64_t m = 0, bool _use_perf = true);
+  Throttle(CephContext *cct, const std::string& n, int64_t m = 0, bool _use_perf = true);
   ~Throttle();

 private:
   void _reset_max(int64_t m);
-  bool _should_wait(int64_t c) {
+  bool _should_wait(int64_t c) const {
     int64_t m = max.read();
     int64_t cur = count.read();
     return
@@ -39,22 +47,58 @@ class Throttle {
   bool _wait(int64_t c);

 public:
-  int64_t get_current() {
+  /**
+   * gets the number of currently taken slots
+   * @returns the number of taken slots
+   */
+  int64_t get_current() const {
     return count.read();
   }

-  int64_t get_max() { return max.read(); }
+  /**
+   * get the max number of slots
+   * @returns the max number of slots
+   */
+  int64_t get_max() const { return max.read(); }

+  /**
+   * set the new maximum number, and wait until the number of taken slots
+   * drains and drops below this limit.
+   *
+   * @param m the new max number
+   * @returns true if this method was blocked, false if it returned immediately
+   */
   bool wait(int64_t m = 0);
+
+  /**
+   * take the specified number of slots from the stock regardless of the
+   * throttling
+   * @param c number of slots to take
+   * @returns the total number of taken slots
+   */
   int64_t take(int64_t c = 1);
+
+  /**
+   * get the specified amount of slots from the stock, but will wait if the
+   * total number taken by the consumer would exceed the maximum number.
+   * @param c number of slots to get
+   * @param m new maximum number to set, ignored if it is 0
+   * @returns true if this request was blocked due to the throttling, false
+   * otherwise
+   */
   bool get(int64_t c = 1, int64_t m = 0);

   /**
-   * Returns true if it successfully got the requested amount,
+   * the non-blocking version of @p get()
+   * @returns true if it successfully got the requested amount,
    * or false if it would block.
    */
   bool get_or_fail(int64_t c = 1);
+
+  /**
+   * put slots back into the stock
+   * @param c number of slots to return
+   * @returns the number of slots still taken after this call
+   */
   int64_t put(int64_t c = 1);
 };
@@ -76,9 +120,10 @@ class SimpleThrottle {
   ~SimpleThrottle();
   void start_op();
   void end_op(int r);
+  bool pending_error() const;
   int wait_for_ret();
 private:
-  Mutex m_lock;
+  mutable Mutex m_lock;
   Cond m_cond;
   uint64_t m_max;
   uint64_t m_current;
diff --git a/src/common/TrackedOp.cc b/src/common/TrackedOp.cc
index 32dbc5398dcc2..f759894fb0480 100644
--- a/src/common/TrackedOp.cc
+++ b/src/common/TrackedOp.cc
@@ -152,12 +152,14 @@ void OpTracker::unregister_inflight_op(TrackedOp *i)
 bool OpTracker::check_ops_in_flight(std::vector<string> &warning_vector)
 {
+  if (!tracking_enabled)
+    return false;
+
   utime_t now = ceph_clock_now(cct);
   utime_t too_old = now;
   too_old -= complaint_time;
-  utime_t oldest_op;
+  utime_t oldest_op = now;
   uint64_t total_ops_in_flight = 0;
-  bool got_first_op = false;

   for (uint32_t i = 0; i < num_optracker_shards; i++) {
     ShardedTrackingData* sdata = sharded_in_flight_list[i];
@@ -165,10 +167,7 @@ bool OpTracker::check_ops_in_flight(std::vector<string> &warning_vector)
     Mutex::Locker locker(sdata->ops_in_flight_lock_sharded);
     if (!sdata->ops_in_flight_sharded.empty()) {
       utime_t oldest_op_tmp = sdata->ops_in_flight_sharded.front()->get_initiated();
-      if (!got_first_op) {
-        oldest_op = oldest_op_tmp;
-        got_first_op = true;
-      } else if (oldest_op_tmp < oldest_op) {
+      if (oldest_op_tmp < oldest_op) {
         oldest_op = oldest_op_tmp;
       }
     }
@@ -188,10 +187,13 @@ bool OpTracker::check_ops_in_flight(std::vector<string> &warning_vector)
     return false;

   warning_vector.reserve(log_threshold + 1);
+  // store summary message
+  warning_vector.push_back("");

   int slow = 0;     // total slow
   int warned = 0;   // total logged
-  for (uint32_t iter = 0; iter < num_optracker_shards; iter++) {
+  for (uint32_t iter = 0;
+       iter < num_optracker_shards && warned < log_threshold; iter++) {
     ShardedTrackingData* sdata = sharded_in_flight_list[iter];
     assert(NULL != sdata);
     Mutex::Locker locker(sdata->ops_in_flight_lock_sharded);
@@ -205,8 +207,6 @@ bool OpTracker::check_ops_in_flight(std::vector<string> &warning_vector)
       if (((*i)->get_initiated() + (complaint_time * (*i)->warn_interval_multiplier)) < now) {
         // will warn
-        if (warning_vector.empty())
-          warning_vector.push_back("");
         warned++;
         if (warned > log_threshold)
           break;
@@ -236,7 +236,7 @@ bool OpTracker::check_ops_in_flight(std::vector<string> &warning_vector)
     warning_vector[0] = ss.str();
   }

-  return warning_vector.size();
+  return warned;
 }

 void OpTracker::get_age_ms_histogram(pow2_hist_t *h)
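The check_ops_in_flight() rework above changes the caller contract: slot 0 of warning_vector always carries the summary line, and the return value is the number of warnings actually logged. A caller sketch under that contract; `tracker` and `clog` stand in for the real OpTracker instance and log channel:

    std::vector<std::string> warnings;
    if (tracker.check_ops_in_flight(warnings)) {
      // warnings[0] is the summary; subsequent entries are per-op
      // details, capped by the log threshold.
      for (std::vector<std::string>::const_iterator p = warnings.begin();
           p != warnings.end(); ++p)
        clog.warn() << *p;
    }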
@@ -294,12 +294,12 @@ void OpTracker::_mark_event(TrackedOp *op, const string &evt, } void OpTracker::RemoveOnDelete::operator()(TrackedOp *op) { - op->mark_event("done"); if (!tracker->tracking_enabled) { op->_unregistered(); delete op; return; } + op->mark_event("done"); tracker->unregister_inflight_op(op); // Do not delete op, unregister_inflight_op took control } diff --git a/src/common/TrackedOp.h b/src/common/TrackedOp.h index 6c28a89dce339..2f656cacb97e4 100644 --- a/src/common/TrackedOp.h +++ b/src/common/TrackedOp.h @@ -161,7 +161,8 @@ class TrackedOp { warn_interval_multiplier(1) { tracker->register_inflight_op(&xitem); - events.push_back(make_pair(initiated_at, "initiated")); + if (tracker->tracking_enabled) + events.push_back(make_pair(initiated_at, "initiated")); } /// output any type-specific data you want to get when dump() is called @@ -179,11 +180,12 @@ class TrackedOp { const utime_t& get_initiated() const { return initiated_at; } - // This function maybe needs some work; assumes last event is completion time + double get_duration() const { - return events.size() ? - (events.rbegin()->first - get_initiated()) : - 0.0; + if (!events.empty() && events.rbegin()->second.compare("done") == 0) + return events.rbegin()->first - get_initiated(); + else + return ceph_clock_now(NULL) - get_initiated(); } void mark_event(const string &event); diff --git a/src/common/WorkQueue.cc b/src/common/WorkQueue.cc index 5c7fefc434930..00666faca3343 100644 --- a/src/common/WorkQueue.cc +++ b/src/common/WorkQueue.cc @@ -142,8 +142,13 @@ void ThreadPool::worker(WorkThread *wt) } ldout(cct,20) << "worker waiting" << dendl; - cct->get_heartbeat_map()->reset_timeout(hb, 4, 0); - _cond.WaitInterval(cct, _lock, utime_t(2, 0)); + cct->get_heartbeat_map()->reset_timeout( + hb, + cct->_conf->threadpool_default_timeout, + 0); + _cond.WaitInterval(cct, _lock, + utime_t( + cct->_conf->threadpool_empty_queue_max_wait, 0)); } ldout(cct,1) << "worker finish" << dendl; @@ -301,8 +306,12 @@ void ShardedThreadPool::shardedthreadpool_worker(uint32_t thread_index) ++num_paused; wait_cond.Signal(); while(pause_threads.read()) { - cct->get_heartbeat_map()->reset_timeout(hb, 4, 0); - shardedpool_cond.WaitInterval(cct, shardedpool_lock, utime_t(2, 0)); + cct->get_heartbeat_map()->reset_timeout( + hb, + wq->timeout_interval, wq->suicide_interval); + shardedpool_cond.WaitInterval(cct, shardedpool_lock, + utime_t( + cct->_conf->threadpool_empty_queue_max_wait, 0)); } --num_paused; shardedpool_lock.Unlock(); @@ -313,14 +322,21 @@ void ShardedThreadPool::shardedthreadpool_worker(uint32_t thread_index) ++num_drained; wait_cond.Signal(); while (drain_threads.read()) { - cct->get_heartbeat_map()->reset_timeout(hb, 4, 0); - shardedpool_cond.WaitInterval(cct, shardedpool_lock, utime_t(2, 0)); + cct->get_heartbeat_map()->reset_timeout( + hb, + wq->timeout_interval, wq->suicide_interval); + shardedpool_cond.WaitInterval(cct, shardedpool_lock, + utime_t( + cct->_conf->threadpool_empty_queue_max_wait, 0)); } --num_drained; } shardedpool_lock.Unlock(); } + cct->get_heartbeat_map()->reset_timeout( + hb, + wq->timeout_interval, wq->suicide_interval); wq->_process(thread_index, hb); } diff --git a/src/common/WorkQueue.h b/src/common/WorkQueue.h index 0373d731a93fc..43db8416cbd73 100644 --- a/src/common/WorkQueue.h +++ b/src/common/WorkQueue.h @@ -23,6 +23,7 @@ class CephContext; +/// Pool of threads that share work submitted to multiple work queues. 
class ThreadPool : public md_config_obs_t { CephContext *cct; string name; @@ -54,6 +55,7 @@ class ThreadPool : public md_config_obs_t { }; private: + /// Basic interface to a work queue used by the worker threads. struct WorkQueue_ { string name; time_t timeout_interval, suicide_interval; @@ -61,10 +63,20 @@ class ThreadPool : public md_config_obs_t { : name(n), timeout_interval(ti), suicide_interval(sti) { } virtual ~WorkQueue_() {} + /// Remove all work items from the queue. virtual void _clear() = 0; + /// Check whether there is anything to do. virtual bool _empty() = 0; + /// Get the next work item to process. virtual void *_void_dequeue() = 0; + /** @brief Process the work item. + * This function will be called several times in parallel + * and must therefore be thread-safe. */ virtual void _void_process(void *item, TPHandle &handle) = 0; + /** @brief Synchronously finish processing a work item. + * This function is called after _void_process with the global thread pool lock held, + * so at most one copy will execute simultaneously for a given thread pool. + * It can be used for non-thread-safe finalization. */ virtual void _void_process_finish(void *) = 0; }; @@ -80,6 +92,9 @@ class ThreadPool : public md_config_obs_t { const std::set<std::string> &changed); public: + /** @brief Work queue that processes several submitted items at once. + * The queue will automatically add itself to the thread pool on construction + * and remove itself on destruction. */ template<class T> class BatchWorkQueue : public WorkQueue_ { ThreadPool *pool; @@ -87,12 +102,9 @@ class ThreadPool : public md_config_obs_t { virtual bool _enqueue(T *) = 0; virtual void _dequeue(T *) = 0; virtual void _dequeue(list<T> *) = 0; - virtual void _process(const list<T> &) { assert(0); } - virtual void _process(const list<T> &items, TPHandle &handle) { - _process(items); - } virtual void _process_finish(const list<T> &) {} + // virtual methods from WorkQueue_ below void *_void_dequeue() { list<T> *out(new list<T>); _dequeue(out); @@ -111,6 +123,12 @@ class ThreadPool : public md_config_obs_t { delete (list<T> *)p; } + protected: + virtual void _process(const list<T> &) { assert(0); } + virtual void _process(const list<T> &items, TPHandle &handle) { + _process(items); + } + public: BatchWorkQueue(string n, time_t ti, time_t sti, ThreadPool* p) : WorkQueue_(n, ti, sti), pool(p) { @@ -155,6 +173,12 @@ class ThreadPool : public md_config_obs_t { } }; + + /** @brief Templated by-value work queue. + * Skeleton implementation of a queue that processes items submitted by value. + * This is useful if the items are single primitive values or very small objects + * (a few bytes). The queue will automatically add itself to the thread pool on + * construction and remove itself on destruction. */ template<typename T, typename U = T> class WorkQueueVal : public WorkQueue_ { Mutex _lock; @@ -165,10 +189,6 @@ class ThreadPool : public md_config_obs_t { virtual void _enqueue_front(T) = 0; virtual bool _empty() = 0; virtual U _dequeue() = 0; - virtual void _process(U) { assert(0); } - virtual void _process(U u, TPHandle &) { - _process(u); - } virtual void _process_finish(U) {} void *_void_dequeue() { @@ -235,20 +255,30 @@ class ThreadPool : public md_config_obs_t { void unlock() { pool->unlock(); } + virtual void _process(U) { assert(0); } + virtual void _process(U u, TPHandle &) { + _process(u); + } }; + + /** @brief Templated by-pointer work queue. + * Skeleton implementation of a queue that processes items of a given type submitted as pointers.
+ * This is useful when the work items are large or include dynamically allocated memory. The queue + * will automatically add itself to the thread pool on construction and remove itself on + * destruction. */ template<class T> class WorkQueue : public WorkQueue_ { ThreadPool *pool; + /// Add a work item to the queue. virtual bool _enqueue(T *) = 0; + /// Dequeue a previously submitted work item. virtual void _dequeue(T *) = 0; + /// Dequeue a work item and return the original submitted pointer. virtual T *_dequeue() = 0; - virtual void _process(T *t) { assert(0); } - virtual void _process(T *t, TPHandle &) { - _process(t); - } virtual void _process_finish(T *) {} - + + // implementation of virtual methods from WorkQueue_ void *_void_dequeue() { return (void *)_dequeue(); } @@ -259,6 +289,13 @@ class ThreadPool : public md_config_obs_t { _process_finish(static_cast<T *>(p)); } + protected: + /// Process a work item. Called from the worker threads. + virtual void _process(T *t) { assert(0); } + virtual void _process(T *t, TPHandle &) { + _process(t); + } + public: WorkQueue(string n, time_t ti, time_t sti, ThreadPool* p) : WorkQueue_(n, ti, sti), pool(p) { pool->add_work_queue(this); @@ -330,7 +367,7 @@ class ThreadPool : public md_config_obs_t { public: ThreadPool(CephContext *cct_, string nm, int n, const char *option = NULL); - ~ThreadPool(); + virtual ~ThreadPool(); /// return number of threads currently running int get_num_threads() { @@ -387,7 +424,10 @@ class ThreadPool : public md_config_obs_t { void pause_new(); /// resume work in thread pool. must match each pause() call 1:1 to resume. void unpause(); - /// wait for all work to complete + /** @brief Wait until work completes. + * If the parameter is NULL, blocks until all threads are idle. + * If it is not NULL, blocks until the given work queue does not have + * any items left to process. */ void drain(WorkQueue_* wq = 0); /// set io priority @@ -417,6 +457,7 @@ class GenContextWQ : public ThreadPool::WorkQueueVal<GenContext<ThreadPool::TPHandle&>*> _queue.pop_front(); return c; } + using ThreadPool::WorkQueueVal<GenContext<ThreadPool::TPHandle&>*>::_process; void _process(GenContext<ThreadPool::TPHandle&> *c, ThreadPool::TPHandle &tp) { c->complete(tp); } @@ -433,6 +474,41 @@ class C_QueueInWQ : public Context { } }; +/// Work queue that asynchronously completes contexts (executes callbacks).
+/// @see Finisher +class ContextWQ : public ThreadPool::WorkQueueVal<std::pair<Context *, int> > { +public: + ContextWQ(const string &name, time_t ti, ThreadPool *tp) + : ThreadPool::WorkQueueVal<std::pair<Context *, int> >(name, ti, 0, tp) {} + + void queue(Context *ctx, int result = 0) { + ThreadPool::WorkQueueVal<std::pair<Context *, int> >::queue( + std::make_pair(ctx, result)); + } + +protected: + virtual void _enqueue(std::pair<Context *, int> item) { + _queue.push_back(item); + } + virtual void _enqueue_front(std::pair<Context *, int> item) { + _queue.push_front(item); + } + virtual bool _empty() { + return _queue.empty(); + } + virtual std::pair<Context *, int> _dequeue() { + std::pair<Context *, int> item = _queue.front(); + _queue.pop_front(); + return item; + } + virtual void _process(std::pair<Context *, int> item) { + item.first->complete(item.second); + } + using ThreadPool::WorkQueueVal<std::pair<Context *, int> >::_process; +private: + list<std::pair<Context *, int> > _queue; +}; + class ShardedThreadPool { CephContext *cct; diff --git a/src/common/address_helper.cc b/src/common/address_helper.cc new file mode 100644 index 0000000000000..1bde1accd7796 --- /dev/null +++ b/src/common/address_helper.cc @@ -0,0 +1,99 @@ +/* + * address_helper.cc + * + * Created on: Oct 27, 2013 + * Author: matt + */ + +#include +#include +#include + +#include +#include + +using namespace std; + +#include "common/config.h" +#include "boost/regex.hpp" + +#include "common/address_helper.h" + +#include + +// decode strings like "tcp://<addr>:<port>" +int entity_addr_from_url(entity_addr_t *addr /* out */, const char *url) +{ + using namespace boost; + using std::endl; + + struct addrinfo hints; + struct addrinfo *res; + + regex expr("(tcp|rdma)://([^:]*):([\\d]+)"); + cmatch m; + + if (regex_match(url, m, expr)) { + int error; + string host(m[2].first, m[2].second); + string port(m[3].first, m[3].second); + memset(&hints, 0, sizeof(hints)); + hints.ai_family = PF_UNSPEC; + error = getaddrinfo(host.c_str(), NULL, &hints, &res); + if (! error) { + struct sockaddr_in *sin; + struct sockaddr_in6 *sin6; + addr->addr.ss_family = res->ai_family; + switch(res->ai_family) { + case AF_INET: + sin = (struct sockaddr_in *) res->ai_addr; + memcpy(&addr->addr4.sin_addr, &sin->sin_addr, + sizeof(sin->sin_addr)); + break; + case AF_INET6: + sin6 = (struct sockaddr_in6 *) res->ai_addr; + memcpy(&addr->addr6.sin6_addr, &sin6->sin6_addr, + sizeof(sin6->sin6_addr)); + break; + default: + break; + }; + addr->set_port(std::atoi(port.c_str())); + return 0; + } + } + + return 1; +} + +int entity_addr_from_sockaddr(entity_addr_t *addr /* out */, + const struct sockaddr *sa) +{ + struct sockaddr_in *sin; + struct sockaddr_in6 *sin6; + + if (!
sa) + return 0; + + addr->addr.ss_family = sa->sa_family; + switch(sa->sa_family) { + case AF_INET: + sin = (struct sockaddr_in *) sa; + memcpy(&addr->addr4.sin_addr, &sin->sin_addr, + sizeof(sin->sin_addr)); + addr->addr4.sin_port = sin->sin_port; + break; + case AF_INET6: + sin6 = (struct sockaddr_in6 *) sa; + memcpy(&addr->addr6.sin6_addr, &sin6->sin6_addr, + sizeof(sin6->sin6_addr)); + addr->addr6.sin6_port = sin6->sin6_port; + break; + default: + break; + }; + + return 1; +} + + diff --git a/src/common/address_helper.h b/src/common/address_helper.h new file mode 100644 index 0000000000000..cdb60612c7f87 --- /dev/null +++ b/src/common/address_helper.h @@ -0,0 +1,24 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef ADDRESS_HELPER_H_ +#define ADDRESS_HELPER_H_ + +#include "msg/msg_types.h" + +int entity_addr_from_url(entity_addr_t *addr /* out */, const char *url); +int entity_addr_from_sockaddr(entity_addr_t *addr /* out */, + const struct sockaddr *sa); + +#endif /* ADDRESS_HELPER_H_ */ diff --git a/src/common/admin_socket.cc b/src/common/admin_socket.cc index 24d165671c484..1df5105451373 100644 --- a/src/common/admin_socket.cc +++ b/src/common/admin_socket.cc @@ -149,6 +149,33 @@ std::string AdminSocket::create_shutdown_pipe(int *pipe_rd, int *pipe_wr) return ""; } +std::string AdminSocket::destroy_shutdown_pipe() +{ + // Send a byte to the shutdown pipe that the thread is listening to + char buf[1] = { 0x0 }; + int ret = safe_write(m_shutdown_wr_fd, buf, sizeof(buf)); + + // Close write end + VOID_TEMP_FAILURE_RETRY(close(m_shutdown_wr_fd)); + m_shutdown_wr_fd = -1; + + if (ret != 0) { + ostringstream oss; + oss << "AdminSocket::destroy_shutdown_pipe error: failed to write " + "to thread shutdown pipe: error " << ret; + return oss.str(); + } + + join(); + + // Close read end. Doing this before join() would block the listener and + // prevent joining. + VOID_TEMP_FAILURE_RETRY(close(m_shutdown_rd_fd)); + m_shutdown_rd_fd = -1; + + return ""; +} + std::string AdminSocket::bind_and_listen(const std::string &sock_path, int *fd) { ldout(m_cct, 5) << "bind_and_listen " << sock_path << dendl; @@ -325,7 +352,7 @@ bool AdminSocket::do_accept() stringstream errss; cmdvec.push_back(cmd); if (!cmdmap_from_json(cmdvec, &cmdmap, errss)) { - ldout(m_cct, 0) << "AdminSocket: " << errss << dendl; + ldout(m_cct, 0) << "AdminSocket: " << errss.rdbuf() << dendl; return false; } cmd_getval(m_cct, cmdmap, "format", format); @@ -448,9 +475,7 @@ class HelpHook : public AdminSocketHook { public: HelpHook(AdminSocket *as) : m_as(as) {} bool call(string command, cmdmap_t &cmdmap, string format, bufferlist& out) { - Formatter *f = new_formatter(format); - if (!f) - f = new_formatter("json-pretty"); + Formatter *f = Formatter::create(format, "json-pretty", "json-pretty"); f->open_object_section("help"); for (map<string,string>::iterator p = m_as->m_help.begin(); p != m_as->m_help.end(); @@ -538,30 +563,31 @@ bool AdminSocket::init(const std::string &path) void AdminSocket::shutdown() { + std::string err; + + // Under normal operation this is unlikely to occur.
However for some unit + // tests, some object members are not initialized and so cannot be deleted + // without fault. if (m_shutdown_wr_fd < 0) return; ldout(m_cct, 5) << "shutdown" << dendl; - // Send a byte to the shutdown pipe that the thread is listening to - char buf[1] = { 0x0 }; - int ret = safe_write(m_shutdown_wr_fd, buf, sizeof(buf)); - VOID_TEMP_FAILURE_RETRY(close(m_shutdown_wr_fd)); - m_shutdown_wr_fd = -1; - - if (ret == 0) { - join(); - } else { - lderr(m_cct) << "AdminSocket::shutdown: failed to write " - "to thread shutdown pipe: error " << ret << dendl; + err = destroy_shutdown_pipe(); + if (!err.empty()) { + lderr(m_cct) << "AdminSocket::shutdown: error: " << err << dendl; } + VOID_TEMP_FAILURE_RETRY(close(m_sock_fd)); + unregister_command("version"); unregister_command("git_version"); unregister_command("0"); delete m_version_hook; + unregister_command("help"); delete m_help_hook; + unregister_command("get_command_descriptions"); delete m_getdescs_hook; diff --git a/src/common/admin_socket.h b/src/common/admin_socket.h index 20e3f9b85f66f..bbbaa2933213b 100644 --- a/src/common/admin_socket.h +++ b/src/common/admin_socket.h @@ -79,6 +79,7 @@ class AdminSocket : public Thread void shutdown(); std::string create_shutdown_pipe(int *pipe_rd, int *pipe_wr); + std::string destroy_shutdown_pipe(); std::string bind_and_listen(const std::string &sock_path, int *fd); void *entry(); diff --git a/src/common/bit_vector.hpp b/src/common/bit_vector.hpp new file mode 100644 index 0000000000000..55403c5d35cdb --- /dev/null +++ b/src/common/bit_vector.hpp @@ -0,0 +1,423 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 Red Hat + * + * LGPL2.1 (see COPYING-LGPL2.1) or later + */ + +#ifndef BIT_VECTOR_HPP +#define BIT_VECTOR_HPP + +#include "common/Formatter.h" +#include "include/assert.h" +#include "include/buffer.h" +#include "include/encoding.h" +#include +#include +#include +#include +#include + +namespace ceph { + +template <uint8_t _bit_count> +class BitVector +{ +private: + static const uint8_t BITS_PER_BYTE = 8; + static const uint32_t ELEMENTS_PER_BLOCK = BITS_PER_BYTE / _bit_count; + static const uint8_t MASK = static_cast<uint8_t>((1 << _bit_count) - 1); + + // must be power of 2 + BOOST_STATIC_ASSERT((_bit_count != 0) && !(_bit_count & (_bit_count - 1))); + BOOST_STATIC_ASSERT(_bit_count <= BITS_PER_BYTE); +public: + + class ConstReference { + public: + operator uint8_t() const; + private: + friend class BitVector; + const BitVector &m_bit_vector; + uint64_t m_offset; + + ConstReference(const BitVector &bit_vector, uint64_t offset); + }; + + class Reference { + public: + operator uint8_t() const; + Reference& operator=(uint8_t v); + private: + friend class BitVector; + BitVector &m_bit_vector; + uint64_t m_offset; + + Reference(BitVector &bit_vector, uint64_t offset); + }; + + static const uint8_t BIT_COUNT = _bit_count; + + BitVector(); + + void set_crc_enabled(bool enabled) { + m_crc_enabled = enabled; + } + void clear(); + + void resize(uint64_t elements); + uint64_t size() const; + + const bufferlist& get_data() const; + + Reference operator[](uint64_t offset); + ConstReference operator[](uint64_t offset) const; + + void encode_header(bufferlist& bl) const; + void decode_header(bufferlist::iterator& it); + uint64_t get_header_length() const; + + void encode_data(bufferlist& bl, uint64_t byte_offset, + uint64_t byte_length) const; + void decode_data(bufferlist::iterator& it,
uint64_t byte_offset); + void get_data_extents(uint64_t offset, uint64_t length, + uint64_t *byte_offset, uint64_t *byte_length) const; + + void encode_footer(bufferlist& bl) const; + void decode_footer(bufferlist::iterator& it); + uint64_t get_footer_offset() const; + + void encode(bufferlist& bl) const; + void decode(bufferlist::iterator& it); + void dump(Formatter *f) const; + + bool operator==(const BitVector &b) const; + + static void generate_test_instances(std::list<BitVector*> &o); +private: + + bufferlist m_data; + uint64_t m_size; + bool m_crc_enabled; + + mutable __u32 m_header_crc; + mutable std::vector<__u32> m_data_crcs; + + static void compute_index(uint64_t offset, uint64_t *index, uint64_t *shift); + +}; + +template <uint8_t _b> +BitVector<_b>::BitVector() : m_size(0), m_crc_enabled(true) +{ +} + +template <uint8_t _b> +void BitVector<_b>::clear() { + m_data.clear(); + m_data_crcs.clear(); + m_size = 0; + m_header_crc = 0; +} + +template <uint8_t _b> +void BitVector<_b>::resize(uint64_t size) { + uint64_t buffer_size = (size + ELEMENTS_PER_BLOCK - 1) / ELEMENTS_PER_BLOCK; + if (buffer_size > m_data.length()) { + m_data.append_zero(buffer_size - m_data.length()); + } else if (buffer_size < m_data.length()) { + bufferlist bl; + bl.substr_of(m_data, 0, buffer_size); + bl.swap(m_data); + } + m_size = size; + + uint64_t block_count = (buffer_size + CEPH_PAGE_SIZE - 1) / CEPH_PAGE_SIZE; + m_data_crcs.resize(block_count); +} + +template <uint8_t _b> +uint64_t BitVector<_b>::size() const { + return m_size; +} + +template <uint8_t _b> +const bufferlist& BitVector<_b>::get_data() const { + return m_data; +} + +template <uint8_t _b> +void BitVector<_b>::compute_index(uint64_t offset, uint64_t *index, uint64_t *shift) { + *index = offset / ELEMENTS_PER_BLOCK; + *shift = ((ELEMENTS_PER_BLOCK - 1) - (offset % ELEMENTS_PER_BLOCK)) * _b; +} + +template <uint8_t _b> +void BitVector<_b>::encode_header(bufferlist& bl) const { + bufferlist header_bl; + ENCODE_START(1, 1, header_bl); + ::encode(m_size, header_bl); + ENCODE_FINISH(header_bl); + m_header_crc = header_bl.crc32c(0); + + ::encode(header_bl, bl); +} + +template <uint8_t _b> +void BitVector<_b>::decode_header(bufferlist::iterator& it) { + bufferlist header_bl; + ::decode(header_bl, it); + + bufferlist::iterator header_it = header_bl.begin(); + uint64_t size; + DECODE_START(1, header_it); + ::decode(size, header_it); + DECODE_FINISH(header_it); + + resize(size); + m_header_crc = header_bl.crc32c(0); +} + +template <uint8_t _b> +uint64_t BitVector<_b>::get_header_length() const { + // 4 byte bl length, 6 byte encoding header, 8 byte size + return 18; +} + +template <uint8_t _b> +void BitVector<_b>::encode_data(bufferlist& bl, uint64_t byte_offset, + uint64_t byte_length) const { + assert(byte_offset % CEPH_PAGE_SIZE == 0); + assert(byte_offset + byte_length == m_data.length() || + byte_length % CEPH_PAGE_SIZE == 0); + + uint64_t end_offset = byte_offset + byte_length; + while (byte_offset < end_offset) { + uint64_t len = MIN(CEPH_PAGE_SIZE, end_offset - byte_offset); + + bufferlist bit; + bit.substr_of(m_data, byte_offset, len); + m_data_crcs[byte_offset / CEPH_PAGE_SIZE] = bit.crc32c(0); + + bl.claim_append(bit); + byte_offset += CEPH_PAGE_SIZE; + } +} + +template <uint8_t _b> +void BitVector<_b>::decode_data(bufferlist::iterator& it, uint64_t byte_offset) { + assert(byte_offset % CEPH_PAGE_SIZE == 0); + if (it.end()) { + return; + } + + uint64_t end_offset = byte_offset + it.get_remaining(); + if (end_offset > m_data.length()) { + throw buffer::end_of_buffer(); + } + + bufferlist data; + if (byte_offset > 0) { + data.substr_of(m_data, 0, byte_offset); + } + + while (byte_offset <
end_offset) { + uint64_t len = MIN(CEPH_PAGE_SIZE, end_offset - byte_offset); + + bufferlist bit; + it.copy(len, bit); + if (m_crc_enabled && + m_data_crcs[byte_offset / CEPH_PAGE_SIZE] != bit.crc32c(0)) { + throw buffer::malformed_input("invalid data block CRC"); + } + data.append(bit); + byte_offset += bit.length(); + } + + if (m_data.length() > end_offset) { + bufferlist tail; + tail.substr_of(m_data, end_offset, m_data.length() - end_offset); + data.append(tail); + } + assert(data.length() == m_data.length()); + data.swap(m_data); +} + +template <uint8_t _b> +void BitVector<_b>::get_data_extents(uint64_t offset, uint64_t length, + uint64_t *byte_offset, + uint64_t *byte_length) const { + // read CEPH_PAGE_SIZE-aligned chunks + assert(length > 0 && offset + length <= m_size); + uint64_t shift; + compute_index(offset, byte_offset, &shift); + *byte_offset -= (*byte_offset % CEPH_PAGE_SIZE); + + uint64_t end_offset; + compute_index(offset + length - 1, &end_offset, &shift); + end_offset += (CEPH_PAGE_SIZE - (end_offset % CEPH_PAGE_SIZE)); + assert(*byte_offset <= end_offset); + + *byte_length = MIN(end_offset - *byte_offset, m_data.length()); +} + +template <uint8_t _b> +void BitVector<_b>::encode_footer(bufferlist& bl) const { + bufferlist footer_bl; + if (m_crc_enabled) { + ::encode(m_header_crc, footer_bl); + ::encode(m_data_crcs, footer_bl); + } + ::encode(footer_bl, bl); +} + +template <uint8_t _b> +void BitVector<_b>::decode_footer(bufferlist::iterator& it) { + bufferlist footer_bl; + ::decode(footer_bl, it); + + m_crc_enabled = (footer_bl.length() > 0); + if (m_crc_enabled) { + bufferlist::iterator footer_it = footer_bl.begin(); + + __u32 header_crc; + ::decode(header_crc, footer_it); + if (m_header_crc != header_crc) { + throw buffer::malformed_input("incorrect header CRC"); + } + + uint64_t block_count = (m_data.length() + CEPH_PAGE_SIZE - 1) / CEPH_PAGE_SIZE; + ::decode(m_data_crcs, footer_it); + if (m_data_crcs.size() != block_count) { + throw buffer::malformed_input("invalid data block CRCs"); + } + } +} + +template <uint8_t _b> +uint64_t BitVector<_b>::get_footer_offset() const { + return get_header_length() + m_data.length(); +} + +template <uint8_t _b> +void BitVector<_b>::encode(bufferlist& bl) const { + encode_header(bl); + encode_data(bl, 0, m_data.length()); + encode_footer(bl); +} + +template <uint8_t _b> +void BitVector<_b>::decode(bufferlist::iterator& it) { + decode_header(it); + + bufferlist data_bl; + if (m_data.length() > 0) { + it.copy(m_data.length(), data_bl); + } + + decode_footer(it); + + bufferlist::iterator data_it = data_bl.begin(); + decode_data(data_it, 0); +} + +template <uint8_t _b> +void BitVector<_b>::dump(Formatter *f) const { + f->dump_unsigned("size", m_size); + f->open_array_section("bit_table"); + for (unsigned i = 0; i < m_data.length(); ++i) { + f->dump_format("byte", "0x%02hhX", m_data[i]); + } + f->close_section(); +} + +template <uint8_t _b> +bool BitVector<_b>::operator==(const BitVector &b) const { + return (this->m_size == b.m_size && this->m_data == b.m_data); +} + +template <uint8_t _b> +typename BitVector<_b>::Reference BitVector<_b>::operator[](uint64_t offset) { + return Reference(*this, offset); +} + +template <uint8_t _b> +typename BitVector<_b>::ConstReference BitVector<_b>::operator[](uint64_t offset) const { + return ConstReference(*this, offset); +} + +template <uint8_t _b> +BitVector<_b>::ConstReference::ConstReference(const BitVector<_b> &bit_vector, + uint64_t offset) + : m_bit_vector(bit_vector), m_offset(offset) +{ +} + +template <uint8_t _b> +BitVector<_b>::ConstReference::operator uint8_t() const { + uint64_t index; + uint64_t shift; + 
this->m_bit_vector.compute_index(this->m_offset, &index, &shift); + + return (this->m_bit_vector.m_data[index] >> shift) & MASK; +} + +template <uint8_t _b> +BitVector<_b>::Reference::Reference(BitVector<_b> &bit_vector, uint64_t offset) + : m_bit_vector(bit_vector), m_offset(offset) +{ +} + +template <uint8_t _b> +BitVector<_b>::Reference::operator uint8_t() const { + uint64_t index; + uint64_t shift; + this->m_bit_vector.compute_index(this->m_offset, &index, &shift); + + return (this->m_bit_vector.m_data[index] >> shift) & MASK; +} + +template <uint8_t _b> +typename BitVector<_b>::Reference& BitVector<_b>::Reference::operator=(uint8_t v) { + uint64_t index; + uint64_t shift; + this->m_bit_vector.compute_index(this->m_offset, &index, &shift); + + uint8_t mask = MASK << shift; + char packed_value = (this->m_bit_vector.m_data[index] & ~mask) | + ((v << shift) & mask); + this->m_bit_vector.m_data.copy_in(index, 1, &packed_value); + return *this; +} + +template <uint8_t _b> +void BitVector<_b>::generate_test_instances(std::list<BitVector*> &o) { + o.push_back(new BitVector()); + + BitVector *b = new BitVector(); + const uint64_t radix = 1 << b->BIT_COUNT; + const uint64_t size = 1024; + + b->resize(size); + for (uint64_t i = 0; i < size; ++i) { + (*b)[i] = rand() % radix; + } + o.push_back(b); +} + +} + +WRITE_CLASS_ENCODER(ceph::BitVector<2>) + +template <uint8_t _b> +inline std::ostream& operator<<(std::ostream& out, const ceph::BitVector<_b> &b) +{ + out << "ceph::BitVector<" << _b << ">(size=" << b.size() << ", data=" + << b.get_data() << ")"; + return out; +} + +#endif // BIT_VECTOR_HPP diff --git a/src/common/blkdev.cc b/src/common/blkdev.cc index 4846463962e77..eb152f83af671 100644 --- a/src/common/blkdev.cc +++ b/src/common/blkdev.cc @@ -1,13 +1,41 @@ +/* + * Ceph - scalable distributed file system + * + * Copyright (c) 2015 Hewlett-Packard Development Company, L.P. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING.
+ * + */ #include #include #include +#include #include #include #include +#include +#include #include "include/int_types.h" #ifdef __linux__ #include +#include "include/uuid.h" +#include + +#define UUID_LEN 36 + +static const char *sandbox_dir = ""; + +void set_block_device_sandbox_dir(const char *dir) +{ + if (dir) + sandbox_dir = dir; + else + sandbox_dir = ""; +} int get_block_device_size(int fd, int64_t *psize) { @@ -25,40 +53,122 @@ int get_block_device_size(int fd, int64_t *psize) return ret; } -bool block_device_support_discard(const char *devname) +/** + * get the base device (strip off partition suffix and /dev/ prefix) + * e.g., + * /dev/sda3 -> sda + * /dev/cciss/c0d1p2 -> cciss/c0d1 + */ +int get_block_device_base(const char *dev, char *out, size_t out_len) { - bool can_trim = false; - char *p = strstr((char *)devname, "sd"); - char name[32]; + struct stat st; + int r = 0; + char buf[PATH_MAX*2]; + struct dirent *de; + DIR *dir; + char devname[PATH_MAX], fn[PATH_MAX]; + char *p; + + if (strncmp(dev, "/dev/", 5) != 0) + return -EINVAL; + + strncpy(devname, dev + 5, PATH_MAX-1); + devname[PATH_MAX-1] = '\0'; + for (p = devname; *p; ++p) + if (*p == '/') + *p = '!'; + + snprintf(fn, sizeof(fn), "%s/sys/block/%s", sandbox_dir, devname); + if (stat(fn, &st) == 0) { + if (strlen(devname) + 1 > out_len) { + return -ERANGE; + } + strncpy(out, devname, out_len); + return 0; + } - strncpy(name, p, sizeof(name) - 1); - name[sizeof(name) - 1] = '\0'; + snprintf(fn, sizeof(fn), "%s/sys/block", sandbox_dir); + dir = opendir(fn); + if (!dir) + return -errno; - for (unsigned int i = 0; i < strlen(name); i++) { - if(isdigit(name[i])) { - name[i] = 0; + while (!::readdir_r(dir, reinterpret_cast<struct dirent*>(buf), &de)) { + if (!de) { + if (errno) { + r = -errno; + goto out; + } break; } + if (de->d_name[0] == '.') + continue; + snprintf(fn, sizeof(fn), "%s/sys/block/%s/%s", sandbox_dir, de->d_name, + devname); + + if (stat(fn, &st) == 0) { + // match!
+ if (strlen(de->d_name) + 1 > out_len) { + r = -ERANGE; + goto out; + } + strncpy(out, de->d_name, out_len); + r = 0; + goto out; + } } + r = -ENOENT; + + out: + closedir(dir); + return r; +} + +/** + * get a block device property + * + * return the value (we assume it is positive) + * return negative error on error + */ +int64_t get_block_device_int_property(const char *devname, const char *property) +{ + char basename[PATH_MAX], filename[PATH_MAX]; + int64_t r; + + r = get_block_device_base(devname, basename, sizeof(basename)); + if (r < 0) + return r; - char filename[100] = {0}; - sprintf(filename, "/sys/block/%s/queue/discard_granularity", name); + snprintf(filename, sizeof(filename), + "%s/sys/block/%s/queue/%s", sandbox_dir, basename, property); FILE *fp = fopen(filename, "r"); if (fp == NULL) { - can_trim = false; + return -errno; + } + + char buff[256] = {0}; + if (fgets(buff, sizeof(buff) - 1, fp)) { + // strip newline etc + for (char *p = buff; *p; ++p) { + if (!isdigit(*p)) { + *p = 0; + break; + } + } + char *endptr = 0; + r = strtoll(buff, &endptr, 10); + if (endptr != buff + strlen(buff)) + r = -EINVAL; } else { - char buff[256] = {0}; - if (fgets(buff, sizeof(buff) - 1, fp)) { - if (strcmp(buff, "0")) - can_trim = false; - else - can_trim = true; - } else - can_trim = false; - fclose(fp); + r = 0; } - return can_trim; + fclose(fp); + return r; +} + +bool block_device_support_discard(const char *devname) +{ + return get_block_device_int_property(devname, "discard_granularity") > 0; } int block_device_discard(int fd, int64_t offset, int64_t len) @@ -67,6 +177,44 @@ int block_device_discard(int fd, int64_t offset, int64_t len) return ioctl(fd, BLKDISCARD, range); } +int get_device_by_uuid(uuid_d dev_uuid, const char* label, char* partition, + char* device) +{ + char uuid_str[UUID_LEN+1]; + char basename[PATH_MAX]; + const char* temp_partition_ptr = NULL; + blkid_cache cache = NULL; + blkid_dev dev = NULL; + int rc = 0; + + uuid_unparse((const unsigned char*)&dev_uuid.uuid, uuid_str); + + if (blkid_get_cache(&cache, NULL) >= 0) + dev = blkid_find_dev_with_tag(cache, label, (const char*)uuid_str); + else + rc = -EINVAL; + + if (dev) { + temp_partition_ptr = blkid_dev_devname(dev); + strncpy(partition, temp_partition_ptr, PATH_MAX); + rc = get_block_device_base(partition, basename, + sizeof(basename)); + if (rc >= 0) { + strncpy(device, basename, sizeof(basename)); + rc = 0; + } else { + rc = -ENODEV; + } + } else { + rc = -EINVAL; + } + + /* From what I can tell, blkid_put_cache cleans up dev, which + * appears to be a pointer into cache, as well */ + if (cache) + blkid_put_cache(cache); + return rc; +} #elif defined(__APPLE__) #include @@ -94,6 +242,12 @@ int block_device_discard(int fd, int64_t offset, int64_t len) { return -EOPNOTSUPP; } + +int get_device_by_uuid(uuid_d dev_uuid, const char* label, char* partition, + char* device) +{ + return -EOPNOTSUPP; +} #elif defined(__FreeBSD__) #include @@ -114,6 +268,12 @@ int block_device_discard(int fd, int64_t offset, int64_t len) { return -EOPNOTSUPP; } + +int get_device_by_uuid(uuid_d dev_uuid, const char* label, char* partition, + char* device) +{ + return -EOPNOTSUPP; +} #else # error "Unable to query block device size: unsupported platform, please report."
#endif diff --git a/src/common/blkdev.h b/src/common/blkdev.h index 5606205cc3277..697e8a848eafd 100644 --- a/src/common/blkdev.h +++ b/src/common/blkdev.h @@ -1,7 +1,14 @@ #ifndef __CEPH_COMMON_BLKDEV_H #define __CEPH_COMMON_BLKDEV_H +/* for testing purposes */ +extern void set_block_device_sandbox_dir(const char *dir); + +extern int get_block_device_base(const char *dev, char *out, size_t out_len); extern int get_block_device_size(int fd, int64_t *psize); +extern int64_t get_block_device_int_property(const char *devname, const char *property); extern bool block_device_support_discard(const char *devname); extern int block_device_discard(int fd, int64_t offset, int64_t len); +extern int get_device_by_uuid(uuid_d dev_uuid, const char* label, + char* partition, char* device); #endif diff --git a/src/common/buffer.cc b/src/common/buffer.cc index 9a1ac5fee69cb..fabaa89e93c95 100644 --- a/src/common/buffer.cc +++ b/src/common/buffer.cc @@ -19,10 +19,14 @@ #include "common/safe_io.h" #include "common/simple_spin.h" #include "common/strtol.h" +#include "common/likely.h" #include "include/atomic.h" -#include "common/Mutex.h" +#include "common/RWLock.h" #include "include/types.h" #include "include/compat.h" +#if defined(HAVE_XIO) +#include "msg/xio/XioMsg.h" +#endif #include #include @@ -30,6 +34,7 @@ #include #include +#include namespace ceph { #ifdef BUFFER_DEBUG @@ -41,8 +46,8 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER; # define bendl std::endl; } #endif - atomic_t buffer_total_alloc; - bool buffer_track_alloc = get_env_bool("CEPH_BUFFER_TRACK"); + static atomic_t buffer_total_alloc; + const bool buffer_track_alloc = get_env_bool("CEPH_BUFFER_TRACK"); void buffer::inc_total_alloc(unsigned len) { if (buffer_track_alloc) @@ -56,9 +61,9 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER; return buffer_total_alloc.read(); } - atomic_t buffer_cached_crc; - atomic_t buffer_cached_crc_adjusted; - bool buffer_track_crc = get_env_bool("CEPH_BUFFER_TRACK"); + static atomic_t buffer_cached_crc; + static atomic_t buffer_cached_crc_adjusted; + static bool buffer_track_crc = get_env_bool("CEPH_BUFFER_TRACK"); void buffer::track_cached_crc(bool b) { buffer_track_crc = b; @@ -70,8 +75,8 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER; return buffer_cached_crc_adjusted.read(); } - atomic_t buffer_c_str_accesses; - bool buffer_track_c_str = get_env_bool("CEPH_BUFFER_TRACK"); + static atomic_t buffer_c_str_accesses; + static bool buffer_track_c_str = get_env_bool("CEPH_BUFFER_TRACK"); void buffer::track_c_str(bool b) { buffer_track_c_str = b; @@ -80,7 +85,7 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER; return buffer_c_str_accesses.read(); } - atomic_t buffer_max_pipe_size; + static atomic_t buffer_max_pipe_size; int update_max_pipe_size() { #ifdef CEPH_HAVE_SETPIPE_SZ char buf[32]; @@ -123,16 +128,16 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER; unsigned len; atomic_t nref; - mutable Mutex crc_lock; + mutable RWLock crc_lock; map<pair<size_t, size_t>, pair<uint32_t, uint32_t> > crc_map; raw(unsigned l) : data(NULL), len(l), nref(0), - crc_lock("buffer::raw::crc_lock", false, false) + crc_lock("buffer::raw::crc_lock", false) { } raw(char *c, unsigned l) : data(c), len(l), nref(0), - crc_lock("buffer::raw::crc_lock", false, false) + crc_lock("buffer::raw::crc_lock", false) { } virtual ~raw() {} @@ -161,24 +166,40 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER; bool
is_n_page_sized() { return (len & ~CEPH_PAGE_MASK) == 0; } + virtual bool is_shareable() { + // true if safe to reference/share the existing buffer copy + // false if it is not safe to share the buffer, e.g., due to special + // and/or registered memory that is scarce + return true; + } bool get_crc(const pair<size_t, size_t> &fromto, - pair<uint32_t, uint32_t> *crc) const { - Mutex::Locker l(crc_lock); + pair<uint32_t, uint32_t> *crc) const { + crc_lock.get_read(); map<pair<size_t, size_t>, pair<uint32_t, uint32_t> >::const_iterator i = - crc_map.find(fromto); - if (i == crc_map.end()) - return false; + crc_map.find(fromto); + if (i == crc_map.end()) { + crc_lock.unlock(); + return false; + } *crc = i->second; + crc_lock.unlock(); return true; } void set_crc(const pair<size_t, size_t> &fromto, - const pair<uint32_t, uint32_t> &crc) { - Mutex::Locker l(crc_lock); + const pair<uint32_t, uint32_t> &crc) { + crc_lock.get_write(); crc_map[fromto] = crc; + crc_lock.unlock(); } void invalidate_crc() { - Mutex::Locker l(crc_lock); - crc_map.clear(); + // don't own the write lock when map is empty + crc_lock.get_read(); + if (crc_map.size() != 0) { + crc_lock.unlock(); + crc_lock.get_write(); + crc_map.clear(); + } + crc_lock.unlock(); } }; @@ -492,6 +513,27 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER; } }; + class buffer::raw_unshareable : public buffer::raw { + public: + raw_unshareable(unsigned l) : raw(l) { + if (len) + data = new char[len]; + else + data = 0; + } + raw_unshareable(unsigned l, char *b) : raw(b, l) { + } + raw* clone_empty() { + return new raw_char(len); + } + bool is_shareable() { + return false; // !shareable, will force make_shareable() + } + ~raw_unshareable() { + delete[] data; + } + }; + class buffer::raw_static : public buffer::raw { public: raw_static(const char *d, unsigned l) : raw((char*)d, l) { } @@ -501,6 +543,59 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER; } }; +#if defined(HAVE_XIO) + class buffer::xio_msg_buffer : public buffer::raw { + private: + XioDispatchHook* m_hook; + public: + xio_msg_buffer(XioDispatchHook* _m_hook, const char *d, + unsigned l) : + raw((char*)d, l), m_hook(_m_hook->get()) {} + + bool is_shareable() { return false; } + static void operator delete(void *p) + { + xio_msg_buffer *buf = static_cast<xio_msg_buffer*>(p); + // return hook ref (counts against pool); it appears illegal + // to do this in our dtor, because this fires after that + buf->m_hook->put(); + } + raw* clone_empty() { + return new buffer::raw_char(len); + } + }; + + class buffer::xio_mempool : public buffer::raw { + public: + struct xio_reg_mem *mp; + xio_mempool(struct xio_reg_mem *_mp, unsigned l) : + raw((char*)_mp->addr, l), mp(_mp) + { } + ~xio_mempool() {} + raw* clone_empty() { + return new buffer::raw_char(len); + } + }; + + struct xio_reg_mem* get_xio_mp(const buffer::ptr& bp) + { + buffer::xio_mempool *mb = dynamic_cast<buffer::xio_mempool*>(bp.get_raw()); + if (mb) { + return mb->mp; + } + return NULL; + } + + buffer::raw* buffer::create_msg( + unsigned len, char *buf, XioDispatchHook* m_hook) { + XioPool& pool = m_hook->get_pool(); + buffer::raw* bp = + static_cast<buffer::raw*>(pool.alloc(sizeof(xio_msg_buffer))); + new (bp) xio_msg_buffer(m_hook, buf, len); + return bp; + } +#endif /* HAVE_XIO */ + buffer::raw* buffer::copy(const char *c, unsigned len) { raw* r = new raw_char(len); memcpy(r->data, c, len); @@ -547,6 +642,10 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER; #endif } + buffer::raw* buffer::create_unshareable(unsigned len) { + return new raw_unshareable(len); + } + buffer::ptr::ptr(raw *r) : _raw(r), _off(0), _len(r->len) // no lock needed; this is an unref
raw. { r->nref.inc(); @@ -602,6 +701,18 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER; return _raw->clone(); } + buffer::ptr& buffer::ptr::make_shareable() { + if (_raw && !_raw->is_shareable()) { + buffer::raw *tr = _raw; + _raw = tr->clone(); + _raw->nref.set(1); + if (unlikely(tr->nref.dec() == 0)) { + delete tr; + } + } + return *this; + } + void buffer::ptr::swap(ptr& other) { raw *r = _raw; @@ -1065,12 +1176,23 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER; it != _buffers.end(); ++it) { if (p + it->length() > o) { - if (p >= o && p+it->length() <= o+l) - it->zero(); // all - else if (p >= o) - it->zero(0, o+l-p); // head - else - it->zero(o-p, it->length()-(o-p)); // tail + if (p >= o && p+it->length() <= o+l) { + // 'o'------------- l -----------| + // 'p'-- it->length() --| + it->zero(); + } else if (p >= o) { + // 'o'------------- l -----------| + // 'p'------- it->length() -------| + it->zero(0, o+l-p); + } else if (p + it->length() <= o+l) { + // 'o'------------- l -----------| + // 'p'------- it->length() -------| + it->zero(o-p, it->length()-(o-p)); + } else { + // 'o'----------- l -----------| + // 'p'---------- it->length() ----------| + it->zero(o-p, l); + } } p += it->length(); if (o+l <= p) @@ -1170,27 +1292,31 @@ void buffer::list::rebuild_page_aligned() } // sort-of-like-assignment-op - void buffer::list::claim(list& bl) + void buffer::list::claim(list& bl, unsigned int flags) { // free my buffers clear(); - claim_append(bl); + claim_append(bl, flags); } - void buffer::list::claim_append(list& bl) + void buffer::list::claim_append(list& bl, unsigned int flags) { // steal the other guy's buffers _len += bl._len; - _buffers.splice( _buffers.end(), bl._buffers ); + if (!(flags & CLAIM_ALLOW_NONSHAREABLE)) + bl.make_shareable(); + _buffers.splice(_buffers.end(), bl._buffers ); bl._len = 0; bl.last_p = bl.begin(); } - void buffer::list::claim_prepend(list& bl) + void buffer::list::claim_prepend(list& bl, unsigned int flags) { // steal the other guy's buffers _len += bl._len; - _buffers.splice( _buffers.begin(), bl._buffers ); + if (!(flags & CLAIM_ALLOW_NONSHAREABLE)) + bl.make_shareable(); + _buffers.splice(_buffers.begin(), bl._buffers ); bl._len = 0; bl.last_p = bl.begin(); } @@ -1379,9 +1505,24 @@ void buffer::list::rebuild_page_aligned() } if (off + len > curbuf->length()) { - // FIXME we'll just rebuild the whole list for now. 
- rebuild(); - return c_str() + orig_off; + bufferlist tmp; + unsigned l = off + len; + + do { + if (l >= curbuf->length()) + l -= curbuf->length(); + else + l = 0; + tmp.append(*curbuf); + curbuf = _buffers.erase(curbuf); + + } while (curbuf != _buffers.end() && l > 0); + + assert(l == 0); + + tmp.rebuild(); + _buffers.insert(curbuf, tmp._buffers.front()); + return tmp.c_str() + off; } return curbuf->c_str() + off; @@ -1747,6 +1888,15 @@ __u32 buffer::list::crc32c(__u32 crc) const return crc; } +void buffer::list::invalidate_crc() +{ + for (std::list<ptr>::const_iterator p = _buffers.begin(); p != _buffers.end(); ++p) { + raw *r = p->get_raw(); + if (r) { + r->invalidate_crc(); + } + } +} /** * Binary write all contents to a C++ stream @@ -1798,5 +1948,34 @@ std::ostream& operator<<(std::ostream& out, const buffer::raw &r) { return out << "buffer::raw(" << (void*)r.data << " len " << r.len << " nref " << r.nref.read() << ")"; } +std::ostream& operator<<(std::ostream& out, const buffer::ptr& bp) { + if (bp.have_raw()) + out << "buffer::ptr(" << bp.offset() << "~" << bp.length() + << " " << (void*)bp.c_str() + << " in raw " << (void*)bp.raw_c_str() + << " len " << bp.raw_length() + << " nref " << bp.raw_nref() << ")"; + else + out << "buffer::ptr(" << bp.offset() << "~" << bp.length() << " no raw)"; + return out; +} + +std::ostream& operator<<(std::ostream& out, const buffer::list& bl) { + out << "buffer::list(len=" << bl.length() << "," << std::endl; + + std::list<ptr>::const_iterator it = bl.buffers().begin(); + while (it != bl.buffers().end()) { + out << "\t" << *it; + if (++it == bl.buffers().end()) break; + out << "," << std::endl; + } + out << std::endl << ")"; + return out; +} + +std::ostream& operator<<(std::ostream& out, const buffer::error& e) +{ + return out << e.what(); +} } diff --git a/src/common/ceph_argparse.cc b/src/common/ceph_argparse.cc index a76c42422a927..858882baf0efd 100644 --- a/src/common/ceph_argparse.cc +++ b/src/common/ceph_argparse.cc @@ -45,6 +45,26 @@ #undef generic_dout #undef dendl +struct strict_str_convert { + const char *str; + std::string *err; + strict_str_convert(const char *str, std::string *err) + : str(str), err(err) {} + + inline operator float() const + { + return strict_strtof(str, err); + } + inline operator int() const + { + return strict_strtol(str, 10, err); + } + inline operator long long() const + { + return strict_strtoll(str, 10, err); + } +}; + void string_to_vec(std::vector<std::string>& args, std::string argstr) { istringstream iss(argstr); @@ -131,6 +151,50 @@ void vec_to_argv(const char *argv0, std::vector<const char*>& args, (*argv)[(*argc)++] = args[i]; } +void ceph_arg_value_type(const char * nextargstr, bool *bool_option, bool *bool_numeric) +{ + bool is_numeric = true; + bool is_float = false; + bool is_option; + + if (nextargstr == NULL) { + return; + } + + if (strlen(nextargstr) < 2) { + is_option = false; + } else { + is_option = (nextargstr[0] == '-') && (nextargstr[1] == '-'); + } + + for (unsigned int i = 0; i < strlen(nextargstr); i++) { + if (!(nextargstr[i] >= '0' && nextargstr[i] <= '9')) { + // May be a negative numeric value + if ((i == 0) && (strlen(nextargstr) >= 2)) { + if (nextargstr[0] == '-') + continue; + } + if ( (nextargstr[i] == '.') && (is_float == false) ) { + is_float = true; + continue; + } + + is_numeric = false; + break; + } + } + + // -
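To make the intent of the new ContextWQ concrete: a callback is wrapped in a Context subclass and completed asynchronously on the pool's worker threads. The following is a hedged sketch, not part of the patch; C_Example is hypothetical, and ThreadPool::start()/stop() are assumed from the existing ThreadPool interface, while drain(&wq) is the overload documented above.

  #include "common/WorkQueue.h"
  #include "include/Context.h"

  struct C_Example : public Context {
    void finish(int r) {
      // runs on one of the pool's worker threads with result code r
    }
  };

  void contextwq_example(CephContext *cct) {
    ThreadPool tp(cct, "example:tp", 2);   // ctor shown in this patch: two workers
    ContextWQ wq("example:wq", 60, &tp);   // 60s heartbeat timeout, no suicide timeout
    tp.start();
    wq.queue(new C_Example, 0);            // completes asynchronously with r = 0
    tp.drain(&wq);                         // wait until the queue has no items left
    tp.stop();
  }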
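The BitVector added in src/common/bit_vector.hpp packs fixed-width values into a bufferlist and guards the header and each page-sized data block with CRCs. A small round-trip sketch, using only the interface shown above:

  #include "common/bit_vector.hpp"

  void bit_vector_example() {
    ceph::BitVector<2> vec;          // 2 bits per element, 4 elements per byte
    vec.resize(16);
    vec[0] = 3;                      // values 0..3 fit in two bits
    uint8_t first = vec[0];
    assert(first == 3);

    bufferlist bl;
    vec.encode(bl);                  // header + page-aligned data + CRC footer

    ceph::BitVector<2> copy;
    bufferlist::iterator it = bl.begin();
    copy.decode(it);                 // throws buffer::malformed_input on CRC mismatch
    assert(copy == vec);
  }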
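The reworked blkdev.cc resolves a device node to its sysfs base name and then reads queue properties generically, which is how block_device_support_discard() is now implemented. A hedged sketch of composing the two helpers (the device path is illustrative):

  #include "common/blkdev.h"
  #include <limits.h>
  #include <stdio.h>

  void blkdev_example() {
    char base[PATH_MAX];
    // e.g. "/dev/sda3" -> "sda"; partitions are matched via /sys/block/<dev>/<part>
    if (get_block_device_base("/dev/sda3", base, sizeof(base)) == 0) {
      int64_t gran = get_block_device_int_property("/dev/sda3",
                                                   "discard_granularity");
      if (gran > 0)
        printf("%s supports discard (granularity %lld)\n", base, (long long)gran);
    }
  }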
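entity_addr_from_url() above accepts tcp:// and rdma:// URLs and resolves the host part through getaddrinfo(); note that, unlike most helpers here, it returns 0 on success and 1 on failure rather than a negative errno. A minimal sketch:

  #include "common/address_helper.h"

  void address_example() {
    entity_addr_t addr;
    if (entity_addr_from_url(&addr, "tcp://192.168.1.100:6789") == 0) {
      // addr now carries the resolved IPv4/IPv6 address; the port was set
      // via addr.set_port(6789)
    }
  }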
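The claim()/claim_append()/claim_prepend() changes in buffer.cc make buffer sharing explicit: unless a flag is passed, non-shareable raws (such as the XIO registered-memory buffers above) are cloned via make_shareable() before the list nodes are spliced. A sketch; the CLAIM_ALLOW_NONSHAREABLE constant is referenced by this patch but defined in include/buffer.h, which is not shown, so its exact scope is an assumption.

  #include "include/buffer.h"

  void claim_example(ceph::bufferlist &src) {
    ceph::bufferlist dst;

    // Default: any non-shareable raw in src is deep-copied first, so dst
    // only ever references shareable memory.
    dst.claim_append(src);

    // Opt-out, stealing raws as-is (assumed constant location):
    // dst.claim_append(src, buffer::list::CLAIM_ALLOW_NONSHAREABLE);
  }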
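strict_str_convert in ceph_argparse.cc is a small adapter: the type on the left-hand side of the assignment selects the parser via the matching implicit conversion operator. A sketch of how a caller inside ceph_argparse.cc might use it (the struct is file-local):

  void convert_example() {
    std::string err;
    float f = strict_str_convert("3.14", &err);     // operator float -> strict_strtof
    int i = strict_str_convert("42", &err);         // operator int -> strict_strtol(str, 10, err)
    long long ll = strict_str_convert("1099511627776", &err); // operator long long -> strict_strtoll
    if (!err.empty()) {
      // one of the strict parsers flagged a malformed numeric string
    }
  }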