diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 01d44ab3e..a430f5e8a 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -55,19 +55,19 @@ jobs: buser: qfsbuild runs_on: ubuntu-latest - distro: ubuntu - ver: 18.04 + ver: 20.04 codecov: yes btype: release buser: qfsbuild runs_on: ubuntu-latest - distro: ubuntu - ver: 18.04 + ver: 20.04 codecov: no btype: debug buser: qfsbuild runs_on: ubuntu-latest - distro: ubuntu - ver: 18.04 + ver: 20.04 codecov: no btype: release buser: root @@ -102,12 +102,6 @@ jobs: btype: release buser: qfsbuild runs_on: ubuntu-latest - - distro: debian - ver: 10 - codecov: no - btype: release - buser: qfsbuild - runs_on: ubuntu-latest - distro: centos ver: 6 codecov: no @@ -165,6 +159,7 @@ jobs: CODECOV: ${{ matrix.codecov }} BTYPE: ${{ matrix.btype }} BUSER: ${{ matrix.buser }} + UBUNTU_APT_OPTIONS: "-o Acquire:Queue-mode=host" steps: - name: Checkout code uses: actions/checkout@v2 diff --git a/CMakeLists.txt b/CMakeLists.txt index 4c5c3dc67..297d17bd7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,7 +4,7 @@ # Created 2006/10/20 # Author: Sriram Rao (Kosmix Corp) # -# Copyright 2008-2017 Quantcast Corporation. All rights reserved. +# Copyright 2008-2025 Quantcast Corporation. All rights reserved. # Copyright 2006 Kosmix Corp. # # This file is part of Quantcast File System. @@ -22,25 +22,25 @@ # permissions and limitations under the License. cmake_minimum_required(VERSION 2.8.4...10.0 FATAL_ERROR) -cmake_policy(VERSION ${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION}.${CMAKE_PATCH_VERSION}) +cmake_policy(VERSION + ${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION}.${CMAKE_PATCH_VERSION}) -IF (NOT DEFINED KFS_DIR_PREFIX) - set (KFS_DIR_PREFIX ${CMAKE_SOURCE_DIR}) -ENDIF (NOT DEFINED KFS_DIR_PREFIX) +if(NOT DEFINED KFS_DIR_PREFIX) + set(KFS_DIR_PREFIX ${CMAKE_SOURCE_DIR}) +endif() -project (QFS) +project(QFS) set(CMAKE_MODULE_PATH ${KFS_DIR_PREFIX}/cmake/Modules) -IF (NOT DEFINED Boost_NO_BOOST_CMAKE) - IF (${CMAKE_MAJOR_VERSION} EQUAL 2 AND ${CMAKE_MINOR_VERSION} EQUAL 8 AND - 5 LESS ${CMAKE_PATCH_VERSION}) +if(NOT DEFINED Boost_NO_BOOST_CMAKE) + if(${CMAKE_MAJOR_VERSION} EQUAL 2 AND ${CMAKE_MINOR_VERSION} EQUAL 8 AND + 5 LESS ${CMAKE_PATCH_VERSION}) # Turn off by default with 2.8.5 < cmake < 2.9.0 to # work around cmake28 and boost libraries problem on centos 6. # Boost_NO_BOOST_CMAKE was introduced int cmake 2.8.6 set(Boost_NO_BOOST_CMAKE ON) - ENDIF (${CMAKE_MAJOR_VERSION} EQUAL 2 AND ${CMAKE_MINOR_VERSION} EQUAL 8 AND - 5 LESS ${CMAKE_PATCH_VERSION}) -ENDIF (NOT DEFINED Boost_NO_BOOST_CMAKE) + endif() +endif() if(NOT DEFINED QFS_USE_STATIC_LIB_LINKAGE) set(QFS_USE_STATIC_LIB_LINKAGE ON) @@ -53,9 +53,8 @@ endif() set(Boost_USE_MULTITHREADED ON) # Require the packages we need to build -if(CMAKE_SYSTEM_NAME STREQUAL "Darwin" OR - CYGWIN OR - CMAKE_SYSTEM_NAME STREQUAL "FreeBSD") +if(CYGWIN OR + CMAKE_SYSTEM_NAME STREQUAL "FreeBSD") find_package(Boost COMPONENTS regex system REQUIRED) else() find_package(Boost COMPONENTS regex REQUIRED) @@ -67,46 +66,64 @@ elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin") set(CMAKE_INSTALL_RPATH @loader_path) endif() -if (CYGWIN) +if(CYGWIN) set(CMAKE_FIND_LIBRARY_PREFIXES ${CMAKE_FIND_LIBRARY_PREFIXES} "") - set(CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES} ".dll.a" ".lib") + set(CMAKE_FIND_LIBRARY_SUFFIXES + ${CMAKE_FIND_LIBRARY_SUFFIXES} ".dll.a" ".lib") + # For now only for cygwin to get strptime, and gnu specific glob flags. 
add_definitions(-D_GNU_SOURCE) endif() -if (QFS_OMIT_JNI) - message (STATUS "Not building QFS JNI") +if(QFS_OMIT_JNI) + message(STATUS "Not building QFS JNI") else() set(JAVA_AWT_LIBRARY NotNeeded) set(JAVA_JVM_LIBRARY NotNeeded) set(JAVA_AWT_INCLUDE_PATH NotNeeded) find_package(JNI REQUIRED) endif() + find_package(Jerasure REQUIRED) find_package(Krb5 REQUIRED krb5) find_package(OpenSSL REQUIRED) find_package(FUSE) add_definitions(-DBOOST_BIND_GLOBAL_PLACEHOLDERS) + +# BOOST_SP_USE_QUICK_ALLOCATOR and BOOST_SP_USE_STD_ALLOCATOR are deprecated in +# Boost 1.87 and later. +if(Boost_MAJOR_VERSION LESS 1 OR + (Boost_MAJOR_VERSION EQUAL 1 AND Boost_MINOR_VERSION LESS 87)) + add_definitions(-DBOOST_SP_USE_QUICK_ALLOCATOR) +endif() + # Define various options based on the library configuration we use if(KRB5_FLAVOR) add_definitions(-DKFS_KRB_USE_${KRB5_FLAVOR}) endif() + if(KRB5_USE_KRB5H) add_definitions(-DKFS_KRB_USE_KRB5H) endif() -if("${KRB5_FLAVOR}" STREQUAL "HEIMDAL" OR KRB5_HAS_krb5_get_init_creds_opt_set_out_ccache) + +if("${KRB5_FLAVOR}" STREQUAL "HEIMDAL" OR + KRB5_HAS_krb5_get_init_creds_opt_set_out_ccache) add_definitions(-DKFS_KRB_USE_KRB5_GET_INIT_CREDS_OPT) endif() + if(KRB5_HAS_krb5_unparse_name_flags_ext) add_definitions(-DKRB5_HAS_krb5_unparse_name_flags_ext) endif() + if(KRB5_HAS_krb5_unparse_name_ext) add_definitions(-DKRB5_HAS_krb5_unparse_name_ext) endif() + if(KRB5_HAS_krb5_free_keytab_entry_contents) add_definitions(-DKRB5_HAS_krb5_free_keytab_entry_contents) endif() + if(KRB5_HAS_krb5_kt_free_entry) add_definitions(-DKRB5_HAS_krb5_kt_free_entry) endif() @@ -125,38 +142,38 @@ endif() # Build with statically linked libraries; the value for this variable has to be # defined here overwriting whatever is in the cache. -# When setto ON, we build with statically linked libraries; when off we +# When set to ON, we build with statically linked libraries; when off we # link with dynamically linked libs +if(NOT DEFINED QFS_USE_STATIC_LIB_LINKAGE) + set(QFS_USE_STATIC_LIB_LINKAGE TRUE) +endif() -IF (NOT DEFINED QFS_USE_STATIC_LIB_LINKAGE) - set (QFS_USE_STATIC_LIB_LINKAGE TRUE) -ENDIF (NOT DEFINED QFS_USE_STATIC_LIB_LINKAGE) +if(QFS_USE_STATIC_LIB_LINKAGE) + message(STATUS "Build binaries with statically linked QFS libraries") +else() + message(STATUS "Build binaries with dynamically linked QFS libraries") +endif() -IF (QFS_USE_STATIC_LIB_LINKAGE) - message (STATUS "Build binaries with statically linked QFS libraries") -ELSE (QFS_USE_STATIC_LIB_LINKAGE) - message (STATUS "Build binaries with dynamically linked QFS libraries") -ENDIF (QFS_USE_STATIC_LIB_LINKAGE) -set (USE_STATIC_LIB_LINKAGE ${QFS_USE_STATIC_LIB_LINKAGE} CACHE BOOL +set(USE_STATIC_LIB_LINKAGE ${QFS_USE_STATIC_LIB_LINKAGE} CACHE BOOL "Build binaries with statically linked libraries" FORCE) if(ENABLE_COVERAGE) message(STATUS "Enabling code coverage with gcov") - set(CMAKE_CXX_FLAGS "-coverage") + set(CMAKE_CXX_FLAGS "-coverage") set(CMAKE_C_FLAGS "-coverage") endif() if(CMAKE_SYSTEM_NAME STREQUAL "Darwin") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=gnu++14") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=gnu++14") endif() if(ENABLE_PROFILING) message(STATUS "Enabling profiling with gprof") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pg") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pg") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pg") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pg") set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -pg") - set(CMAKE_EXE_FLAGS "${CMAKE_EXE_FLAGS} -pg") + 
set(CMAKE_EXE_FLAGS "${CMAKE_EXE_FLAGS} -pg") endif() # Change the line to Release to build release binaries @@ -169,14 +186,14 @@ if(CMAKE_BUILD_TYPE STREQUAL "Release") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D NDEBUG -g3") endif() -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -DBOOST_SP_USE_QUICK_ALLOCATOR") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall") string(TOUPPER KFS_OS_NAME_${CMAKE_SYSTEM_NAME} KFS_OS_NAME) -add_definitions (-D${KFS_OS_NAME}) +add_definitions(-D${KFS_OS_NAME}) -IF (ENABLE_IO_BUFFER_DEBUG) +if(ENABLE_IO_BUFFER_DEBUG) add_definitions(-DDEBUG_IOBuffer) message(STATUS "Enabled IO buffer debug") -ENDIF (ENABLE_IO_BUFFER_DEBUG) +endif() if(DEFINED QFS_EXTRA_CXX_OPTIONS) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${QFS_EXTRA_CXX_OPTIONS}") @@ -195,11 +212,13 @@ if(FUSE_FOUND) message(STATUS "fuse definitions: ${FUSE_DEFINITIONS}") message(STATUS "fuse version: ${FUSE_MAJOR_VERSION}.${FUSE_MINOR_VERSION}") add_definitions(${FUSE_DEFINITIONS}) + if(${FUSE_MAJOR_VERSION} LESS 3) add_definitions(-D FUSE_USE_VERSION=26) else() add_definitions(-D FUSE_USE_VERSION=30) endif() + include_directories(${FUSE_INCLUDE_DIRS}) else() message(STATUS "Not building qfs_fuse") @@ -215,10 +234,11 @@ include_directories( ${JAVA_INCLUDE_PATH2} ) -# get the subdirs we want -if (NOT QFS_OMIT_JNI) +# Get the subdirectories we want. +if(NOT QFS_OMIT_JNI) add_subdirectory(${KFS_DIR_PREFIX}/src/cc/access src/cc/access) endif() + add_subdirectory(${KFS_DIR_PREFIX}/src/cc/chunk src/cc/chunk) add_subdirectory(${KFS_DIR_PREFIX}/src/cc/common src/cc/common) add_subdirectory(${KFS_DIR_PREFIX}/src/cc/devtools src/cc/devtools) @@ -242,7 +262,7 @@ if(FUSE_FOUND) add_subdirectory(${KFS_DIR_PREFIX}/src/cc/fuse src/cc/fuse) endif() -add_custom_target ( +add_custom_target( rat ${KFS_DIR_PREFIX}/scripts/rat.sh ${KFS_DIR_PREFIX} COMMENT "Running license release audit tool (rat)" VERBATIM diff --git a/Makefile b/Makefile index 67ad33f05..9a1da4d61 100644 --- a/Makefile +++ b/Makefile @@ -49,8 +49,8 @@ run-cmake: dir build: run-cmake cd build/${BUILD_TYPE} && $(MAKE) ${MAKE_OPTIONS} install \ `${QFS_MSTRESS_ON} && \ - echo ${QFSHADOOP_VERSIONS} | grep 2.10.2 >/dev/null 2>&1 && \ - mvn --version >/dev/null 2>&1 && echo mstress-tarball` + echo ${QFSHADOOP_VERSIONS} | grep '3\.4\.1' >/dev/null 2>&1 && \ + mvn --version >/dev/null 2>&1 && echo mstress-bootstrap mstress-tarball` .PHONY: java java: build diff --git a/README.md b/README.md index 31cabefc1..ebcd1f034 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,8 @@ # QFS version 2 -QFS version 2.2.7 +Release 2.2.8 + +- Release 2.2.8 contains an updated Java module, Java 9+ support, Hadoop 3 benchmark support, and minor improvements. [Release notes](https://github.com/quantcast/qfs/wiki/Release-Notes). Release 2.2.7 @@ -64,7 +66,7 @@ Meta server replication is the major new feature in release 2.0.0. Meta server replication provides automatic meta server fail over. With meta server replication configured QFS does not have single point of failure. -Release notes are available [here](https://github.com/quantcast/qfs/wiki/Release-Notes). +[Release notes are available](https://github.com/quantcast/qfs/wiki/Release-Notes). ## Quantcast File System @@ -122,54 +124,78 @@ yourself with QFS is very easy. 1. Extract the distribution tarball. - $ tar -xzf qfs.tgz && cd qfs - -1. Set up a single node QFS instance. This will create a workspace in - `~/qfsbase`, start two chunk servers and one metaserver. 
+ ```sh + tar -xzf qfs.tgz && cd qfs + ``` + +2. Set up a single node QFS instance. This will create a workspace in + `~/qfsbase`, start two chunk servers and one metaserver. + + ```sh + examples/sampleservers/sample_setup.py -a install + ``` + + ```console + Binaries presence checking - OK. + Setup directories - OK. + Setup config files - OK. + Started servers - OK. + ``` + +3. Add the tools binary path to `PATH`. + + ```sh + PATH=${PWD}/bin/tools:${PATH} + ``` + +4. Make a temporary directory on the file system. + + ```sh + qfsshell -s localhost -p 20000 -q -- mkdir /qfs/tmp + ``` + +5. Create a file containing "Hello World", Reed-Solomon encoded, with + replication 1. + + ```sh + echo 'Hello World' | cptoqfs -s localhost -p 20000 -S -k /qfs/tmp/helloworld -d - + ``` + +6. Cat the file content. - $ ./examples/sampleservers/sample_setup.py -a install - Binaries presence checking - OK. - Setup directories - OK. - Setup config files - OK. - Started servers - OK. + ```sh + qfscat -s localhost -p 20000 /qfs/tmp/helloworld + ``` -1. Add tools binary path to`PATH` + +7. Stat the file to see encoding (RS or not), replication level, and mtime. - $ PATH=${PWD}/bin/tools:${PATH} - -1. Make a temporary directory on the file system + ```sh + qfsshell -s localhost -p 20000 -q -- stat /qfs/tmp/helloworld + ``` - $ qfsshell -s localhost -p 20000 -q -- mkdir /qfs/tmp - -1. Create a file containing "Hello World", Reed-Solomon encoded, with - replication 1. +8. Copy the file locally to the current directory. - $ echo 'Hello World' | cptoqfs -s localhost -p 20000 -S -k /qfs/tmp/helloworld -d - - -1. Cat the file content. + ```sh + cpfromqfs -s localhost -p 20000 -k /qfs/tmp/helloworld -d ./helloworld + ``` - $ qfscat -s localhost -p 20000 /qfs/tmp/helloworld - -1. Stat the file to see encoding (RS or not), replication level, and mtime. +9. Remove the file from QFS. - $ qfsshell -s localhost -p 20000 -q -- stat /qfs/tmp/helloworld - -1. Copy the file locally to the current directory. + ```sh + qfsshell -s localhost -p 20000 -q -- rm /qfs/tmp/helloworld + ``` - $ cpfromqfs -s localhost -p 20000 -k /qfs/tmp/helloworld -d ./helloworld - -1. Remove the file from QFS. +10. Stop the servers. - $ qfsshell -s localhost -p 20000 -q -- rm /qfs/tmp/helloworld - -1. Stop the servers. + ```sh + examples/sampleservers/sample_setup.py -a stop + ``` - $ ./examples/sampleservers/sample_setup.py -a stop - -1. Uninstall the single node instance. +11. Uninstall the single node instance. - $ ./examples/sampleservers/sample_setup.py -a uninstall - + ```sh + examples/sampleservers/sample_setup.py -a uninstall + ``` ## Benchmarking QFS diff --git a/benchmarks/mstress/CMakeLists.txt b/benchmarks/mstress/CMakeLists.txt index acb6f21b4..2f168d4a1 100644 --- a/benchmarks/mstress/CMakeLists.txt +++ b/benchmarks/mstress/CMakeLists.txt @@ -1,6 +1,6 @@ # $Id$ # -# Copyright 2012,2016 Quantcast Corporation. All rights reserved. +# Copyright 2012-2025 Quantcast Corporation. All rights reserved. # # This file is part of Quantcast File System (QFS). 
# @@ -21,29 +21,38 @@ add_executable(mstress_client EXCLUDE_FROM_ALL mstress_client.cc) -IF(USE_STATIC_LIB_LINKAGE) +if(USE_STATIC_LIB_LINKAGE) add_dependencies(mstress_client kfsClient) target_link_libraries(mstress_client kfsClient) -ELSE(USE_STATIC_LIB_LINKAGE) +else(USE_STATIC_LIB_LINKAGE) add_dependencies(mstress_client kfsClient-shared) target_link_libraries(mstress_client kfsClient-shared) -ENDIF(USE_STATIC_LIB_LINKAGE) +endif() + +if(NOT DEFINED MSTRESS_HADOOP_VERSION) + if(DEFINED ENV{MSTRESS_HADOOP_VERSION}) + set(MSTRESS_HADOOP_VERSION $ENV{MSTRESS_HADOOP_VERSION}) + else() + set(MSTRESS_HADOOP_VERSION 3.4.1) + endif() +endif() add_custom_command( - OUTPUT mstress.jar + OUTPUT mstress.jar mstress-jar-with-dependencies.jar COMMAND sh ${CMAKE_CURRENT_SOURCE_DIR}/../../src/java/javabuild.sh -d ${CMAKE_CURRENT_SOURCE_DIR} -v ${CMAKE_CURRENT_SOURCE_DIR}/../../src/cc/common/buildversgit.sh -- -DbuildDirectory=${CMAKE_CURRENT_BINARY_DIR} - -Dhadoop.release.version=2.10.2 + -Dhadoop.release.version=${MSTRESS_HADOOP_VERSION} package DEPENDS src/main/java/com/quantcast/qfs/mstress/MStress_Client.java pom.xml COMMENT "The HDFS mstress client bundled as a jar." VERBATIM ) -add_custom_target(mstress DEPENDS mstress_client mstress.jar) +add_custom_target(mstress DEPENDS + mstress_client mstress.jar mstress-jar-with-dependencies.jar) set(mstress_scripts mstress_plan.py @@ -72,6 +81,7 @@ add_custom_target( COMMAND cd .. && ${CMAKE_COMMAND} -E tar czvf mstress.tgz mstress/mstress_client mstress/mstress.jar + mstress/mstress-jar-with-dependencies.jar mstress/*.py mstress/*.sh DEPENDS mstress @@ -109,6 +119,7 @@ add_custom_target( COMMAND cd .. && ${CMAKE_COMMAND} -E tar czvf mstress-bootstrap.tgz mstress/mstress_client mstress/mstress.jar + mstress/mstress-jar-with-dependencies.jar mstress/*.py mstress/*.sh mstress/bin/metaserver @@ -119,6 +130,17 @@ add_custom_target( ) add_dependencies(mstress-bootstrap mstress-scripts) +set(additional_files_to_clean + ../mstress.tgz + ../mstress-bootstrap.tgz + mstress.jar + mstress-jar-with-dependencies.jar + classes + bin + webui + setup.py + ${mstress_scripts} +) set_directory_properties(PROPERTIES - ADDITIONAL_MAKE_CLEAN_FILES "../mstress.tgz;mstress.jar;${mstress_scripts}" + ADDITIONAL_MAKE_CLEAN_FILES "${additional_files_to_clean}" ) diff --git a/benchmarks/mstress/mstress.py b/benchmarks/mstress/mstress.py index 8ec51484b..173dd5d4b 100755 --- a/benchmarks/mstress/mstress.py +++ b/benchmarks/mstress/mstress.py @@ -467,17 +467,17 @@ def ReadPlanFile(opts): if line.startswith("#"): continue if line.startswith("hostslist="): - hostsList = line[len("hostslist=") :].strip().split(",") + hostsList = line[len("hostslist="):].strip().split(",") elif line.startswith("clientsperhost="): - clientsPerHost = int(line[len("clientsperhost=") :].strip()) + clientsPerHost = int(line[len("clientsperhost="):].strip()) elif line.startswith("type="): - leafType = line[len("type=") :].strip() + leafType = line[len("type="):].strip() elif line.startswith("levels="): - numLevels = int(line[len("levels=") :].strip()) + numLevels = int(line[len("levels="):].strip()) elif line.startswith("nstat="): - numToStat = int(line[len("nstat=") :].strip()) + numToStat = int(line[len("nstat="):].strip()) elif line.startswith("inodes="): - nodesPerLevel = int(line[len("inodes=") :].strip()) + nodesPerLevel = int(line[len("inodes="):].strip()) planfile.close() if None in ( hostsList, @@ -546,7 +546,9 @@ def SetGlobalPaths(opts): Globals.SERVER_CMD = Globals.KFS_SERVER_CMD 
Globals.SERVER_KEYWORD = Globals.KFS_SERVER_KEYWORD elif opts.filesystem == "hdfs": - Globals.CLIENT_PATH = "java -Xmx256m -jar %s/mstress.jar" % mydir + Globals.CLIENT_PATH = ( + "java -Xmx256m -jar %s/mstress-jar-with-dependencies.jar" % mydir + ) Globals.SERVER_CMD = Globals.HDFS_SERVER_CMD Globals.SERVER_KEYWORD = Globals.HDFS_SERVER_KEYWORD else: diff --git a/benchmarks/mstress/pom.xml b/benchmarks/mstress/pom.xml index 9ad720bb6..296aac8ad 100644 --- a/benchmarks/mstress/pom.xml +++ b/benchmarks/mstress/pom.xml @@ -19,70 +19,98 @@ permissions and limitations under the License. --> + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> - 4.0.0 - com.quantcast.qfs - mstress - jar - 1.0.0 - mstress - http://quantcast.github.com/qfs/benchmarks/mstress + 4.0.0 + com.quantcast.qfs + mstress + jar + 1.0.0 + mstress + http://quantcast.github.com/qfs/benchmarks/mstress - - UTF-8 - ${project.basedir}/../../build/java/mstress - 2.10.2 - 00000000 - 0 - + + UTF-8 + ${project.basedir}/../../build/java/mstress + 3.4.1 + 00000000 + 0 + - - ${buildDirectory} - - mstress - - - org.apache.maven.plugins - maven-compiler-plugin - 2.4 - - - org.apache.maven.plugins - maven-jar-plugin - 2.2 - - - true - - QFS Mstress - ${qfs.release.version}-${qfs.source.revision} - Quantcast Corp. - - - - MStress_Client - - - - - - + + ${buildDirectory} + + mstress + + + org.apache.maven.plugins + maven-compiler-plugin + 2.4 + + + org.apache.maven.plugins + maven-jar-plugin + 2.2 + + + true + + QFS Mstress + ${qfs.release.version}-${qfs.source.revision} + Quantcast Corp. + + + + com.quantcast.qfs.mstress.MStress_Client + + + + + + maven-assembly-plugin + + + package + + single + + + + + + true + + QFS Mstress + ${qfs.release.version}-${qfs.source.revision} + Quantcast Corp. + + + + com.quantcast.qfs.mstress.MStress_Client + + + + jar-with-dependencies + + + + + - - - org.apache.hadoop - hadoop-common - ${hadoop.release.version} - compile - - - org.apache.hadoop - hadoop-hdfs - ${hadoop.release.version} - compile - - + + + org.apache.hadoop + hadoop-common + ${hadoop.release.version} + compile + + + org.apache.hadoop + hadoop-hdfs + ${hadoop.release.version} + compile + + \ No newline at end of file diff --git a/benchmarks/mstress/src/main/java/com/quantcast/qfs/mstress/MStress_Client.java b/benchmarks/mstress/src/main/java/com/quantcast/qfs/mstress/MStress_Client.java index 7c4e43a49..64d7c14f3 100644 --- a/benchmarks/mstress/src/main/java/com/quantcast/qfs/mstress/MStress_Client.java +++ b/benchmarks/mstress/src/main/java/com/quantcast/qfs/mstress/MStress_Client.java @@ -6,458 +6,419 @@ * Copyright 2012,2016 Quantcast Corporation. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not - * use this file except in compliance with the License. You may obtain a copy - * of the License at + * use this file except in compliance with the License. You may obtain a copy of + * the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations - * under the License. + * License for the specific language governing permissions and limitations under + * the License. 
* * This Java client performs filesystem meta operations on the Hadoop namenode * using HDFS DFSClient. */ +package com.quantcast.qfs.mstress; import java.io.BufferedReader; import java.io.DataInputStream; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; -import java.net.InetSocketAddress; -import java.util.Arrays; +import java.util.ArrayList; import java.util.Collections; import java.util.Date; import java.util.LinkedList; +import java.util.List; import java.util.Queue; import java.util.Random; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.hdfs.DFSClient; -import org.apache.hadoop.hdfs.protocol.DirectoryListing; -import org.apache.hadoop.hdfs.protocol.HdfsFileStatus; - -public class MStress_Client -{ - static final String TEST_BASE_DIR = new String("/mstress"); - - static DFSClient dfsClient_ = null; - static StringBuilder path_ = new StringBuilder(4096); - static int pathLen_ = 0; - static int totalCreateCount = 0; - static final int COUNT_INCR = 500; - - //From commandline - static String dfsServer_ = ""; - static int dfsPort_ = 0; - static String testName_ = ""; - static String prefix_ = ""; - static int prefixLen_ = 0; - static String planfilePath_ = ""; - static String hostName_ = ""; - static String processName_ = ""; - - //From plan file - static String type_ = ""; - static int levels_ = 0; - static int inodesPerLevel_ = 0; - static int pathsToStat_ = 0; - - private static void pathPush(String leafStr) { - int leafLen = leafStr.length(); - if (leafLen == 0) { - return; - } - if (leafStr.charAt(0) != '/') { - path_.insert(pathLen_, "/"); - System.out.printf("Leaf = %s, path_ = [%s]\n", leafStr, path_.toString()); - pathLen_ ++; - } - path_.insert(pathLen_, leafStr); - System.out.printf("After push Leaf = %s, path_ = [%s]\n", leafStr, path_.toString()); - pathLen_ += leafLen; - } - - private static void pathPop(String leafStr) { - int leafLen = leafStr.length(); - if (leafLen > pathLen_ - 1) { - System.out.printf("Error in pop: %s from %s, leafLen = %d, pathLen_ = %d\n", leafStr, path_.toString(), leafLen, pathLen_); - return; - } - String lastPart = path_.substring(pathLen_ - leafLen, pathLen_); - System.out.printf("lastPart = [%s - %s] leafStr = [%s - %s]\n", lastPart, lastPart.getClass().getName(), leafStr, leafStr.getClass().getName()); +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; - if (!leafStr.equals(lastPart)) { - System.out.printf("Error in pop: %s from %s\n", leafStr, path_.toString()); - System.exit(1); - return; - } - pathLen_ -= leafLen + 1; - path_.insert(pathLen_, '\0'); - System.out.printf("After pop, path_ = [%s]\n", path_.toString()); - } - - private static void pathReset() { - path_.insert(0, '\0'); - pathLen_ = 0; - } - - - public static void main(String args[]) { - parseOptions(args); - int result = 0; - - try { - Configuration conf = new Configuration(true); - String confSet = "hdfs://" + dfsServer_ + ":" + dfsPort_; - conf.set("fs.default.name", confSet); - conf.set("fs.trash.interval", "0"); - InetSocketAddress inet = new InetSocketAddress(dfsServer_, dfsPort_); - dfsClient_ = new DFSClient(inet, conf); - - if (parsePlanFile() < 0) { - System.exit(-1); - } - - if (testName_.equals("create")) { - result = createDFSPaths(); - } else if (testName_.equals("stat")) { - result = statDFSPaths(); - } else if (testName_.equals("readdir")) { - result = listDFSPaths(); - } else if (testName_.equals("delete")) { - result = 
removeDFSPaths(); - } else { - System.out.printf("Error: unrecognized test \'%s\'\n", testName_); - System.exit(-1); - } - } catch( IOException e) { - e.printStackTrace(); - System.exit(-1); - } +public class MStress_Client { - if (result != 0) { - System.exit(-1); - } + static final Path TEST_BASE_DIR = new Path(Path.SEPARATOR, "mstress"); - return; - } + static FileSystem dfs_ = null; + static int totalCreateCount = 0; + static final int COUNT_INCR = 500; - private static void parseOptions(String args[]) - { - if (!(args.length == 14 || args.length == 12 || args.length == 5)) { - usage(); - } + //From commandline + static String dfsServer_ = ""; + static int dfsPort_ = 0; + static String testName_ = ""; + static String prefix_ = ""; + static String planfilePath_ = ""; + static String hostName_ = ""; + static String processName_ = ""; - for (int i = 0; i < args.length; i++) { - if (args[i].equals("-s") && i+1 < args.length) { - dfsServer_ = args[i+1]; - System.out.println(args[i+1]); - i++; - } else if (args[i].equals("-p") && i+1 < args.length) { - dfsPort_ = Integer.parseInt(args[i+1]); - System.out.println(args[i+1]); - i++; - } else if (args[i].equals("-t") && i+1 < args.length) { - testName_ = args[i+1]; - System.out.println(args[i+1]); - i++; - } else if (args[i].equals("-a") && i+1 < args.length) { - planfilePath_ = args[i+1]; - System.out.println(args[i+1]); - i++; - } else if (args[i].equals("-c") && i+1 < args.length) { - hostName_ = args[i+1]; - System.out.println(args[i+1]); - i++; - } else if (args[i].equals("-n") && i+1 < args.length) { - processName_ = args[i+1]; - System.out.println(args[i+1]); - i++; - } else if (args[i].equals("-P") && i+1 < args.length) { - prefix_ = args[i+1]; - System.out.println(args[i+1]); - i++; - } - } + //From plan file + static String type_ = ""; + static int levels_ = 0; + static int inodesPerLevel_ = 0; + static int pathsToStat_ = 0; + static boolean _debugTrace = false; - if (dfsServer_.length() == 0 || - testName_.length() == 0 || - planfilePath_.length() == 0 || - hostName_.length() == 0 || - processName_.length() == 0 || - dfsPort_ == 0) { - usage(); + public static void main(String args[]) { + parseOptions(args); + + try { + Configuration conf = new Configuration(true); + String confSet = "hdfs://" + dfsServer_ + ":" + dfsPort_; + conf.set("fs.default.name", confSet); + conf.set("fs.trash.interval", "0"); + dfs_ = FileSystem.get(conf); + + if (parsePlanFile() < 0) { + System.exit(-1); + } + + int result; + if (testName_.equals("create")) { + result = createDFSPaths(); + } else if (testName_.equals("stat")) { + result = statDFSPaths(); + } else if (testName_.equals("readdir")) { + result = listDFSPaths(); + } else if (testName_.equals("delete")) { + result = removeDFSPaths(); + } else { + System.out.printf("Error: unrecognized test \'%s\'\n", testName_); + result = -1; + } + if (result != 0) { + System.exit(-1); + } + } catch (IOException e) { + printStackTrace(e); + System.exit(-1); + } + System.exit(0); } - if (prefix_ == null) { - prefix_ = new String("PATH_"); + + private static void printStackTrace(Throwable e) { + e.printStackTrace(System.err); } - prefixLen_ = prefix_.length(); - } - - private static void usage() - { - String className = MStress_Client.class.getName(); - System.out.printf("Usage: java %s -s dfs-server -p dfs-port [-t [create|stat|readdir|rmdir] -a planfile-path -c host -n process-name -P prefix]\n", - className); - System.out.printf(" -t: this option requires -a, -c, and -n options.\n"); - System.out.printf(" -P: 
default prefix is PATH_.\n"); - System.out.printf("eg:\n"); - System.out.printf(" java %s -s -p -t create -a -c localhost -n Proc_00\n", className); - System.exit(1); - } - - private static int parsePlanFile() - { - int ret = -1; - try { - FileInputStream fis = new FileInputStream(planfilePath_); - DataInputStream dis = new DataInputStream(fis); - BufferedReader br = new BufferedReader(new InputStreamReader(dis)); - - if (prefix_.isEmpty()) { - prefix_ = "PATH_"; - } - - String line; - while ((line = br.readLine()) != null) { - if (line.length() == 0 || line.startsWith("#")) { - continue; - } - if (line.startsWith("type=")) { - type_ = line.substring(5); - continue; - } - if (line.startsWith("levels=")) { - levels_ = Integer.parseInt(line.substring(7)); - continue; + + private static void parseOptions(String args[]) { + for (int i = 0; i < args.length; i++) { + if (args[i].equals("-s") && i + 1 < args.length) { + dfsServer_ = args[i + 1]; + System.out.println(args[i + 1]); + i++; + } else if (args[i].equals("-p") && i + 1 < args.length) { + dfsPort_ = Integer.parseInt(args[i + 1]); + System.out.println(args[i + 1]); + i++; + } else if (args[i].equals("-t") && i + 1 < args.length) { + testName_ = args[i + 1]; + System.out.println(args[i + 1]); + i++; + } else if (args[i].equals("-a") && i + 1 < args.length) { + planfilePath_ = args[i + 1]; + System.out.println(args[i + 1]); + i++; + } else if (args[i].equals("-c") && i + 1 < args.length) { + hostName_ = args[i + 1]; + System.out.println(args[i + 1]); + i++; + } else if (args[i].equals("-n") && i + 1 < args.length) { + processName_ = args[i + 1]; + System.out.println(args[i + 1]); + i++; + } else if (args[i].equals("-P") && i + 1 < args.length) { + prefix_ = args[i + 1]; + System.out.println(args[i + 1]); + i++; + } else if (args[i].equals("-h") || args[i].equals("--help")) { + usage(0); + } else { + System.err.println("invalid argument: " + args[i]); + usage(1); + } } - if (line.startsWith("inodes=")) { - inodesPerLevel_ = Integer.parseInt(line.substring(7)); - continue; + + if (dfsServer_.isEmpty() + || testName_.isEmpty() + || planfilePath_.isEmpty() + || hostName_.isEmpty() + || processName_.isEmpty() + || dfsPort_ <= 0) { + usage(1); } - if (line.startsWith("nstat=")) { - pathsToStat_ = Integer.parseInt(line.substring(6)); - continue; + if (prefix_ == null) { + prefix_ = "PATH_"; } - } - dis.close(); - if (levels_ > 0 && !type_.isEmpty() && inodesPerLevel_ > 0 && pathsToStat_ > 0) { - ret = 0; - } - } catch (Exception e) { - System.out.println("Error: " + e.getMessage()); - } - return ret; - } - - private static long timeDiffMilliSec(Date alpha, Date zigma) - { - return zigma.getTime() - alpha.getTime(); - } - - private static int CreateDFSPaths(int level, String parentPath) { - Boolean isLeaf = false; - Boolean isDir = false; - if (level + 1 >= levels_) { - isLeaf = true; - } - if (isLeaf) { - if (type_.equals("dir")) { - isDir = true; - } else { - isDir = false; - } - } else { - isDir = true; } - int err = 0; - for (int i = 0; i < inodesPerLevel_; i++) { - String path = parentPath + "/" + prefix_ + Integer.toString(i); - //System.out.printf("Creating (isdir=%b) [%s]\n", isDir, path.toString()); + private static void usage(int status) { + final String className = MStress_Client.class.getName(); + (0 == status ? 
System.out : System.err).printf( + "Usage: java %s -s dfs-server -p dfs-port" + + " [-t [create|stat|readdir|rmdir] -a planfile-path -c host" + + " -n process-name -P prefix]\n" + + " -t: this option requires -a, -c, and -n options.\n" + + " -P: default prefix is PATH_.\n" + + "eg:\n" + + " java %s -s -p -t" + + " create -a -c localhost -n Proc_00\n", + className, className + ); + System.exit(status); + } - if (isDir) { + @SuppressWarnings("UseSpecificCatch") + private static int parsePlanFile() { + int ret = -1; try { - if (dfsClient_.mkdirs(path) == false) { - System.out.printf("Error in mkdirs(%s)\n", path); - return -1; - } - totalCreateCount ++; - if (totalCreateCount % COUNT_INCR == 0) { - System.out.printf("Created paths so far: %d\n", totalCreateCount); - } - if (!isLeaf) { - if (CreateDFSPaths(level+1, path) < 0) { - System.out.printf("Error in CreateDFSPaths(%s)\n", path); - return -1; + FileInputStream fis = new FileInputStream(planfilePath_); + DataInputStream dis = new DataInputStream(fis); + BufferedReader br = new BufferedReader(new InputStreamReader(dis)); + + if (prefix_.isEmpty()) { + prefix_ = "PATH_"; } - } - } catch(IOException e) { - e.printStackTrace(); - return -1; - } - } else { - try { - dfsClient_.create(path, true); - totalCreateCount ++; - if (totalCreateCount % COUNT_INCR == 0) { - System.out.printf("Created paths so far: %d\n", totalCreateCount); - } - } catch( IOException e) { - e.printStackTrace(); - return -1; + + String line; + while ((line = br.readLine()) != null) { + if (line.length() == 0 || line.startsWith("#")) { + continue; + } + if (line.startsWith("type=")) { + type_ = line.substring(5); + continue; + } + if (line.startsWith("levels=")) { + levels_ = Integer.parseInt(line.substring(7)); + continue; + } + if (line.startsWith("inodes=")) { + inodesPerLevel_ = Integer.parseInt(line.substring(7)); + continue; + } + if (line.startsWith("nstat=")) { + pathsToStat_ = Integer.parseInt(line.substring(6)); + } + } + dis.close(); + if (levels_ > 0 && !type_.isEmpty() && inodesPerLevel_ > 0 + && pathsToStat_ > 0) { + ret = 0; + } + } catch (Exception e) { + System.out.println("Error: " + e.getMessage()); } - } + return ret; } - return 0; - } - - private static int createDFSPaths() - { - String basePath = new String(TEST_BASE_DIR) + "/" + hostName_ + "_" + processName_; - try { - Boolean ret = dfsClient_.mkdirs(basePath); - if (!ret) { - System.out.printf("Error: failed to create test base dir [%s]\n", basePath); - return -1; - } - } catch( IOException e) { - e.printStackTrace(); - throw new RuntimeException(); + + private static long timeDiffMilliSec(Date alpha, Date zigma) { + return zigma.getTime() - alpha.getTime(); } - Date alpha = new Date(); + private static int CreateDFSPaths(int level, Path parentPath) { + final Boolean isLeaf = level + 1 >= levels_; + final Boolean isDir = !isLeaf || type_.equals("dir"); + + for (int i = 0; i < inodesPerLevel_; i++) { + Path path = new Path(parentPath, prefix_ + Integer.toString(i)); + if (_debugTrace) { + System.out.printf( + "Creating (isdir=%b) [%s]\n", + isDir, path.toString() + ); + } - if (CreateDFSPaths(0, basePath) < 0) { - return -1; + if (isDir) { + try { + if (!dfs_.mkdirs(path)) { + System.out.printf("Error in mkdirs(%s)\n", path); + return -1; + } + totalCreateCount++; + if (totalCreateCount % COUNT_INCR == 0) { + System.out.printf( + "Created paths so far: %d\n", totalCreateCount); + } + if (!isLeaf) { + if (CreateDFSPaths(level + 1, path) < 0) { + System.out.printf( + "Error in CreateDFSPaths(%s)\n", 
path); + return -1; + } + } + } catch (IOException e) { + printStackTrace(e); + return -1; + } + } else { + try { + dfs_.create(path, true); + totalCreateCount++; + if (totalCreateCount % COUNT_INCR == 0) { + System.out.printf( + "Created paths so far: %d\n", totalCreateCount); + } + } catch (IOException e) { + printStackTrace(e); + return -1; + } + } + } + return 0; } - Date zigma = new Date(); - System.out.printf("Client: %d paths created in %d msec\n", totalCreateCount, timeDiffMilliSec(alpha, zigma)); - return 0; - } - - private static int statDFSPaths() - { - String basePath = new String(TEST_BASE_DIR) + "/" + hostName_ + "_" + processName_; - - Date alpha = new Date(); - Random random = new Random(alpha.getTime()); - - for (int count = 0; count < pathsToStat_; count++) { - String path = basePath; - for (int d = 0; d < levels_; d++) { - int randIdx = random.nextInt(inodesPerLevel_); - String name = new String(prefix_) + Integer.toString(randIdx); - path = path + "/" + name; - } - - //System.out.printf("Doing stat on [%s]\n", path); - HdfsFileStatus stat = null; - try { - stat = dfsClient_.getFileInfo(path); - } catch(IOException e) { - e.printStackTrace(); - return -1; - } - if (count % COUNT_INCR == 0) { - System.out.printf("Stat paths so far: %d\n", count); - } - } - Date zigma = new Date(); - System.out.printf("Client: Stat done on %d paths in %d msec\n", pathsToStat_, timeDiffMilliSec(alpha, zigma)); - return 0; - } - - private static int listDFSPaths() - { - Date alpha = new Date(); - int inodeCount = 0; - - String basePath = new String(TEST_BASE_DIR) + "/" + hostName_ + "_" + processName_; - Queue pending = new LinkedList(); - pending.add(basePath); - - while (!pending.isEmpty()) { - String parent = pending.remove(); - DirectoryListing thisListing; - try { - thisListing = dfsClient_.listPaths(parent, HdfsFileStatus.EMPTY_NAME); - if (thisListing == null || thisListing.getPartialListing().length == 0) { - //System.out.println("Empty directory"); - continue; + private static int createDFSPaths() { + Path basePath = new Path(TEST_BASE_DIR, hostName_ + "_" + processName_); + try { + if (!dfs_.mkdirs(basePath)) { + System.out.printf( + "Error: failed to create test base dir [%s]\n", + basePath.toString() + ); + return -1; + } + } catch (IOException e) { + printStackTrace(e); + throw new RuntimeException(); } - do { - HdfsFileStatus[] children = thisListing.getPartialListing(); - for (int i = 0; i < children.length; i++) { - String localName = children[i].getLocalName(); - //System.out.printf("Readdir going through [%s/%s]\n", parent, localName); - if (localName.equals(".") || localName.equals("..")) { - continue; + + Date alpha = new Date(); + + if (CreateDFSPaths(0, basePath) < 0) { + return -1; + } + + final Date zigma = new Date(); + System.out.printf( + "Client: %d paths created in %d msec\n", + totalCreateCount, timeDiffMilliSec(alpha, zigma) + ); + return 0; + } + + private static int statDFSPaths() { + final Path basePath + = new Path(TEST_BASE_DIR, hostName_ + "_" + processName_); + final Date alpha = new Date(); + final Random random = new Random(alpha.getTime()); + + for (int count = 0; count < pathsToStat_; count++) { + Path path = basePath; + for (int d = 0; d < levels_; d++) { + int randIdx = random.nextInt(inodesPerLevel_); + String name = prefix_ + Integer.toString(randIdx); + path = new Path(path, name); + } + + if (_debugTrace) { + System.out.printf("Doing stat on [%s]\n", path); + } + try { + dfs_.getFileStatus(path); + } catch (IOException e) { + 
printStackTrace(e); + return -1; } - inodeCount ++; - if (inodeCount % COUNT_INCR == 0) { - System.out.printf("Readdir paths so far: %d\n", inodeCount); + if (count % COUNT_INCR == 0) { + System.out.printf("Stat paths so far: %d\n", count); } - if (children[i].isDir()) { - pending.add(parent + "/" + localName); + } + final Date zigma = new Date(); + System.out.printf( + "Client: Stat done on %d paths in %d msec\n", + pathsToStat_, timeDiffMilliSec(alpha, zigma) + ); + return 0; + } + + private static int listDFSPaths() { + final Date alpha = new Date(); + final Queue pending = new LinkedList(); + pending.add(new Path(TEST_BASE_DIR, hostName_ + "_" + processName_)); + + int inodeCount = 0; + while (!pending.isEmpty()) { + Path parent = pending.remove(); + try { + FileStatus[] children = dfs_.listStatus(parent); + for (FileStatus child : children) { + String localName = child.getPath().getName(); + if (_debugTrace) { + System.out.printf( + "Readdir going through [%s/%s]\n", + parent.toString(), localName + ); + } + if (localName.equals(".") || localName.equals("..")) { + continue; + } + inodeCount++; + if (inodeCount % COUNT_INCR == 0) { + System.out.printf( + "Readdir paths so far: %d\n", inodeCount); + } + if (child.isDir()) { + pending.add(new Path(parent, localName)); + } + } + } catch (IOException e) { + printStackTrace(e); + return -1; } - } - if (!thisListing.hasMore()) { - break; - } else { - //System.out.println("Remaining entries " + Integer.toString(thisListing.getRemainingEntries())); - } - thisListing = dfsClient_.listPaths(parent, thisListing.getLastName()); - } while (thisListing != null); - } catch (IOException e) { - e.printStackTrace(); - return -1; - } + } + + final Date zigma = new Date(); + System.out.printf( + "Client: Directory walk done over %d inodes in %d msec\n", + inodeCount, timeDiffMilliSec(alpha, zigma) + ); + return 0; } - Date zigma = new Date(); - System.out.printf("Client: Directory walk done over %d inodes in %d msec\n", inodeCount, timeDiffMilliSec(alpha, zigma)); - return 0; - } - - private static int removeDFSPaths() - { - String rmPath = new String(TEST_BASE_DIR) + "/" + hostName_ + "_" + processName_; - - System.out.printf("Deleting %s ...\n", rmPath); - - int countLeaf = (int) Math.round(Math.pow(inodesPerLevel_, levels_)); - int[] leafIdxRangeForDel = new int[countLeaf]; - for(int i=0;i 0) { - path = prefix_ + delta + "/" + path; - } else { - path = prefix_ + delta; - } - } - dfsClient_.delete(rmPath + "/" + path,true); - } - dfsClient_.delete(rmPath, true); - } catch(IOException e) { - e.printStackTrace(); - return -1; + private static int removeDFSPaths() { + final Path rmPath + = new Path(TEST_BASE_DIR, hostName_ + "_" + processName_); + + System.out.printf("Deleting %s ...\n", rmPath.toString()); + + final int countLeaf + = (int) Math.round(Math.pow(inodesPerLevel_, levels_)); + final List leafIdxRangeForDel = new ArrayList(countLeaf); + for (int i = 0; i < countLeaf; i++) { + leafIdxRangeForDel.add(i); + } + Collections.shuffle(leafIdxRangeForDel); + + final Date alpha = new Date(); + try { + for (int idx : leafIdxRangeForDel) { + Path path = null; + for (int lev = 0; lev < levels_; lev++) { + int delta = idx % inodesPerLevel_; + idx /= inodesPerLevel_; + if (path != null) { + path = new Path(prefix_ + delta, path); + } else { + path = new Path(prefix_ + delta); + } + } + dfs_.delete(new Path(rmPath, path), true); + } + dfs_.delete(rmPath, true); + } catch (IOException e) { + printStackTrace(e); + return -1; + } + final Date zigma = new Date(); 
+ System.out.printf( + "Client: Deleted %s. Delete took %d msec\n", + rmPath, timeDiffMilliSec(alpha, zigma) + ); + return 0; } - Date zigma = new Date(); - System.out.printf("Client: Deleted %s. Delete took %d msec\n", rmPath, timeDiffMilliSec(alpha, zigma)); - return 0; - } } diff --git a/src/cc/access/qfs_access_jni.cc b/src/cc/access/qfs_access_jni.cc index a522bd3a4..c1392d1fb 100644 --- a/src/cc/access/qfs_access_jni.cc +++ b/src/cc/access/qfs_access_jni.cc @@ -31,236 +31,228 @@ typedef int64_t __int64; #endif #include -#include -#include -#include #include -#include -#include +#include #include -using std::vector; -using std::string; -using std::cout; -using std::endl; -using std::ostringstream; - -#include #include "libclient/KfsClient.h" + using namespace KFS; +using std::vector; +using std::string; extern "C" { - jlong Java_com_quantcast_qfs_access_KfsAccess_initF( + jlong Java_com_quantcast_qfs_access_KfsAccessBase_initF( JNIEnv *jenv, jclass jcls, jstring jpath); - jlong Java_com_quantcast_qfs_access_KfsAccess_initS( + jlong Java_com_quantcast_qfs_access_KfsAccessBase_initS( JNIEnv *jenv, jclass jcls, jstring jmetaServerHost, jint metaServerPort); - void Java_com_quantcast_qfs_access_KfsAccess_destroy( + void Java_com_quantcast_qfs_access_KfsAccessBase_destroy( JNIEnv *jenv, jclass jcls, jlong jptr); - jint Java_com_quantcast_qfs_access_KfsAccess_cd( + jint Java_com_quantcast_qfs_access_KfsAccessBase_cd( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath); - jint Java_com_quantcast_qfs_access_KfsAccess_mkdir( + jint Java_com_quantcast_qfs_access_KfsAccessBase_mkdir( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jint mode); - jint Java_com_quantcast_qfs_access_KfsAccess_mkdirs( + jint Java_com_quantcast_qfs_access_KfsAccessBase_mkdirs( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jint mode); - jint Java_com_quantcast_qfs_access_KfsAccess_rmdir( + jint Java_com_quantcast_qfs_access_KfsAccessBase_rmdir( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath); - jint Java_com_quantcast_qfs_access_KfsAccess_compareChunkReplicas( + jint Java_com_quantcast_qfs_access_KfsAccessBase_compareChunkReplicas( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jobject stringbuffermd5); - jint Java_com_quantcast_qfs_access_KfsAccess_rmdirs( + jint Java_com_quantcast_qfs_access_KfsAccessBase_rmdirs( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath); - jobjectArray Java_com_quantcast_qfs_access_KfsAccess_readdirplus( + jobjectArray Java_com_quantcast_qfs_access_KfsAccessBase_readdirplus( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath); - jobjectArray Java_com_quantcast_qfs_access_KfsAccess_readdir( + jobjectArray Java_com_quantcast_qfs_access_KfsAccessBase_readdir( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jboolean jpreloadattr); - jint Java_com_quantcast_qfs_access_KfsAccess_remove( + jint Java_com_quantcast_qfs_access_KfsAccessBase_remove( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath); - jint Java_com_quantcast_qfs_access_KfsAccess_rename( + jint Java_com_quantcast_qfs_access_KfsAccessBase_rename( JNIEnv *jenv, jclass jcls, jlong jptr, jstring joldpath, jstring jnewpath, jboolean joverwrite); - jint Java_com_quantcast_qfs_access_KfsAccess_symlink( + jint Java_com_quantcast_qfs_access_KfsAccessBase_symlink( JNIEnv *jenv, jclass jcls, jlong jptr, jstring joldpath, jstring jnewpath, jint jmode, jboolean joverwrite); - jint Java_com_quantcast_qfs_access_KfsAccess_exists( + jint Java_com_quantcast_qfs_access_KfsAccessBase_exists( JNIEnv *jenv, 
jclass jcls, jlong jptr, jstring jpath); - jint Java_com_quantcast_qfs_access_KfsAccess_isFile( + jint Java_com_quantcast_qfs_access_KfsAccessBase_isFile( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath); - jint Java_com_quantcast_qfs_access_KfsAccess_isDirectory( + jint Java_com_quantcast_qfs_access_KfsAccessBase_isDirectory( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath); - jlong Java_com_quantcast_qfs_access_KfsAccess_filesize( + jlong Java_com_quantcast_qfs_access_KfsAccessBase_filesize( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath); - jobjectArray Java_com_quantcast_qfs_access_KfsAccess_getDataLocation( + jobjectArray Java_com_quantcast_qfs_access_KfsAccessBase_getDataLocation( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jlong jstart, jlong jlen); - jobjectArray Java_com_quantcast_qfs_access_KfsAccess_getBlocksLocation( + jobjectArray Java_com_quantcast_qfs_access_KfsAccessBase_getBlocksLocation( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jlong jstart, jlong jlen); - jshort Java_com_quantcast_qfs_access_KfsAccess_getReplication( + jshort Java_com_quantcast_qfs_access_KfsAccessBase_getReplication( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath); - jshort Java_com_quantcast_qfs_access_KfsAccess_setReplication( + jshort Java_com_quantcast_qfs_access_KfsAccessBase_setReplication( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jint jnumReplicas); - jlong Java_com_quantcast_qfs_access_KfsAccess_getModificationTime( + jlong Java_com_quantcast_qfs_access_KfsAccessBase_getModificationTime( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath); - jint Java_com_quantcast_qfs_access_KfsAccess_setUTimes( + jint Java_com_quantcast_qfs_access_KfsAccessBase_setUTimes( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jlong jmtime_usec, jlong jatime_usec, jlong jctime_usec); - jint Java_com_quantcast_qfs_access_KfsAccess_open( + jint Java_com_quantcast_qfs_access_KfsAccessBase_open( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jstring jmode, jint jnumReplicas, jint jnumStripes, jint jnumRecoveryStripes, jint jstripeSize, jint jstripedType, jint jcreateMode); - jint Java_com_quantcast_qfs_access_KfsAccess_create( + jint Java_com_quantcast_qfs_access_KfsAccessBase_create( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jint jnumReplicas, jboolean jexclusive, jint jnumStripes, jint jnumRecoveryStripes, jint jstripeSize, jint jstripedType, jboolean foreceType, jint mode, jint jminSTier, jint jmaxSTier); - jint Java_com_quantcast_qfs_access_KfsAccess_create2( + jint Java_com_quantcast_qfs_access_KfsAccessBase_create2( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jboolean jexclusive, jstring jcreateParams); - jint Java_com_quantcast_qfs_access_KfsAccess_create2ex( + jint Java_com_quantcast_qfs_access_KfsAccessBase_create2ex( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jboolean jexclusive, jstring jcreateParams, jint jmode, jboolean jforceTypeFlag); - jlong Java_com_quantcast_qfs_access_KfsAccess_setDefaultIoBufferSize( + jlong Java_com_quantcast_qfs_access_KfsAccessBase_setDefaultIoBufferSize( JNIEnv *jenv, jclass jcls, jlong jptr, jlong jsize); - jlong Java_com_quantcast_qfs_access_KfsAccess_getDefaultIoBufferSize( + jlong Java_com_quantcast_qfs_access_KfsAccessBase_getDefaultIoBufferSize( JNIEnv *jenv, jclass jcls, jlong jptr); - jlong Java_com_quantcast_qfs_access_KfsAccess_setDefaultReadAheadSize( + jlong Java_com_quantcast_qfs_access_KfsAccessBase_setDefaultReadAheadSize( JNIEnv *jenv, jclass jcls, jlong jptr, 
jlong jsize); - jlong Java_com_quantcast_qfs_access_KfsAccess_getDefaultReadAheadSize( + jlong Java_com_quantcast_qfs_access_KfsAccessBase_getDefaultReadAheadSize( JNIEnv *jenv, jclass jcls, jlong jptr); - jlong Java_com_quantcast_qfs_access_KfsAccess_setIoBufferSize( + jlong Java_com_quantcast_qfs_access_KfsAccessBase_setIoBufferSize( JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd, jlong jsize); - jlong Java_com_quantcast_qfs_access_KfsAccess_getIoBufferSize( + jlong Java_com_quantcast_qfs_access_KfsAccessBase_getIoBufferSize( JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd); - jlong Java_com_quantcast_qfs_access_KfsAccess_setReadAheadSize( + jlong Java_com_quantcast_qfs_access_KfsAccessBase_setReadAheadSize( JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd, jlong jsize); - jlong Java_com_quantcast_qfs_access_KfsAccess_getReadAheadSize( + jlong Java_com_quantcast_qfs_access_KfsAccessBase_getReadAheadSize( JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd); - jint Java_com_quantcast_qfs_access_KfsAccess_getStripedType( + jint Java_com_quantcast_qfs_access_KfsAccessBase_getStripedType( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath); - void Java_com_quantcast_qfs_access_KfsAccess_setFileAttributeRevalidateTime( + void Java_com_quantcast_qfs_access_KfsAccessBase_setFileAttributeRevalidateTime( JNIEnv *jenv, jclass jcls, jlong jptr, jint secs); - jint Java_com_quantcast_qfs_access_KfsAccess_chmod( + jint Java_com_quantcast_qfs_access_KfsAccessBase_chmod( JNIEnv *jenv, jclass jcls, jlong jptr, jstring path, jint mode); - jint Java_com_quantcast_qfs_access_KfsAccess_chmodr( + jint Java_com_quantcast_qfs_access_KfsAccessBase_chmodr( JNIEnv *jenv, jclass jcls, jlong jptr, jstring path, jint mode); - jint Java_com_quantcast_qfs_access_KfsAccess_fchmod( + jint Java_com_quantcast_qfs_access_KfsAccessBase_fchmod( JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd, jint mode); - jint Java_com_quantcast_qfs_access_KfsAccess_chowns( + jint Java_com_quantcast_qfs_access_KfsAccessBase_chowns( JNIEnv *jenv, jclass jcls, jlong jptr, jstring path, jstring user, jstring group); - jint Java_com_quantcast_qfs_access_KfsAccess_chownsr( + jint Java_com_quantcast_qfs_access_KfsAccessBase_chownsr( JNIEnv *jenv, jclass jcls, jlong jptr, jstring path, jstring user, jstring group); - jint Java_com_quantcast_qfs_access_KfsAccess_chown( + jint Java_com_quantcast_qfs_access_KfsAccessBase_chown( JNIEnv *jenv, jclass jcls, jlong jptr, jstring path, jlong user, jlong group); - jint Java_com_quantcast_qfs_access_KfsAccess_chownr( + jint Java_com_quantcast_qfs_access_KfsAccessBase_chownr( JNIEnv *jenv, jclass jcls, jlong jptr, jstring path, jlong user, jlong group); - jint Java_com_quantcast_qfs_access_KfsAccess_fchowns( + jint Java_com_quantcast_qfs_access_KfsAccessBase_fchowns( JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd, jstring user, jstring group); - jint Java_com_quantcast_qfs_access_KfsAccess_fchown( + jint Java_com_quantcast_qfs_access_KfsAccessBase_fchown( JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd, jlong user, jlong group); - jint Java_com_quantcast_qfs_access_KfsAccess_setEUserAndEGroup( + jint Java_com_quantcast_qfs_access_KfsAccessBase_setEUserAndEGroup( JNIEnv *jenv, jclass jcls, jlong jptr, jlong user, jlong group, jlongArray); - jint Java_com_quantcast_qfs_access_KfsAccess_stat( + jint Java_com_quantcast_qfs_access_KfsAccessBase_stat( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jobject attr); - jint Java_com_quantcast_qfs_access_KfsAccess_lstat( + jint 
Java_com_quantcast_qfs_access_KfsAccessBase_lstat( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jobject attr); - jstring Java_com_quantcast_qfs_access_KfsAccess_strerror( + jstring Java_com_quantcast_qfs_access_KfsAccessBase_strerror( JNIEnv *jenv, jclass jcls, jlong jptr, jint jerr); - jboolean Java_com_quantcast_qfs_access_KfsAccess_isnotfound( + jboolean Java_com_quantcast_qfs_access_KfsAccessBase_isnotfound( JNIEnv *jenv, jclass jcls, jlong jptr, jint jerr); - jint Java_com_quantcast_qfs_access_KfsAccess_close( + jint Java_com_quantcast_qfs_access_KfsAccessBase_close( JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd); - jlong Java_com_quantcast_qfs_access_KfsAccess_seek( + jlong Java_com_quantcast_qfs_access_KfsAccessBase_seek( JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd, jlong joffset); - jlong Java_com_quantcast_qfs_access_KfsAccess_tell( + jlong Java_com_quantcast_qfs_access_KfsAccessBase_tell( JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd); - jint Java_com_quantcast_qfs_access_KfsAccess_getUMask( + jint Java_com_quantcast_qfs_access_KfsAccessBase_getUMask( JNIEnv *jenv, jclass jcls, jlong jptr); - jint Java_com_quantcast_qfs_access_KfsAccess_setUMask( + jint Java_com_quantcast_qfs_access_KfsAccessBase_setUMask( JNIEnv *jenv, jclass jcls, jlong jptr, jint umask); - jstring Java_com_quantcast_qfs_access_KfsAccess_createDelegationToken( + jstring Java_com_quantcast_qfs_access_KfsAccessBase_createDelegationToken( JNIEnv *jenv, jclass jcls, jlong jptr, jboolean allowDelegationFlag, jlong validTime, jobject result); - jstring Java_com_quantcast_qfs_access_KfsAccess_renewDelegationToken( + jstring Java_com_quantcast_qfs_access_KfsAccessBase_renewDelegationToken( JNIEnv *jenv, jclass jcls, jlong jptr, jobject token); - jstring Java_com_quantcast_qfs_access_KfsAccess_cancelDelegationToken( + jstring Java_com_quantcast_qfs_access_KfsAccessBase_cancelDelegationToken( JNIEnv *jenv, jclass jcls, jlong jptr, jobject token); - jobjectArray Java_com_quantcast_qfs_access_KfsAccess_getStats( + jobjectArray Java_com_quantcast_qfs_access_KfsAccessBase_getStats( JNIEnv *jenv, jclass jcls, jlong jptr); /* Input channel methods */ - jint Java_com_quantcast_qfs_access_KfsInputChannel_read( + jint Java_com_quantcast_qfs_access_KfsInputChannelBase_read( JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd, jobject buf, jint begin, jint end); - jint Java_com_quantcast_qfs_access_KfsInputChannel_close( + jint Java_com_quantcast_qfs_access_KfsInputChannelBase_close( JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd); /* Output channel methods */ - jint Java_com_quantcast_qfs_access_KfsOutputChannel_write( + jint Java_com_quantcast_qfs_access_KfsOutputChannelBase_write( JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd, jobject buf, jint begin, jint end); - jint Java_com_quantcast_qfs_access_KfsOutputChannel_atomicRecordAppend( + jint Java_com_quantcast_qfs_access_KfsOutputChannelBase_atomicRecordAppend( JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd, jobject buf, jint begin, jint end); - jint Java_com_quantcast_qfs_access_KfsOutputChannel_sync( + jint Java_com_quantcast_qfs_access_KfsOutputChannelBase_sync( JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd); - jint Java_com_quantcast_qfs_access_KfsOutputChannel_close( + jint Java_com_quantcast_qfs_access_KfsOutputChannelBase_close( JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd); } @@ -279,7 +271,7 @@ namespace } } -jlong Java_com_quantcast_qfs_access_KfsAccess_initF( +jlong Java_com_quantcast_qfs_access_KfsAccessBase_initF( JNIEnv *jenv, jclass jcls, 
jstring jpath) { string path; @@ -288,13 +280,13 @@ jlong Java_com_quantcast_qfs_access_KfsAccess_initF( return (jlong) clnt; } -jint Java_com_quantcast_qfs_access_KfsAccess_compareChunkReplicas( +jint Java_com_quantcast_qfs_access_KfsAccessBase_compareChunkReplicas( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jobject stringbuffermd5) { if (! jptr) { return -EFAULT; } - string path , md5Sum; + string path, md5Sum; setStr(path, jenv, jpath); KfsClient* const clnt = (KfsClient *) jptr; @@ -316,7 +308,7 @@ jint Java_com_quantcast_qfs_access_KfsAccess_compareChunkReplicas( return res; } -jlong Java_com_quantcast_qfs_access_KfsAccess_initS( +jlong Java_com_quantcast_qfs_access_KfsAccessBase_initS( JNIEnv *jenv, jclass jcls, jstring jmetaServerHost, jint metaServerPort) { string path; @@ -325,14 +317,14 @@ jlong Java_com_quantcast_qfs_access_KfsAccess_initS( return (jlong) clnt; } -void Java_com_quantcast_qfs_access_KfsAccess_destroy( +void Java_com_quantcast_qfs_access_KfsAccessBase_destroy( JNIEnv *jenv, jclass jcls, jlong jptr) { KfsClient* const clnt = (KfsClient*)jptr; delete clnt; } -jint Java_com_quantcast_qfs_access_KfsAccess_cd( +jint Java_com_quantcast_qfs_access_KfsAccessBase_cd( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath) { if (! jptr) { @@ -345,7 +337,7 @@ jint Java_com_quantcast_qfs_access_KfsAccess_cd( return clnt->Cd(path.c_str()); } -jint Java_com_quantcast_qfs_access_KfsAccess_mkdir( +jint Java_com_quantcast_qfs_access_KfsAccessBase_mkdir( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jint mode) { if (! jptr) { @@ -358,7 +350,7 @@ jint Java_com_quantcast_qfs_access_KfsAccess_mkdir( return clnt->Mkdir(path.c_str(), (kfsMode_t)mode); } -jint Java_com_quantcast_qfs_access_KfsAccess_mkdirs( +jint Java_com_quantcast_qfs_access_KfsAccessBase_mkdirs( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jint mode) { if (! jptr) { @@ -371,7 +363,7 @@ jint Java_com_quantcast_qfs_access_KfsAccess_mkdirs( return clnt->Mkdirs(path.c_str(), (kfsMode_t)mode); } -jint Java_com_quantcast_qfs_access_KfsAccess_rmdir( +jint Java_com_quantcast_qfs_access_KfsAccessBase_rmdir( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath) { if (! jptr) { @@ -384,7 +376,7 @@ jint Java_com_quantcast_qfs_access_KfsAccess_rmdir( return clnt->Rmdir(path.c_str()); } -jint Java_com_quantcast_qfs_access_KfsAccess_rmdirs( +jint Java_com_quantcast_qfs_access_KfsAccessBase_rmdirs( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath) { if (! jptr) { @@ -397,7 +389,7 @@ jint Java_com_quantcast_qfs_access_KfsAccess_rmdirs( return clnt->Rmdirs(path.c_str()); } -jobjectArray Java_com_quantcast_qfs_access_KfsAccess_readdir( +jobjectArray Java_com_quantcast_qfs_access_KfsAccessBase_readdir( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jboolean jpreloadattr) { if (! jptr) { @@ -439,7 +431,7 @@ jobjectArray Java_com_quantcast_qfs_access_KfsAccess_readdir( return jentries; } -jint Java_com_quantcast_qfs_access_KfsAccess_open( +jint Java_com_quantcast_qfs_access_KfsAccessBase_open( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jstring jmode, jint jnumReplicas, jint jnumStripes, jint jnumRecoveryStripes, jint jstripeSize, jint jstripedType, jint jcreateMode) @@ -471,7 +463,7 @@ jint Java_com_quantcast_qfs_access_KfsAccess_open( jnumStripes, jnumRecoveryStripes, jstripeSize, jstripedType, jcreateMode); } -jint Java_com_quantcast_qfs_access_KfsInputChannel_close( +jint Java_com_quantcast_qfs_access_KfsInputChannelBase_close( JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd) { if (! 
jptr) { @@ -482,7 +474,7 @@ jint Java_com_quantcast_qfs_access_KfsInputChannel_close( return clnt->Close(jfd); } -jint Java_com_quantcast_qfs_access_KfsOutputChannel_close( +jint Java_com_quantcast_qfs_access_KfsOutputChannelBase_close( JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd) { if (! jptr) { @@ -493,7 +485,7 @@ jint Java_com_quantcast_qfs_access_KfsOutputChannel_close( return clnt->Close(jfd); } -jint Java_com_quantcast_qfs_access_KfsAccess_create( +jint Java_com_quantcast_qfs_access_KfsAccessBase_create( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jint jnumReplicas, jboolean jexclusive, jint jnumStripes, jint jnumRecoveryStripes, jint jstripeSize, jint jstripedType, jboolean foreceType, jint mode, jint minSTier, jint maxSTier) @@ -510,7 +502,7 @@ jint Java_com_quantcast_qfs_access_KfsAccess_create( (kfsMode_t)mode, (kfsSTier_t)minSTier, (kfsSTier_t)maxSTier); } -jint Java_com_quantcast_qfs_access_KfsAccess_create2( +jint Java_com_quantcast_qfs_access_KfsAccessBase_create2( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jboolean jexclusive, jstring jcreateParams) { @@ -525,7 +517,7 @@ jint Java_com_quantcast_qfs_access_KfsAccess_create2( return clnt->Create(path.c_str(), (bool) jexclusive, createParams.c_str()); } -jint Java_com_quantcast_qfs_access_KfsAccess_create2ex( +jint Java_com_quantcast_qfs_access_KfsAccessBase_create2ex( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jboolean jexclusive, jstring jcreateParams, jint jmode, jboolean jforceTypeFlag) { @@ -541,7 +533,7 @@ jint Java_com_quantcast_qfs_access_KfsAccess_create2ex( (kfsMode_t)jmode, (bool)jforceTypeFlag); } -jint Java_com_quantcast_qfs_access_KfsAccess_remove( +jint Java_com_quantcast_qfs_access_KfsAccessBase_remove( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath) { if (! jptr) { @@ -554,7 +546,7 @@ jint Java_com_quantcast_qfs_access_KfsAccess_remove( return clnt->Remove(path.c_str()); } -jint Java_com_quantcast_qfs_access_KfsAccess_rename( +jint Java_com_quantcast_qfs_access_KfsAccessBase_rename( JNIEnv *jenv, jclass jcls, jlong jptr, jstring joldpath, jstring jnewpath, jboolean joverwrite) { @@ -570,7 +562,7 @@ jint Java_com_quantcast_qfs_access_KfsAccess_rename( return clnt->Rename(opath.c_str(), npath.c_str(), joverwrite); } -jint Java_com_quantcast_qfs_access_KfsAccess_symlink( +jint Java_com_quantcast_qfs_access_KfsAccessBase_symlink( JNIEnv *jenv, jclass jcls, jlong jptr, jstring target, jstring linkpath, jint jmode, jboolean joverwrite) { @@ -586,7 +578,7 @@ jint Java_com_quantcast_qfs_access_KfsAccess_symlink( return clnt->Symlink(starget.c_str(), slinkpath.c_str(), jmode, joverwrite); } -jlong Java_com_quantcast_qfs_access_KfsAccess_setDefaultIoBufferSize( +jlong Java_com_quantcast_qfs_access_KfsAccessBase_setDefaultIoBufferSize( JNIEnv *jenv, jclass jcls, jlong jptr, jlong jsize) { if (! jptr) { @@ -597,7 +589,7 @@ jlong Java_com_quantcast_qfs_access_KfsAccess_setDefaultIoBufferSize( return (jlong)clnt->SetDefaultIoBufferSize(jsize); } -jlong Java_com_quantcast_qfs_access_KfsAccess_getDefaultIoBufferSize( +jlong Java_com_quantcast_qfs_access_KfsAccessBase_getDefaultIoBufferSize( JNIEnv *jenv, jclass jcls, jlong jptr) { if (! jptr) { @@ -608,7 +600,7 @@ jlong Java_com_quantcast_qfs_access_KfsAccess_getDefaultIoBufferSize( return (jlong)clnt->GetDefaultIoBufferSize(); } -jlong Java_com_quantcast_qfs_access_KfsAccess_setDefaultReadAheadSize( +jlong Java_com_quantcast_qfs_access_KfsAccessBase_setDefaultReadAheadSize( JNIEnv *jenv, jclass jcls, jlong jptr, jlong jsize) { if (! 
jptr) { @@ -619,7 +611,7 @@ jlong Java_com_quantcast_qfs_access_KfsAccess_setDefaultReadAheadSize( return (jlong)clnt->SetDefaultReadAheadSize(jsize); } -jlong Java_com_quantcast_qfs_access_KfsAccess_getDefaultReadAheadSize( +jlong Java_com_quantcast_qfs_access_KfsAccessBase_getDefaultReadAheadSize( JNIEnv *jenv, jclass jcls, jlong jptr) { if (! jptr) { @@ -630,7 +622,7 @@ jlong Java_com_quantcast_qfs_access_KfsAccess_getDefaultReadAheadSize( return (jlong)clnt->GetDefaultReadAheadSize(); } -jlong Java_com_quantcast_qfs_access_KfsAccess_setIoBufferSize( +jlong Java_com_quantcast_qfs_access_KfsAccessBase_setIoBufferSize( JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd, jlong jsize) { if (! jptr) { @@ -641,7 +633,7 @@ jlong Java_com_quantcast_qfs_access_KfsAccess_setIoBufferSize( return (jlong)clnt->SetIoBufferSize(jfd, jsize); } -jlong Java_com_quantcast_qfs_access_KfsAccess_getIoBufferSize( +jlong Java_com_quantcast_qfs_access_KfsAccessBase_getIoBufferSize( JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd) { if (! jptr) { @@ -652,7 +644,7 @@ jlong Java_com_quantcast_qfs_access_KfsAccess_getIoBufferSize( return (jlong)clnt->GetIoBufferSize(jfd); } -jlong Java_com_quantcast_qfs_access_KfsAccess_setReadAheadSize( +jlong Java_com_quantcast_qfs_access_KfsAccessBase_setReadAheadSize( JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd, jlong jsize) { if (! jptr) { @@ -663,7 +655,7 @@ jlong Java_com_quantcast_qfs_access_KfsAccess_setReadAheadSize( return (jlong)clnt->SetReadAheadSize(jfd, jsize); } -jlong Java_com_quantcast_qfs_access_KfsAccess_getReadAheadSize( +jlong Java_com_quantcast_qfs_access_KfsAccessBase_getReadAheadSize( JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd) { if (! jptr) { @@ -674,7 +666,7 @@ jlong Java_com_quantcast_qfs_access_KfsAccess_getReadAheadSize( return (jlong)clnt->GetReadAheadSize(jfd); } -jint Java_com_quantcast_qfs_access_KfsAccess_getStripedType( +jint Java_com_quantcast_qfs_access_KfsAccessBase_getStripedType( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath) { if (! jptr) { @@ -690,7 +682,7 @@ jint Java_com_quantcast_qfs_access_KfsAccess_getStripedType( KFS_STRIPED_FILE_TYPE_UNKNOWN : attr.striperType); } -void Java_com_quantcast_qfs_access_KfsAccess_setFileAttributeRevalidateTime( +void Java_com_quantcast_qfs_access_KfsAccessBase_setFileAttributeRevalidateTime( JNIEnv *jenv, jclass jcls, jlong jptr, jint secs) { if (! jptr) { @@ -700,7 +692,7 @@ void Java_com_quantcast_qfs_access_KfsAccess_setFileAttributeRevalidateTime( clnt->SetFileAttributeRevalidateTime(secs); } -jint Java_com_quantcast_qfs_access_KfsAccess_chmod( +jint Java_com_quantcast_qfs_access_KfsAccessBase_chmod( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jint mode) { if (! jptr) { @@ -712,7 +704,7 @@ jint Java_com_quantcast_qfs_access_KfsAccess_chmod( return clnt->Chmod(path.c_str(), (kfsMode_t)mode); } -jint Java_com_quantcast_qfs_access_KfsAccess_chmodr( +jint Java_com_quantcast_qfs_access_KfsAccessBase_chmodr( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jint mode) { if (! jptr) { @@ -724,7 +716,7 @@ jint Java_com_quantcast_qfs_access_KfsAccess_chmodr( return clnt->ChmodR(path.c_str(), (kfsMode_t)mode); } -jint Java_com_quantcast_qfs_access_KfsAccess_fchmod( +jint Java_com_quantcast_qfs_access_KfsAccessBase_fchmod( JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd, jint mode) { if (! 
jptr) { @@ -734,7 +726,7 @@ jint Java_com_quantcast_qfs_access_KfsAccess_fchmod( return clnt->Chmod(jfd, (kfsMode_t)mode); } -jint Java_com_quantcast_qfs_access_KfsAccess_chowns( +jint Java_com_quantcast_qfs_access_KfsAccessBase_chowns( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jstring juser, jstring jgroup) { if (! jptr) { @@ -754,7 +746,7 @@ jint Java_com_quantcast_qfs_access_KfsAccess_chowns( return clnt->Chown(path.c_str(), user.c_str(), group.c_str()); } -jint Java_com_quantcast_qfs_access_KfsAccess_chownsr( +jint Java_com_quantcast_qfs_access_KfsAccessBase_chownsr( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jstring juser, jstring jgroup) { if (! jptr) { @@ -774,7 +766,7 @@ jint Java_com_quantcast_qfs_access_KfsAccess_chownsr( return clnt->ChownR(path.c_str(), user.c_str(), group.c_str()); } -jint Java_com_quantcast_qfs_access_KfsAccess_chown( +jint Java_com_quantcast_qfs_access_KfsAccessBase_chown( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jlong user, jlong group) { if (! jptr) { @@ -786,7 +778,7 @@ jint Java_com_quantcast_qfs_access_KfsAccess_chown( return clnt->Chown(path.c_str(), (kfsUid_t)user, (kfsGid_t)group); } -jint Java_com_quantcast_qfs_access_KfsAccess_chownr( +jint Java_com_quantcast_qfs_access_KfsAccessBase_chownr( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jlong user, jlong group) { if (! jptr) { @@ -798,7 +790,7 @@ jint Java_com_quantcast_qfs_access_KfsAccess_chownr( return clnt->ChownR(path.c_str(), (kfsUid_t)user, (kfsGid_t)group); } -jint Java_com_quantcast_qfs_access_KfsAccess_chownR( +jint Java_com_quantcast_qfs_access_KfsAccessBase_chownR( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jlong user, jlong group) { if (! jptr) { @@ -810,7 +802,7 @@ jint Java_com_quantcast_qfs_access_KfsAccess_chownR( return clnt->ChownR(path.c_str(), (kfsUid_t)user, (kfsGid_t)group); } -jint Java_com_quantcast_qfs_access_KfsAccess_fchowns( +jint Java_com_quantcast_qfs_access_KfsAccessBase_fchowns( JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd, jstring juser, jstring jgroup) { if (! jptr) { @@ -828,7 +820,7 @@ jint Java_com_quantcast_qfs_access_KfsAccess_fchowns( return clnt->Chown(jfd, user.c_str(), group.c_str()); } -jint Java_com_quantcast_qfs_access_KfsAccess_fchown( +jint Java_com_quantcast_qfs_access_KfsAccessBase_fchown( JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd, jlong user, jlong group) { if (! jptr) { @@ -838,7 +830,7 @@ jint Java_com_quantcast_qfs_access_KfsAccess_fchown( return clnt->Chown(jfd, (kfsUid_t)user, (kfsGid_t)group); } -jint Java_com_quantcast_qfs_access_KfsOutputChannel_sync( +jint Java_com_quantcast_qfs_access_KfsOutputChannelBase_sync( JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd) { if (! jptr) { @@ -848,7 +840,7 @@ jint Java_com_quantcast_qfs_access_KfsOutputChannel_sync( return clnt->Sync(jfd); } -jint Java_com_quantcast_qfs_access_KfsAccess_setEUserAndEGroup( +jint Java_com_quantcast_qfs_access_KfsAccessBase_setEUserAndEGroup( JNIEnv *jenv, jclass jcls, jlong jptr, jlong user, jlong group, jlongArray jgroups) { if (! jptr) { @@ -872,7 +864,7 @@ jint Java_com_quantcast_qfs_access_KfsAccess_setEUserAndEGroup( return ret; } -jint Java_com_quantcast_qfs_access_KfsAccess_exists( +jint Java_com_quantcast_qfs_access_KfsAccessBase_exists( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath) { if (! jptr) { @@ -885,7 +877,7 @@ jint Java_com_quantcast_qfs_access_KfsAccess_exists( return (clnt->Exists(path.c_str()) ? 
1 : 0); } -jint Java_com_quantcast_qfs_access_KfsAccess_isFile( +jint Java_com_quantcast_qfs_access_KfsAccessBase_isFile( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath) { if (! jptr) { @@ -899,7 +891,7 @@ jint Java_com_quantcast_qfs_access_KfsAccess_isFile( return (clnt->IsFile(path.c_str()) ? 1 : 0); } -jint Java_com_quantcast_qfs_access_KfsAccess_isDirectory( +jint Java_com_quantcast_qfs_access_KfsAccessBase_isDirectory( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath) { if (! jptr) { @@ -913,7 +905,7 @@ jint Java_com_quantcast_qfs_access_KfsAccess_isDirectory( return (clnt->IsDirectory(path.c_str()) ? 1 : 0); } -jlong Java_com_quantcast_qfs_access_KfsAccess_filesize( +jlong Java_com_quantcast_qfs_access_KfsAccessBase_filesize( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath) { if (! jptr) { @@ -932,7 +924,7 @@ jlong Java_com_quantcast_qfs_access_KfsAccess_filesize( return attr.fileSize; } -jlong Java_com_quantcast_qfs_access_KfsAccess_getModificationTime( +jlong Java_com_quantcast_qfs_access_KfsAccessBase_getModificationTime( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath) { if (! jptr) { @@ -951,7 +943,7 @@ jlong Java_com_quantcast_qfs_access_KfsAccess_getModificationTime( return ((jlong) attr.mtime.tv_sec) * 1000 + (jlong) (attr.mtime.tv_usec / 1000); } -jint Java_com_quantcast_qfs_access_KfsAccess_setUTimes( +jint Java_com_quantcast_qfs_access_KfsAccessBase_setUTimes( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jlong jmtime_usec, jlong jatime_usec, jlong jctime_usec) { if (! jptr) { @@ -1025,7 +1017,7 @@ static jobjectArray CreateLocations( return jentries; } -jobjectArray Java_com_quantcast_qfs_access_KfsAccess_getDataLocation( +jobjectArray Java_com_quantcast_qfs_access_KfsAccessBase_getDataLocation( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jlong jstart, jlong jlen) { if (! jptr) { @@ -1043,7 +1035,7 @@ jobjectArray Java_com_quantcast_qfs_access_KfsAccess_getDataLocation( return CreateLocations(jenv, entries, 0); } -jobjectArray Java_com_quantcast_qfs_access_KfsAccess_getBlocksLocation( +jobjectArray Java_com_quantcast_qfs_access_KfsAccessBase_getBlocksLocation( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jlong jstart, jlong jlen) { if (! jptr) { @@ -1081,7 +1073,7 @@ jobjectArray Java_com_quantcast_qfs_access_KfsAccess_getBlocksLocation( return CreateLocations(jenv, entries, ptr); } -jshort Java_com_quantcast_qfs_access_KfsAccess_getReplication( +jshort Java_com_quantcast_qfs_access_KfsAccessBase_getReplication( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath) { if (! jptr) { @@ -1094,7 +1086,7 @@ jshort Java_com_quantcast_qfs_access_KfsAccess_getReplication( return clnt->GetReplicationFactor(path.c_str()); } -jshort Java_com_quantcast_qfs_access_KfsAccess_setReplication( +jshort Java_com_quantcast_qfs_access_KfsAccessBase_setReplication( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jint jnumReplicas) { if (! jptr) { @@ -1107,7 +1099,7 @@ jshort Java_com_quantcast_qfs_access_KfsAccess_setReplication( return clnt->SetReplicationFactor(path.c_str(), jnumReplicas); } -static jint Java_com_quantcast_qfs_access_KfsAccess_xstat(bool lstat_flag, +static jint Java_com_quantcast_qfs_access_KfsAccessBase_xstat(bool lstat_flag, JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jobject attr) { if (! 
jptr) { @@ -1291,32 +1283,32 @@ static jint Java_com_quantcast_qfs_access_KfsAccess_xstat(bool lstat_flag, return 0; } -jint Java_com_quantcast_qfs_access_KfsAccess_stat( +jint Java_com_quantcast_qfs_access_KfsAccessBase_stat( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jobject attr) { - return Java_com_quantcast_qfs_access_KfsAccess_xstat(false, jenv, jcls, jptr, jpath, attr); + return Java_com_quantcast_qfs_access_KfsAccessBase_xstat(false, jenv, jcls, jptr, jpath, attr); } -jint Java_com_quantcast_qfs_access_KfsAccess_lstat( +jint Java_com_quantcast_qfs_access_KfsAccessBase_lstat( JNIEnv *jenv, jclass jcls, jlong jptr, jstring jpath, jobject attr) { - return Java_com_quantcast_qfs_access_KfsAccess_xstat(true, jenv, jcls, jptr, jpath, attr); + return Java_com_quantcast_qfs_access_KfsAccessBase_xstat(true, jenv, jcls, jptr, jpath, attr); } -jstring Java_com_quantcast_qfs_access_KfsAccess_strerror( +jstring Java_com_quantcast_qfs_access_KfsAccessBase_strerror( JNIEnv *jenv, jclass jcls, jlong jptr, jint jerr) { const string str = KFS::ErrorCodeToStr((int)jerr); return jenv->NewStringUTF(str.c_str()); } -jboolean Java_com_quantcast_qfs_access_KfsAccess_isnotfound( +jboolean Java_com_quantcast_qfs_access_KfsAccessBase_isnotfound( JNIEnv *jenv, jclass jcls, jlong jptr, jint jerr) { return (jboolean)(jerr == -ENOENT || jerr == -ENOTDIR); } -jint Java_com_quantcast_qfs_access_KfsAccess_close( +jint Java_com_quantcast_qfs_access_KfsAccessBase_close( JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd) { if (! jptr) { @@ -1326,7 +1318,7 @@ jint Java_com_quantcast_qfs_access_KfsAccess_close( return clnt->Close(jfd); } -jlong Java_com_quantcast_qfs_access_KfsAccess_seek( +jlong Java_com_quantcast_qfs_access_KfsAccessBase_seek( JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd, jlong joffset) { if (! jptr) { @@ -1336,7 +1328,7 @@ jlong Java_com_quantcast_qfs_access_KfsAccess_seek( return (jlong)clnt->Seek(jfd, joffset); } -jlong Java_com_quantcast_qfs_access_KfsAccess_tell( +jlong Java_com_quantcast_qfs_access_KfsAccessBase_tell( JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd) { if (! jptr) { @@ -1346,7 +1338,7 @@ jlong Java_com_quantcast_qfs_access_KfsAccess_tell( return (jlong)clnt->Tell(jfd); } -jint Java_com_quantcast_qfs_access_KfsAccess_getUMask( +jint Java_com_quantcast_qfs_access_KfsAccessBase_getUMask( JNIEnv *jenv, jclass jcls, jlong jptr) { if (! jptr) { @@ -1356,7 +1348,7 @@ jint Java_com_quantcast_qfs_access_KfsAccess_getUMask( return (jint)(clnt->GetUMask() & 0777); } -jint Java_com_quantcast_qfs_access_KfsAccess_setUMask( +jint Java_com_quantcast_qfs_access_KfsAccessBase_setUMask( JNIEnv *jenv, jclass jcls, jlong jptr, jint umask) { if (! 
jptr) { @@ -1367,7 +1359,7 @@ jint Java_com_quantcast_qfs_access_KfsAccess_setUMask( return 0; } -jstring Java_com_quantcast_qfs_access_KfsAccess_createDelegationToken( +jstring Java_com_quantcast_qfs_access_KfsAccessBase_createDelegationToken( JNIEnv *jenv, jclass jcls, jlong jptr, jboolean allowDelegationFlag, jlong validTime, jobject result) { @@ -1453,7 +1445,7 @@ jstring Java_com_quantcast_qfs_access_KfsAccess_createDelegationToken( return 0; } -jstring Java_com_quantcast_qfs_access_KfsAccess_renewDelegationToken( +jstring Java_com_quantcast_qfs_access_KfsAccessBase_renewDelegationToken( JNIEnv *jenv, jclass jcls, jlong jptr, jobject token) { @@ -1547,7 +1539,7 @@ jstring Java_com_quantcast_qfs_access_KfsAccess_renewDelegationToken( return 0; } -jstring Java_com_quantcast_qfs_access_KfsAccess_cancelDelegationToken( +jstring Java_com_quantcast_qfs_access_KfsAccessBase_cancelDelegationToken( JNIEnv *jenv, jclass jcls, jlong jptr, jobject token) { @@ -1598,7 +1590,7 @@ jstring Java_com_quantcast_qfs_access_KfsAccess_cancelDelegationToken( return 0; } -jobjectArray Java_com_quantcast_qfs_access_KfsAccess_getStats( +jobjectArray Java_com_quantcast_qfs_access_KfsAccessBase_getStats( JNIEnv *jenv, jclass jcls, jlong jptr) { if (! jptr) { @@ -1636,7 +1628,7 @@ jobjectArray Java_com_quantcast_qfs_access_KfsAccess_getStats( return jentries; } -jint Java_com_quantcast_qfs_access_KfsInputChannel_read( +jint Java_com_quantcast_qfs_access_KfsInputChannelBase_read( JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd, jobject buf, jint begin, jint end) { if (! jptr) { @@ -1662,7 +1654,7 @@ jint Java_com_quantcast_qfs_access_KfsInputChannel_read( return (jint)sz; } -jint Java_com_quantcast_qfs_access_KfsOutputChannel_write( +jint Java_com_quantcast_qfs_access_KfsOutputChannelBase_write( JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd, jobject buf, jint begin, jint end) { if (! jptr) { @@ -1688,7 +1680,7 @@ jint Java_com_quantcast_qfs_access_KfsOutputChannel_write( return (jint)sz; } -jint Java_com_quantcast_qfs_access_KfsOutputChannel_atomicRecordAppend( +jint Java_com_quantcast_qfs_access_KfsOutputChannelBase_atomicRecordAppend( JNIEnv *jenv, jclass jcls, jlong jptr, jint jfd, jobject buf, jint begin, jint end) { if (! jptr) { diff --git a/src/cc/common/LinearHash.h b/src/cc/common/LinearHash.h index 7e743db9b..f1df064d2 100644 --- a/src/cc/common/LinearHash.h +++ b/src/cc/common/LinearHash.h @@ -379,7 +379,7 @@ class LinearHash {} KeyValT* Next() { - return mHashTable.template NextEntryT( + return mHashTable.template NextEntryT<>( mNextBucketIdx, mNextEntryPtr, static_cast<KeyValT*>(0)); } private: diff --git a/src/cc/common/StBuffer.h b/src/cc/common/StBuffer.h index adf841a1f..c6331eb39 100644 --- a/src/cc/common/StBuffer.h +++ b/src/cc/common/StBuffer.h @@ -188,7 +188,7 @@ class StBufferT } } } - swap(mCapacity, inBuf.capacity); + swap(mCapacity, inBuf.mCapacity); swap(mSize, inBuf.mSize); } void Clear() diff --git a/src/cc/common/buildversgit.sh b/src/cc/common/buildversgit.sh index d01c79f78..a1ff14014 100755 --- a/src/cc/common/buildversgit.sh +++ b/src/cc/common/buildversgit.sh @@ -19,7 +19,7 @@ # permissions and limitations under the License.
# default version to use if git is not available -qfs_no_git_version="2.2.7" +qfs_no_git_version="2.2.8" usage() { echo " diff --git a/src/cc/common/kfstypes.h b/src/cc/common/kfstypes.h index 83e861dcb..60fad6719 100644 --- a/src/cc/common/kfstypes.h +++ b/src/cc/common/kfstypes.h @@ -6,7 +6,7 @@ // Created 2006/10/20 // Author: Sriram Rao // -// Copyright 2008-2012,2016 Quantcast Corporation. All rights reserved. +// Copyright 2008-2025 Quantcast Corporation. All rights reserved. // Copyright 2006-2008 Kosmix Corp. // // This file is part of Kosmos File System (KFS). @@ -205,6 +205,12 @@ typedef uint16_t FileAttrExtTypes; // Extended attributes type. const int KFS_SYMLOOP_MAX = 32; -} +#if defined(__cplusplus) && __cplusplus >= 201103L + // C++11 or later. +# define KFS_CONSTEXPR constexpr +#else +# define KFS_CONSTEXPR +#endif +} // namespace KFS #endif // COMMON_KFSTYPES_H diff --git a/src/cc/kfsio/Base64.h b/src/cc/kfsio/Base64.h index f9eee3d84..68891f4d8 100644 --- a/src/cc/kfsio/Base64.h +++ b/src/cc/kfsio/Base64.h @@ -4,7 +4,7 @@ // Created 2013/9/9 // Author: Mike Ovsiannikov // -// Copyright 2013,2016 Quantcast Corporation. All rights reserved. +// Copyright 2013-2025 Quantcast Corporation. All rights reserved. // // This file is part of Kosmos File System (KFS). // @@ -26,19 +26,21 @@ #ifndef KFSIO_BASE64_H #define KFSIO_BASE64_H +#include "common/kfstypes.h" + namespace KFS { class Base64 { public: - static int EncodedLength( + KFS_CONSTEXPR static int EncodedLength( int inLength) { return ((inLength + 2) / 3 * 4); } - static int GetEncodedMaxBufSize( + KFS_CONSTEXPR static int GetEncodedMaxBufSize( int inLength) { return EncodedLength(inLength) + 1; } - static int GetMaxDecodedLength( + KFS_CONSTEXPR static int GetMaxDecodedLength( int inLength) { return ((inLength + 3) / 4 * 3); } static int Encode( diff --git a/src/cc/kfsio/CryptoKeys.h b/src/cc/kfsio/CryptoKeys.h index 71d0bea09..c2e3fe0f6 100644 --- a/src/cc/kfsio/CryptoKeys.h +++ b/src/cc/kfsio/CryptoKeys.h @@ -4,7 +4,7 @@ // Created 2013/09/25 // Author: Mike Ovsiannikov // -// Copyright 2013,2016 Quantcast Corporation. All rights reserved. +// Copyright 2013-2025 Quantcast Corporation. All rights reserved. // // This file is part of Kosmos File System (KFS). // @@ -108,7 +108,7 @@ class CryptoKeys ostream& Display( ostream& inStream, bool inUrlSafeFmtFlag = false) const; - static int GetSize() + KFS_CONSTEXPR static int GetSize() { return kLength; } const char* GetPtr() const { return mKey; } diff --git a/src/cc/libclient/Writer.cc b/src/cc/libclient/Writer.cc index 1f81f0ce0..e4a8d6d75 100644 --- a/src/cc/libclient/Writer.cc +++ b/src/cc/libclient/Writer.cc @@ -45,7 +45,6 @@ #include "qcdio/QCDLList.h" #include "RSStriper.h" #include "KfsOps.h" -#include "utils.h" #include "KfsClient.h" #include "Monitor.h" @@ -704,7 +703,7 @@ class Writer::Impl : ReportCompletion(); return; } - if (! CanWrite() && ! SheduleLeaseUpdate()) { + if (! CanWrite() && ! ScheduleLeaseUpdate()) { return; } if (0 < mAllocOp.chunkId && min(mLeaseEndTime - 1, @@ -720,7 +719,7 @@ class Writer::Impl : " empty" << KFS_LOG_EOM; Reset(); - if (! CanWrite() && ! SheduleLeaseUpdate()) { + if (! CanWrite() && ! ScheduleLeaseUpdate()) { // Do not try to preallocate chunk after inactivity timeout // or error, if no data pending. return; @@ -921,7 +920,7 @@ class Writer::Impl : mKeepLeaseFlag = mAllocOp.chunkVersion < 0; AllocateWriteId(); } - bool SheduleLeaseUpdate() + bool ScheduleLeaseUpdate() { if (! 
mKeepLeaseFlag) { return false; @@ -1166,6 +1165,14 @@ class Writer::Impl : } UpdateAccess(inOp); UpdateLeaseExpirationTime(); + if (mKeepLeaseFlag && ! mClosingFlag && ! CanWrite()) { + // Reset the retry count: this is purely a lease update + // operation, and with no writes outstanding the completion / + // progress update that normally resets the retry count will + // not be invoked. In case of failure, retry starts over from + // chunk allocation. + mRetryCount = 0; + } StartWrite(); } void Write() @@ -1529,6 +1536,7 @@ class Writer::Impl : " chunkserver: " << (mChunkServer.IsDataSent() ? (mChunkServer.IsAllDataSent() ? "all" : "partial") : "no") << " data sent" << + " retry: " << mRetryCount << "\nRequest:\n" << theOStream.str() << KFS_LOG_EOM; int theStatus = inOp.status; diff --git a/src/cc/meta/LayoutManager.cc b/src/cc/meta/LayoutManager.cc index 6f0ac1787..130746c14 100644 --- a/src/cc/meta/LayoutManager.cc +++ b/src/cc/meta/LayoutManager.cc @@ -7386,6 +7386,8 @@ LayoutManager::GetChunkWriteLease(MetaAllocate& req) panic("failed to get write lease for a chunk"); req.status = -EFAULT; } + req.allChunkServersShortRpcFlag = ! req.servers.empty() && + req.servers.front()->IsShortRpcFormat(); return; } if (GetInFlightChunkModificationOpCount(req.chunkId) > 0) { diff --git a/src/java/javabuild.sh b/src/java/javabuild.sh index c341d6297..487ae1e4e 100755 --- a/src/java/javabuild.sh +++ b/src/java/javabuild.sh @@ -2,7 +2,7 @@ # Author: Thilee Subramaniam # -# Copyright 2012,2016 Quantcast Corporation. All rights reserved. +# Copyright 2012-2025 Quantcast Corporation. All rights reserved. # # This file is part of Quantcast File System (QFS). # @@ -21,14 +21,14 @@ # Helper script to build Java components of QFS. # -mymaxtry=1 +my_max_try=1 work_dir='' build_vers_git_path=../cc/common/buildversgit.sh while [ $# -gt 0 ]; do if [ x"$1" = x'-r' -a $# -gt 1 ]; then shift - mymaxtry=${1-1} + my_max_try=${1-1} elif [ x"$1" = x'-d' -a $# -gt 1 ]; then shift work_dir=$1 @@ -69,13 +69,13 @@ if [ $# -eq 1 ]; then exit fi if [ x"$1" != x'--' ]; then - myversion="$(echo "$1" | cut -d. -f 1-2)" - myversionmaj="$(echo "$1" | cut -d. -f 1)" - if [ x"$myversion" = x"1.0" -o x"$myversion" = x"1.1" ]; then + my_version="$(echo "$1" | cut -d. -f 1-2)" + my_version_maj="$(echo "$1" | cut -d. -f 1)" + if [ x"$my_version" = x"1.0" -o x"$my_version" = x"1.1" ]; then hadoop_qfs_profile="hadoop_branch1_profile" - elif [ x"$myversion" = x"0.23" ]; then + elif [ x"$my_version" = x"0.23" ]; then hadoop_qfs_profile="hadoop_trunk_profile" - elif [ x"$myversionmaj" = x"2" -o x"$myversionmaj" = x"3" ]; then + elif [ x"$my_version_maj" = x"2" -o x"$my_version_maj" = x"3" ]; then hadoop_qfs_profile="hadoop_trunk_profile,hadoop_trunk_profile_2" else echo "Unsupported Hadoop release version."
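Note: the wholesale KfsAccess_* to KfsAccessBase_* renames in the JNI hunks earlier in this patch are forced rather than cosmetic. JNI binds each Java native method to a C symbol derived from the fully qualified name of the class that declares it, so moving the native declarations from KfsAccess into the new KfsAccessBase base class requires renaming every exported C function to match. A minimal sketch of the naming rule (illustration only, not part of the patch; the JniSymbolSketch class is hypothetical):

    // Illustration only: JNI derives the exported C symbol from the declaring
    // class's fully qualified name, which is why every native moved from
    // KfsAccess to KfsAccessBase is renamed in the hunks above.
    final class JniSymbolSketch {
        public static void main(String[] args) {
            // No component here contains '_', so the mangling reduces to a
            // '.' -> '_' substitution (a '_' in a name would escape as "_1").
            final String klass  = "com.quantcast.qfs.access.KfsAccessBase";
            final String method = "initF";
            System.out.println("Java_" + klass.replace('.', '_') + "_" + method);
            // Prints: Java_com_quantcast_qfs_access_KfsAccessBase_initF
        }
    }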
@@ -100,12 +100,25 @@ until javac --release $min_supported_release -version >/dev/null 2>&1; do fi min_supported_release=$(expr $min_supported_release + 1) done +if [ x"${qfs_access_profile-}" = x ]; then + if [ $min_supported_release -lt 9 ]; then + qfs_access_profile="qfs_access_java_pre_9" + else + qfs_access_profile="qfs_access_java_9" + fi +fi min_supported_release=1.$min_supported_release echo "qfs_release_version = $qfs_release_version" echo "qfs_source_revision = $qfs_source_revision" echo "hadoop_qfs_profile = $hadoop_qfs_profile" echo "test_build_data = $test_build_data" +echo "qfs_access_profile = $qfs_access_profile" +if [ x"$qfs_access_profile" = x'qfs_access_java_9' ]; then + qfs_access_project='qfs-access' +else + qfs_access_project='qfs-access-pre-9' +fi run_maven_exit_if_success() { set -x @@ -121,21 +134,23 @@ run_maven_exit_if_success() { set +x } -mytry=0 +my_try=0 while true; do if [ x"$1" = x'--' ]; then shift run_maven_exit_if_success ${1+"$@"} elif [ x"$hadoop_qfs_profile" = x'none' ]; then - run_maven_exit_if_success --projects qfs-access package + run_maven_exit_if_success -P "$qfs_access_profile" \ + --projects "$qfs_access_project" package else - run_maven_exit_if_success -P "$hadoop_qfs_profile" \ + run_maven_exit_if_success \ + -P "$hadoop_qfs_profile","$qfs_access_profile" \ -Dhadoop.release.version="$1" package fi - mytry=$(expr $mytry + 1) - [ $mytry -lt $mymaxtry ] || break - echo "Retry: $mytry in 20 * $mytry seconds" - sleep $(expr 20 \* $mytry) + my_try=$(expr $my_try + 1) + [ $my_try -lt $my_max_try ] || break + echo "Retry: $my_try in 20 * $my_try seconds" + sleep $(expr 20 \* $my_try) done exit 1 diff --git a/src/java/pom.xml b/src/java/pom.xml index 179d72f3f..71b4c6636 100644 --- a/src/java/pom.xml +++ b/src/java/pom.xml @@ -60,7 +60,6 @@ permissions and limitations under the License. - <module>qfs-access</module> <module>hadoop-qfs</module> @@ -70,5 +69,17 @@ <module>hadoop-qfs-2</module> + <profile> + <id>qfs_access_java_pre_9</id> + <modules> + <module>qfs-access-pre-9</module> + </modules> + </profile> + <profile> + <id>qfs_access_java_9</id> + <modules> + <module>qfs-access</module> + </modules> + </profile> diff --git a/src/java/qfs-access-pre-9/pom.xml b/src/java/qfs-access-pre-9/pom.xml new file mode 120000 index 000000000..cb284ccef --- /dev/null +++ b/src/java/qfs-access-pre-9/pom.xml @@ -0,0 +1 @@ +../qfs-access/pom.xml \ No newline at end of file diff --git a/src/java/qfs-access-pre-9/src/main/java/com/quantcast/qfs/access/BufferPool.java b/src/java/qfs-access-pre-9/src/main/java/com/quantcast/qfs/access/BufferPool.java new file mode 120000 index 000000000..5336ad92e --- /dev/null +++ b/src/java/qfs-access-pre-9/src/main/java/com/quantcast/qfs/access/BufferPool.java @@ -0,0 +1 @@ +../../../../../../../../qfs-access/src/main/java/com/quantcast/qfs/access/BufferPool.java \ No newline at end of file diff --git a/src/java/qfs-access-pre-9/src/main/java/com/quantcast/qfs/access/KfsAccess.java b/src/java/qfs-access-pre-9/src/main/java/com/quantcast/qfs/access/KfsAccess.java new file mode 100644 index 000000000..cef3f96f8 --- /dev/null +++ b/src/java/qfs-access-pre-9/src/main/java/com/quantcast/qfs/access/KfsAccess.java @@ -0,0 +1,50 @@ +/** + * $Id$ + * + * Created 2025/04/20 + * + * @author: Mike Ovsiannikov (Quantcast Corporation) + * + * Copyright 2025 Quantcast Corporation. All rights reserved. Copyright 2007 + * Kosmix Corp. + * + * This file is part of Kosmos File System (KFS). + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License.
You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + * + * \brief Java wrappers to get to the KFS client. + */ +package com.quantcast.qfs.access; + +import java.io.IOException; + +final public class KfsAccess extends KfsAccessBase { + + public KfsAccess(String configFn) throws IOException { + super(configFn); + } + + public KfsAccess(String metaServerHost, + int metaServerPort) throws IOException { + super(metaServerHost, metaServerPort); + } + + @Override + protected void finalize() throws Throwable { + try { + kfs_destroy(); + } finally { + super.finalize(); + } + } +} diff --git a/src/java/qfs-access-pre-9/src/main/java/com/quantcast/qfs/access/KfsAccessBase.java b/src/java/qfs-access-pre-9/src/main/java/com/quantcast/qfs/access/KfsAccessBase.java new file mode 120000 index 000000000..a0e9a6f9a --- /dev/null +++ b/src/java/qfs-access-pre-9/src/main/java/com/quantcast/qfs/access/KfsAccessBase.java @@ -0,0 +1 @@ +../../../../../../../../qfs-access/src/main/java/com/quantcast/qfs/access/KfsAccessBase.java \ No newline at end of file diff --git a/src/java/qfs-access-pre-9/src/main/java/com/quantcast/qfs/access/KfsDelegation.java b/src/java/qfs-access-pre-9/src/main/java/com/quantcast/qfs/access/KfsDelegation.java new file mode 120000 index 000000000..618fdaaf0 --- /dev/null +++ b/src/java/qfs-access-pre-9/src/main/java/com/quantcast/qfs/access/KfsDelegation.java @@ -0,0 +1 @@ +../../../../../../../../qfs-access/src/main/java/com/quantcast/qfs/access/KfsDelegation.java \ No newline at end of file diff --git a/src/java/qfs-access-pre-9/src/main/java/com/quantcast/qfs/access/KfsFileAttr.java b/src/java/qfs-access-pre-9/src/main/java/com/quantcast/qfs/access/KfsFileAttr.java new file mode 120000 index 000000000..d4cff6cd1 --- /dev/null +++ b/src/java/qfs-access-pre-9/src/main/java/com/quantcast/qfs/access/KfsFileAttr.java @@ -0,0 +1 @@ +../../../../../../../../qfs-access/src/main/java/com/quantcast/qfs/access/KfsFileAttr.java \ No newline at end of file diff --git a/src/java/qfs-access-pre-9/src/main/java/com/quantcast/qfs/access/KfsInputChannel.java b/src/java/qfs-access-pre-9/src/main/java/com/quantcast/qfs/access/KfsInputChannel.java new file mode 100644 index 000000000..2780efe31 --- /dev/null +++ b/src/java/qfs-access-pre-9/src/main/java/com/quantcast/qfs/access/KfsInputChannel.java @@ -0,0 +1,45 @@ +/** + * $Id$ + * + * Created 2007/09/11 + * + * @author: Sriram Rao (Kosmix Corp.) + * + * Copyright 2008-2012,2016 Quantcast Corporation. All rights reserved. + * Copyright 2007 Kosmix Corp. + * + * This file is part of Kosmos File System (KFS). + * + * Licensed under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. See the License for the specific language governing + * permissions and limitations under the License. 
+ * + * \brief An input channel that does buffered I/O. This is to reduce + * the overhead of JNI calls. + */ +package com.quantcast.qfs.access; + +/* A byte channel interface with seek support */ +final public class KfsInputChannel extends KfsInputChannelBase { + + KfsInputChannel(KfsAccessBase ka, int fd) { + super(ka, fd); + } + + @Override + protected void finalize() throws Throwable { + try { + state.release(); + } finally { + super.finalize(); + } + } +} diff --git a/src/java/qfs-access-pre-9/src/main/java/com/quantcast/qfs/access/KfsInputChannelBase.java b/src/java/qfs-access-pre-9/src/main/java/com/quantcast/qfs/access/KfsInputChannelBase.java new file mode 120000 index 000000000..f773c4f88 --- /dev/null +++ b/src/java/qfs-access-pre-9/src/main/java/com/quantcast/qfs/access/KfsInputChannelBase.java @@ -0,0 +1 @@ +../../../../../../../../qfs-access/src/main/java/com/quantcast/qfs/access/KfsInputChannelBase.java \ No newline at end of file diff --git a/src/java/qfs-access-pre-9/src/main/java/com/quantcast/qfs/access/KfsOutputChannel.java b/src/java/qfs-access-pre-9/src/main/java/com/quantcast/qfs/access/KfsOutputChannel.java new file mode 100644 index 000000000..6065d0954 --- /dev/null +++ b/src/java/qfs-access-pre-9/src/main/java/com/quantcast/qfs/access/KfsOutputChannel.java @@ -0,0 +1,43 @@ +/** + * $Id$ + * + * Created 2007/09/11 + * + * @author: Sriram Rao (Kosmix Corp.) + * + * Copyright 2008-2012,2016 Quantcast Corporation. All rights reserved. + * Copyright 2007 Kosmix Corp. + * + * This file is part of Kosmos File System (KFS). + * + * Licensed under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. See the License for the specific language governing + * permissions and limitations under the License. + * + * \brief An output channel pre java 9 style cleanup. 
+ */ +package com.quantcast.qfs.access; + +final public class KfsOutputChannel extends KfsOutputChannelBase { + + KfsOutputChannel(KfsAccessBase kfsAccess, int fd, boolean append) { + super(kfsAccess, fd, append); + } + + @Override + protected void finalize() throws Throwable { + try { + state.run(); + } finally { + super.finalize(); + } + } +} diff --git a/src/java/qfs-access-pre-9/src/main/java/com/quantcast/qfs/access/KfsOutputChannelBase.java b/src/java/qfs-access-pre-9/src/main/java/com/quantcast/qfs/access/KfsOutputChannelBase.java new file mode 120000 index 000000000..7fe78e5c0 --- /dev/null +++ b/src/java/qfs-access-pre-9/src/main/java/com/quantcast/qfs/access/KfsOutputChannelBase.java @@ -0,0 +1 @@ +../../../../../../../../qfs-access/src/main/java/com/quantcast/qfs/access/KfsOutputChannelBase.java \ No newline at end of file diff --git a/src/java/qfs-access-pre-9/src/main/java/com/quantcast/qfs/access/KfsTest.java b/src/java/qfs-access-pre-9/src/main/java/com/quantcast/qfs/access/KfsTest.java new file mode 120000 index 000000000..afbddc121 --- /dev/null +++ b/src/java/qfs-access-pre-9/src/main/java/com/quantcast/qfs/access/KfsTest.java @@ -0,0 +1 @@ +../../../../../../../../qfs-access/src/main/java/com/quantcast/qfs/access/KfsTest.java \ No newline at end of file diff --git a/src/java/qfs-access-pre-9/src/main/java/com/quantcast/qfs/access/Positionable.java b/src/java/qfs-access-pre-9/src/main/java/com/quantcast/qfs/access/Positionable.java new file mode 120000 index 000000000..af294288f --- /dev/null +++ b/src/java/qfs-access-pre-9/src/main/java/com/quantcast/qfs/access/Positionable.java @@ -0,0 +1 @@ +../../../../../../../../qfs-access/src/main/java/com/quantcast/qfs/access/Positionable.java \ No newline at end of file diff --git a/src/java/qfs-access-pre-9/src/test b/src/java/qfs-access-pre-9/src/test new file mode 120000 index 000000000..9f629b468 --- /dev/null +++ b/src/java/qfs-access-pre-9/src/test @@ -0,0 +1 @@ +../../qfs-access/src/test \ No newline at end of file diff --git a/src/java/qfs-access/src/main/java/com/quantcast/qfs/access/KfsAccess.java b/src/java/qfs-access/src/main/java/com/quantcast/qfs/access/KfsAccess.java index 5662d0962..1f918fee9 100644 --- a/src/java/qfs-access/src/main/java/com/quantcast/qfs/access/KfsAccess.java +++ b/src/java/qfs-access/src/main/java/com/quantcast/qfs/access/KfsAccess.java @@ -1,1051 +1,65 @@ /** * $Id$ * - * Created 2007/08/24 - * @author: Sriram Rao (Kosmix Corp.) + * Created 2025/04/20 * - * Copyright 2008-2012,2016 Quantcast Corporation. All rights reserved. - * Copyright 2007 Kosmix Corp. + * @author: Mike Ovsiannikov (Quantcast Corporation) + * + * Copyright 2025 Quantcast Corporation. All rights reserved. Copyright 2007 + * Kosmix Corp. * * This file is part of Kosmos File System (KFS). * - * Licensed under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - * implied. See the License for the specific language governing - * permissions and limitations under the License. 
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. * - * \brief Java wrappers to get to the KFS client. + * \brief QFS access java 9 style cleanup. */ - package com.quantcast.qfs.access; import java.io.IOException; -import java.io.FileNotFoundException; -import java.nio.ByteBuffer; -import java.nio.CharBuffer; -import java.nio.charset.Charset; -import java.nio.charset.CharsetDecoder; -import java.nio.charset.CodingErrorAction; -import java.nio.charset.CoderResult; -import java.util.Properties; -import java.util.ArrayList; -import java.util.TreeMap; -import java.util.Map; - -final public class KfsAccess -{ - public final int DEFAULT_APPEND_REPLICATION = 2; - public final int DEFAULT_REPLICATION = 1; - public final int DEFAULT_NUM_STRIPES = 6; - public final int DEFAULT_NUM_RECOVERY_STRIPES = 3; - public final int DEFAULT_STRIPE_SIZE = 65536; - public final int DEFAULT_STRIPER_TYPE = - KfsFileAttr.STRIPED_FILE_TYPE_RS; - public final long SET_TIME_TIME_NOT_VALID = 1L << 63; - - // the pointer in C++ - private long cPtr; - - private final static native - long initF(String configFn); - - private final static native - long initS(String metaServerHost, int metaServerPort); - - private final static native - void destroy(long ptr); - - private final static native - int cd(long ptr, String path); - - private final static native - int mkdir(long ptr, String path, int mode); - - private final static native - int mkdirs(long ptr, String path, int mode); - - private final static native - int rmdir(long ptr, String path); - - private final static native - int rmdirs(long ptr, String path); - - private final static native - String[] readdir(long ptr, String path, boolean prefetchAttr); - - private final static native - String[][] getDataLocation(long ptr, String path, long start, long len); - - private final static native - String[][] getBlocksLocation(long ptr, String path, long start, long len); - - private final static native - short getReplication(long ptr, String path); - - private final static native - short setReplication(long ptr, String path, int numReplicas); - - private final static native - long getModificationTime(long ptr, String path); - - private final static native - int create(long ptr, String path, int numReplicas, boolean exclusive, - int numStripes, int numRecoveryStripes, int stripeSize, int stripedType, - boolean forceType, int mode, int minSTier, int maxSTier); - - private final static native - int create2(long ptr, String path, boolean exclusive, String createParams); - - private final static native - int create2ex(long ptr, String path, boolean exclusive, String createParams, - int mode, boolean forceTypeFlag); - - private final static native - int remove(long ptr, String path); - - private final static native - int rename(long ptr, String oldpath, String newpath, boolean overwrite); - - private final static native - int symlink(long ptr, String target, String linkpath, int mode, boolean overwrite); - - private final static native - int open(long ptr, String path, String mode, int numReplicas, - int numStripes, int numRecoveryStripes, int stripeSize, int stripedType, - int createMode); - - private final static native - int exists(long ptr, String path); - - private final static native - int isFile(long ptr, String path); - - private final static native - int isDirectory(long ptr, 
String path); - - private final static native - long filesize(long ptr, String path); - - private final static native - long setDefaultIoBufferSize(long ptr, long size); - - private final static native - long getDefaultIoBufferSize(long ptr); - - private final static native - long setDefaultReadAheadSize(long ptr, long size); - - private final static native - long getDefaultReadAheadSize(long ptr); - - private final static native - long setIoBufferSize(long ptr, int fd, long size); - - private final static native - long getIoBufferSize(long ptr, int fd); - - private final static native - long setReadAheadSize(long ptr, int fd, long size); - - private final static native - long getReadAheadSize(long ptr, int fd); - - private final static native - int setUTimes(long ptr, String path, long mtime_usec, long atime_usec, long ctime_usec); - - private final static native - int compareChunkReplicas(long ptr, String path, StringBuffer md5sum); - - private final static native - int getStripedType(long ptr, String path); - - private final static native - void setFileAttributeRevalidateTime(long ptr, int secs); - - private final static native - int chmod(long ptr, String path, int mode); - - private final static native - int chmodr(long ptr, String path, int mode); - - private final static native - int fchmod(long ptr, int fd, int mode); - - private final static native - int chowns(long ptr, String path, String user, String group); - - private final static native - int chownsr(long ptr, String path, String user, String group); - - private final static native - int chown(long ptr, String path, long user, long group); - - private final static native - int chownr(long ptr, String path, long user, long group); - - private final static native - int fchowns(long ptr, int fd, String user, String group); - - private final static native - int fchown(long ptr, int fd, long user, long group); - - private final static native - int setEUserAndEGroup(long ptr, long user, long group, long[] groups); - - private final static native - int stat(long ptr, String path, KfsFileAttr attr); - - private final static native - int lstat(long ptr, String path, KfsFileAttr attr); - - private final static native - String strerror(long ptr, int err); - - private final static native - boolean isnotfound(long ptr, int err); - - private final static native - int close(long ptr, int fd); - - private final static native - long seek(long ptr, int fd, long offset); - - private final static native - long tell(long ptr, int fd); - - private final static native - int setUMask(long ptr, int umask); - - private final static native - int getUMask(long ptr); - - private final static native - String createDelegationToken(long ptr, boolean allowDelegationFlag, - long validTime, KfsDelegation result); - - private final static native - String renewDelegationToken(long ptr, KfsDelegation token); - - private final static native - String cancelDelegationToken(long ptr, KfsDelegation token); - - private final static native - String[] getStats(long ptr); - - static { - try { - System.loadLibrary("qfs_access"); - } catch (UnsatisfiedLinkError e) { - throw new RuntimeException("Unable to load qfs_access native library", e); - } - } - - public KfsAccess(String configFn) throws IOException - { - cPtr = initF(configFn); - if (cPtr == 0) { - throw new IOException("Unable to initialize KFS Client"); - } - } - - public KfsAccess(String metaServerHost, int metaServerPort) throws IOException - { - cPtr = initS(metaServerHost, metaServerPort); - if (cPtr == 0) { - 
throw new IOException("Unable to initialize KFS Client"); - } - } - - // most calls wrap to a call on the KfsClient. For return values, - // see the comments in libkfsClient/KfsClient.h - // - public int kfs_cd(String path) - { - return cd(cPtr, path); - } - - // make the directory hierarchy for path - public int kfs_mkdirs(String path) - { - return mkdirs(cPtr, path, 0777); - } - - // make the directory hierarchy for path - public int kfs_mkdirs(String path, int mode) - { - return mkdirs(cPtr, path, mode); - } - - // make the directory hierarchy for path - public int kfs_mkdir(String path, int mode) - { - return mkdir(cPtr, path, mode); - } - - // remove the directory specified by path; remove will succeed only if path is empty. - public int kfs_rmdir(String path) - { - return rmdir(cPtr, path); - } - - // remove the directory tree specified by path; remove will succeed only if path is empty. - public int kfs_rmdirs(String path) - { - return rmdirs(cPtr, path); - } - - public String[] kfs_readdir(String path) - { - return kfs_readdir(path, false); - } - - public String[] kfs_readdir(String path, boolean prefetchAttr) - { - return readdir(cPtr, path, prefetchAttr); - } - - final public class DirectoryIterator - { - public long modificationTime; - public long attrChangeTime; - public long creationTime; - public long filesize; - public int replication; - public boolean isDirectory; - public int numStripes; - public int numRecoveryStripes; - public int striperType; - public int stripeSize; - public byte minSTier; - public byte maxSTier; - public String filename; - public long owner; - public long group; - public int mode; - public String ownerName; - public String groupName; - public long dirCount; - public long fileCount; - public long chunkCount; - public long fileId; - public int extAttrTypes; - public String extAttrs; - - private KfsInputChannel input; - private ByteBuffer buf; - private int limit; - private final CharsetDecoder decoder = Charset.forName("UTF-8") - .newDecoder() - .onMalformedInput(CodingErrorAction.REPLACE) - .onUnmappableCharacter(CodingErrorAction.REPLACE); - private long prevOwner; - private long prevGroup; - - public DirectoryIterator(String path) throws IOException - { - final int fd = open(cPtr, path, "opendir", 0, 0, 0, 0, 0, 0); - kfs_retToIOException(fd, path); - input = null; - try { - input = new KfsInputChannel(KfsAccess.this, fd); - } finally { - if (input == null) { - KfsAccess.this.close(cPtr, fd); - } - } - } - - private String readString(int len) throws IOException - { - if (len <= 0) { - return ""; - } - final int end = buf.position() + len; - buf.limit(end); - final String str = decoder.reset().decode(buf).toString(); - buf.position(end).limit(limit); - return str; - } - - private void skip(int len) throws IOException - { - if (len > 0) { - buf.position(buf.position() + len); - } - } - - public boolean next() throws IOException - { - for (; ;) { - if (buf == null || ! buf.hasRemaining()) { - if (input == null) { - return false; - } - buf = input.readNext(); - limit = buf == null ? 
0 : buf.limit(); - if (limit <= 0) { - close(); - return false; - } - } - modificationTime = buf.getLong(); - attrChangeTime = buf.getLong(); - creationTime = buf.getLong(); - filesize = buf.getLong(); - replication = buf.getInt(); - final int nameLen = buf.getInt(); - isDirectory = buf.get() != 0; - numStripes = buf.getInt(); - numRecoveryStripes = buf.getInt(); - striperType = buf.getInt(); - stripeSize = buf.getInt(); - owner = buf.getInt(); - group = buf.getInt(); - mode = buf.getShort(); - fileId = buf.getLong(); - fileCount = isDirectory ? buf.getLong() : 0; - dirCount = isDirectory ? buf.getLong() : 0; - chunkCount = isDirectory ? 0 : buf.getLong(); - minSTier = buf.get(); - maxSTier = buf.get(); - final int onameLen = buf.getInt(); - final int gnameLen = buf.getInt(); - extAttrTypes = buf.getInt(); - final int exAtLen = - KfsFileAttr.KFS_FILE_ATTR_EXT_TYPE_NONE == extAttrTypes ? - 0 : buf.getInt(); - owner &= 0xFFFFFFFFL; - group &= 0xFFFFFFFFL; - mode &= 0xFFFF; - filename = readString(nameLen); - if (owner == prevOwner && ownerName != null) { - skip(onameLen); - } else { - prevOwner = owner; - ownerName = readString(onameLen); - } - if (group == prevGroup && groupName != null) { - skip(gnameLen); - } else { - prevGroup = group; - groupName = readString(gnameLen); - } - extAttrs = 0 < exAtLen ? readString(exAtLen) : null; - if (nameLen > 0) { - break; - } - } - return true; - } - - public void close() - { - if (input == null) { - return; - } - buf = null; - try { - input.close(); - } catch (IOException ignored) { - } - input = null; - } - } - - public KfsFileAttr[] kfs_readdirplus(String path) - { - DirectoryIterator itr = null; - try { - itr = new DirectoryIterator(path); - final ArrayList ret = new ArrayList(); - while (itr.next()) { - KfsFileAttr entry = new KfsFileAttr(); - entry.modificationTime = itr.modificationTime; - entry.attrChangeTime = itr.attrChangeTime; - entry.creationTime = itr.creationTime; - entry.filesize = itr.filesize; - entry.replication = itr.replication; - entry.isDirectory = itr.isDirectory; - entry.filename = itr.filename; - entry.numStripes = itr.numStripes; - entry.numRecoveryStripes = itr.numRecoveryStripes; - entry.striperType = itr.striperType; - entry.stripeSize = itr.stripeSize; - entry.owner = itr.owner; - entry.group = itr.group; - entry.mode = itr.mode; - entry.ownerName = itr.ownerName; - entry.groupName = itr.groupName; - entry.fileId = itr.fileId; - entry.dirCount = itr.dirCount; - entry.fileCount = itr.fileCount; - entry.chunkCount = itr.chunkCount; - entry.minSTier = itr.minSTier; - entry.maxSTier = itr.maxSTier; - entry.extAttrTypes = itr.extAttrTypes; - entry.extAttrs = itr.extAttrs; - ret.add(entry); - } - return ret.toArray(new KfsFileAttr[0]); - } catch (IOException ex) { - // System.err.println("kfs_readdirplus " + ex); - } finally { - if (itr != null) { - itr.close(); - } - } - return null; - } - - public KfsOutputChannel kfs_append(String path) - { - return kfs_append(path, DEFAULT_APPEND_REPLICATION); - } - - public KfsOutputChannel kfs_append(String path, int numReplicas) - { - try { - return kfs_append_ex(path, numReplicas, 0666); - } catch (IOException ex) { - return null; - } - } - - public KfsOutputChannel kfs_append_ex(String path, int numReplicas, int mode) throws IOException - { - final int fd = open(cPtr, path, "a", - numReplicas > 0 ? 
- numReplicas : DEFAULT_APPEND_REPLICATION, - 0, 0, 0, KfsFileAttr.STRIPED_FILE_TYPE_NONE, mode); - kfs_retToIOException(fd, path); - KfsOutputChannel chan = null; - try { - final boolean append = true; - chan = new KfsOutputChannel(this, fd, append); - } finally { - if (chan == null) { - close(cPtr, fd); - } - } - return chan; - } - - public KfsOutputChannel kfs_create(String path) - { - return kfs_create(path, DEFAULT_REPLICATION); - } - - public KfsOutputChannel kfs_create(String path, int numReplicas) - { - return kfs_create(path, numReplicas, false); - } - - // if exclusive is specified, then create will succeed only if the - // doesn't already exist - public KfsOutputChannel kfs_create(String path, int numReplicas, boolean exclusive) - { - return kfs_create(path, numReplicas, exclusive, -1, -1); - } - - public KfsOutputChannel kfs_create(String path, int numReplicas, boolean exclusive, - long bufferSize, long readAheadSize) - { - try { - return kfs_create_ex(path, numReplicas, exclusive, - bufferSize, readAheadSize, 0666); - } catch (IOException ex) { - return null; - } - } - - public KfsOutputChannel kfs_create_ex(String path, int numReplicas, boolean exclusive, - long bufferSize, long readAheadSize, int mode) throws IOException - { - final boolean forceStriperType = false; - return kfs_create_ex( - path, - DEFAULT_REPLICATION, // numReplicas, - exclusive, - bufferSize, - readAheadSize, - DEFAULT_NUM_STRIPES, - DEFAULT_NUM_RECOVERY_STRIPES, - DEFAULT_STRIPE_SIZE, - DEFAULT_STRIPER_TYPE, - forceStriperType, - mode - ); - } - - public void kfs_close(int fd) throws IOException - { - kfs_retToIOException(close(cPtr, fd)); - } - - public KfsOutputChannel kfs_create_ex(String path, int numReplicas, boolean exclusive, - long bufferSize, long readAheadSize, - int numStripes, int numRecoveryStripes, int stripeSize, int stripedType, - boolean forceType, int mode) throws IOException - { - int minSTier = 15; - int maxSTier = 15; - return kfs_create_ex(path, numReplicas, exclusive, bufferSize, readAheadSize, - numStripes, numRecoveryStripes, stripeSize, stripedType, forceType, - mode, minSTier, maxSTier); - } - - public KfsOutputChannel kfs_create_ex(String path, int numReplicas, boolean exclusive, - long bufferSize, long readAheadSize, - int numStripes, int numRecoveryStripes, int stripeSize, int stripedType, - boolean forceType, int mode, int minSTier, int maxSTier) throws IOException - { - final int fd = create(cPtr, path, numReplicas, exclusive, - numStripes, numRecoveryStripes, stripeSize, stripedType, forceType, - mode, minSTier, maxSTier); - kfs_retToIOException(fd, path); - if (bufferSize >= 0) { - setIoBufferSize(cPtr, fd, bufferSize); - } - if (readAheadSize >= 0) { - setReadAheadSize(cPtr, fd, readAheadSize); - } - KfsOutputChannel chan = null; - try { - final boolean append = false; - chan = new KfsOutputChannel(this, fd, append); - } finally { - if (chan == null) { - close(cPtr, fd); - } - } - return chan; - } - - public KfsOutputChannel kfs_create_ex(String path, boolean exclusive, - String createParams, int mode, - boolean forceTypeFlag) throws IOException - { - return kfs_create_ex_fd(create2ex(cPtr, path, exclusive, createParams, - mode, forceTypeFlag), path); - } - - public KfsOutputChannel kfs_create_ex(String path, boolean exclusive, - String createParams) throws IOException - { - return kfs_create_ex_fd(create2(cPtr, path, exclusive, createParams), - path); - } - - private final KfsOutputChannel kfs_create_ex_fd(int fd, - String path) throws IOException - { - 
kfs_retToIOException(fd, path); - KfsOutputChannel chan = null; - try { - final boolean append = false; - chan = new KfsOutputChannel(this, fd, append); - } finally { - if (chan == null) { - close(cPtr, fd); - } - } - return chan; - } - - private final int kfs_open_ro(String path) - { - return open(cPtr, path, "r", - DEFAULT_REPLICATION, - DEFAULT_NUM_STRIPES, - DEFAULT_NUM_RECOVERY_STRIPES, - DEFAULT_STRIPE_SIZE, - DEFAULT_STRIPER_TYPE, - 0 - ); - } - - public KfsInputChannel kfs_open(String path) - { - return kfs_open(path, -1, -1); - } - - public KfsInputChannel kfs_open(String path, long bufferSize, long readAheadSize) - { - try { - return kfs_open_ex(path, bufferSize, readAheadSize); - } catch (IOException ex) { - return null; - } - } - - public KfsInputChannel kfs_open_ex(String path, long bufferSize, long readAheadSize) throws IOException - { - final int fd = kfs_open_ro(path); - kfs_retToIOException(fd, path); - if (bufferSize >= 0) { - setIoBufferSize(cPtr, fd, bufferSize); - } - if (readAheadSize >= 0) { - setReadAheadSize(cPtr, fd, readAheadSize); - } - KfsInputChannel chan = null; - try { - chan = new KfsInputChannel(this, fd); - } finally { - if (chan == null) { - close(cPtr, fd); - } - } - return chan; - } - - public int kfs_remove(String path) - { - return remove(cPtr, path); - } - - public int kfs_rename(String oldpath, String newpath) - { - return rename(cPtr, oldpath, newpath, true); - } - - // if overwrite is turned off, rename will succeed only if newpath - // doesn't already exist - public int kfs_rename(String oldpath, String newpath, boolean overwrite) - { - return rename(cPtr, oldpath, newpath, overwrite); - } - - public int kfs_symlink(String target, String linkpath, int mode, boolean overwrite) - { - return symlink(cPtr, target, linkpath, mode, overwrite); - } - - public boolean kfs_exists(String path) - { - return exists(cPtr, path) == 1; - } - - public boolean kfs_isFile(String path) - { - return isFile(cPtr, path) == 1; - } - - public boolean kfs_isDirectory(String path) - { - return isDirectory(cPtr, path) == 1; - } - - public long kfs_filesize(String path) - { - return filesize(cPtr, path); - } - - // Given a starting byte offset and a length, return the location(s) - // of all the chunks that cover the region. - public String[][] kfs_getDataLocation(String path, long start, long len) - { - return getDataLocation(cPtr, path, start, len); - } - - // Given a starting byte offset and a length, return the location(s) - // of all "chunk blocks" that cover the region. 
- // The first entry always contains "chunk block" size in hex notation with - // leading 0 omitted, or if negative as status code, which can be converted - // into exceptions with kfs_retToIOException() - public String[][] kfs_getBlocksLocation(String path, long start, long len) - { - final String[][] ret = getBlocksLocation(cPtr, path, start, len); - if (ret == null) { - throw new OutOfMemoryError(); - } - if (ret.length < 1 || ret[0].length != 1) { - throw new Error("getBlocksLocation internal error"); - } - return ret; - } - - // Return the degree of replication for this file - public short kfs_getReplication(String path) - { - return getReplication(cPtr, path); - } - - // Request a change in the degree of replication for this file - // Returns the value that was set by the server for this file - public short kfs_setReplication(String path, int numReplicas) - { - return setReplication(cPtr, path, numReplicas); - } - - public long kfs_getModificationTime(String path) - { - return getModificationTime(cPtr, path); - } - - public int kfs_setModificationTime(String path, long time) - { - return kfs_setUTimes(path, time * 1000, - SET_TIME_TIME_NOT_VALID, SET_TIME_TIME_NOT_VALID); - } - public int kfs_setUTimes(String path, long mtimeUsec, - long atimeUsec, long ctimeUsec) - { - return setUTimes(cPtr, path, mtimeUsec, atimeUsec, ctimeUsec); - } - - public boolean kfs_compareChunkReplicas( - String path, StringBuffer md5sum) throws IOException - { - final int ret = compareChunkReplicas(cPtr, path, md5sum); - kfs_retToIOException(ret); - return ret == 0; - } +import java.lang.ref.Cleaner; - public long kfs_setDefaultIoBufferSize(long size) - { - return setDefaultIoBufferSize(cPtr, size); - } +final public class KfsAccess extends KfsAccessBase { - public long kfs_getDefaultIoBufferSize(long ptr) - { - return getDefaultIoBufferSize(cPtr); - } - - public long kfs_setDefaultReadAheadSize(long size) - { - return setDefaultReadAheadSize(cPtr, size); - } + private static Cleaner cleaner = Cleaner.create(); - public long kfs_getDefaultReadAheadSize(long ptr) - { - return getDefaultReadAheadSize(cPtr); + static Cleaner.Cleanable registerCleanup(Object obj, Runnable action) { + return cleaner.register(obj, action); } - public long kfs_setIoBufferSize(int fd, long size) - { - return setIoBufferSize(cPtr, fd, size); - } - - public long kfs_getIoBufferSize(int fd) - { - return getIoBufferSize(cPtr, fd); - } - - public long kfs_setReadAheadSize(int fd, long size) - { - return setReadAheadSize(cPtr, fd, size); - } - - public long kfs_getReadAheadSize(int fd) - { - return getReadAheadSize(cPtr, fd); - } - - public void kfs_setFileAttributeRevalidateTime(int secs) - { - setFileAttributeRevalidateTime(cPtr, secs); - } - - public int kfs_chmod(String path, int mode) - { - return chmod(cPtr, path, mode); - } - - public int kfs_chmodr(String path, int mode) - { - return chmodr(cPtr, path, mode); - } - - public int kfs_chmod(int fd, int mode) - { - return fchmod(cPtr, fd, mode); - } - - public int kfs_chown(String path, String user, String group) - { - return chowns(cPtr, path, user, group); - } - - public int kfs_chownr(String path, String user, String group) - { - return chownsr(cPtr, path, user, group); - } - - public int kfs_chown(String path, long user, long group) - { - return chown(cPtr, path, user, group); - } - - public int kfs_chownr(String path, long user, long group) - { - return chownr(cPtr, path, user, group); - } - - public int kfs_chown(int fd, String user, String group) - { - return fchowns(cPtr, 
fd, user, group); - } - - public int kfs_chown(int fd, long user, long group) - { - return fchown(cPtr, fd, user, group); - } - - public int kfs_setEUserAndEGroup(long user, long group, long[] groups) - { - return setEUserAndEGroup(cPtr, user, group, groups); - } - - public int kfs_stat(String path, KfsFileAttr attr) - { - return stat(cPtr, path, attr); - } - - public int kfs_lstat(String path, KfsFileAttr attr) - { - return lstat(cPtr, path, attr); - } - - public void kfs_retToIOException(int ret) throws IOException - { - kfs_retToIOException(ret, null); - } - - public void kfs_retToIOException(int ret, String path) throws IOException - { - if (ret >= 0) { - return; - } - final String es = strerror(cPtr, ret); - if (es == null) { - throw new OutOfMemoryError(); - } - final String msg = path == null ? es : path + ": " + es; - if (isnotfound(cPtr, ret)) { - throw new FileNotFoundException(msg); - } - throw new IOException(msg); - } - - public long kfs_seek(int fd, long offset) throws IOException - { - final long ret = seek(cPtr, fd, offset); - if (ret < 0) { - kfs_retToIOException((int)ret); - } - return ret; - } - - public long kfs_tell(int fd) throws IOException - { - final long ret = tell(cPtr, fd); - if (ret < 0) { - kfs_retToIOException((int)ret); - } - return ret; - } - - public int kfs_setUMask(int umask) throws IOException - { - final int ret = setUMask(cPtr, umask); - if (ret < 0) { - kfs_retToIOException((int)ret); - } - return ret; - } - - public int kfs_getUMask() throws IOException - { - final int ret = getUMask(cPtr); - if (ret < 0) { - kfs_retToIOException((int)ret); - } - return ret; - } - - public KfsDelegation kfs_createDelegationToken( - boolean allowDelegationFlag, long validTime) throws IOException - { - final KfsDelegation result = new KfsDelegation(); - final String error = - createDelegationToken(cPtr, allowDelegationFlag, validTime, result); - if (error != null) { - throw new IOException(error); - } - if (result.key == null || result.token == null) { - throw new OutOfMemoryError(); - } - return result; - } - - public void kfs_renewDelegationToken( - KfsDelegation token) throws IOException - { - final String error = renewDelegationToken(cPtr, token); - if (error != null) { - throw new IOException(error); - } - if (token.key == null || token.token == null) { - throw new OutOfMemoryError(); - } - } - - public void kfs_cancelDelegationToken( - KfsDelegation token) throws IOException - { - final String error = cancelDelegationToken(cPtr, token); - if (error != null) { - throw new IOException(error); - } - } - - public Map kfs_getStats() throws IOException - { - final String[] stats = getStats(cPtr); - if (stats == null) { - throw new IOException("internal error: null stats array"); - } - if (stats.length % 2 != 0) { - throw new IOException( - "internal error: invalid stats array size: " + stats.length); - } - final Map ret = new TreeMap(); - for (int i = 0; i < stats.length; i += 2) { - ret.put(stats[i], stats[i+1]); - } - return ret; + private static void registerCleanupSelf(KfsAccess ka) { + // Ensure that the native resource is cleaned up when this object is + // garbage collected. + // Make sure that this and ka are not referenced by the cleaner closure + // otherwise it will never be cleaned up. 
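+ // (For example, a cleanup action written as "() -> destroy(ka.getCPtr())" + // would capture ka and keep it strongly reachable, so the Cleaner could + // never run it; copying the pointer into a local first avoids this.)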
+ final long ptr = ka.getCPtr(); + registerCleanup(ka, () -> { + destroy(ptr); + }); } - private void kfs_destroy() - { - if (cPtr != 0) { - final long ptr = cPtr; - cPtr = 0; - destroy(ptr); - } + private void registerCleanupConstructed() { + registerCleanupSelf(this); } - protected void finalize() throws Throwable - { - try { - kfs_destroy(); - } finally { - super.finalize(); - } + public KfsAccess(String configFn) throws IOException { + super(configFn); + registerCleanupConstructed(); } - long getCPtr() - { - return cPtr; + public KfsAccess(String metaServerHost, + int metaServerPort) throws IOException { + super(metaServerHost, metaServerPort); + registerCleanupConstructed(); } } - - diff --git a/src/java/qfs-access/src/main/java/com/quantcast/qfs/access/KfsAccessBase.java b/src/java/qfs-access/src/main/java/com/quantcast/qfs/access/KfsAccessBase.java new file mode 100644 index 000000000..dfe693561 --- /dev/null +++ b/src/java/qfs-access/src/main/java/com/quantcast/qfs/access/KfsAccessBase.java @@ -0,0 +1,933 @@ +/** + * $Id$ + * + * Created 2007/08/24 + * + * @author: Sriram Rao (Kosmix Corp.) + * + * Copyright 2008-2012,2016 Quantcast Corporation. All rights reserved. + * Copyright 2007 Kosmix Corp. + * + * This file is part of Kosmos File System (KFS). + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + * + * \brief Java wrappers to get to the KFS client. 
+ */ +package com.quantcast.qfs.access; + +import java.io.IOException; +import java.io.FileNotFoundException; +import java.nio.ByteBuffer; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CodingErrorAction; +import java.util.ArrayList; +import java.util.TreeMap; +import java.util.Map; + +class KfsAccessBase { + + public final int DEFAULT_APPEND_REPLICATION = 2; + public final int DEFAULT_REPLICATION = 1; + public final int DEFAULT_NUM_STRIPES = 6; + public final int DEFAULT_NUM_RECOVERY_STRIPES = 3; + public final int DEFAULT_STRIPE_SIZE = 65536; + public final int DEFAULT_STRIPER_TYPE + = KfsFileAttr.STRIPED_FILE_TYPE_RS; + public final long SET_TIME_TIME_NOT_VALID = 1L << 63; + + // the pointer in C++ + private long cPtr; + + private static native long initF(String configFn); + + private static native long initS(String metaServerHost, int metaServerPort); + + final protected static native void destroy(long ptr); + + private static native int cd(long ptr, String path); + + private static native int mkdir(long ptr, String path, int mode); + + private static native int mkdirs(long ptr, String path, int mode); + + private static native int rmdir(long ptr, String path); + + private static native int rmdirs(long ptr, String path); + + private static native String[] readdir( + long ptr, String path, boolean prefetchAttr); + + private static native String[][] getDataLocation( + long ptr, String path, long start, long len); + + private static native String[][] getBlocksLocation( + long ptr, String path, long start, long len); + + private static native short getReplication(long ptr, String path); + + private static native short setReplication( + long ptr, String path, int numReplicas); + + private static native long getModificationTime( + long ptr, String path); + + private static native int create( + long ptr, String path, int numReplicas, boolean exclusive, + int numStripes, int numRecoveryStripes, int stripeSize, + int stripedType, boolean forceType, int mode, int minSTier, + int maxSTier); + + private static native int create2( + long ptr, String path, boolean exclusive, String createParams); + + private static native int create2ex( + long ptr, String path, boolean exclusive, String createParams, + int mode, boolean forceTypeFlag); + + private static native int remove(long ptr, String path); + + private static native int rename( + long ptr, String oldpath, String newpath, boolean overwrite); + + private static native int symlink( + long ptr, String target, String linkpath, int mode, + boolean overwrite); + + private static native int open( + long ptr, String path, String mode, int numReplicas, + int numStripes, int numRecoveryStripes, int stripeSize, + int stripedType, int createMode); + + private static native int exists(long ptr, String path); + + private static native int isFile(long ptr, String path); + + private static native int isDirectory(long ptr, String path); + + private static native long filesize(long ptr, String path); + + private static native long setDefaultIoBufferSize(long ptr, long size); + + private static native long getDefaultIoBufferSize(long ptr); + + private static native long setDefaultReadAheadSize(long ptr, long size); + + private static native long getDefaultReadAheadSize(long ptr); + + private static native long setIoBufferSize(long ptr, int fd, long size); + + private static native long getIoBufferSize(long ptr, int fd); + + private static native long setReadAheadSize(long ptr, int fd, long size); + + private 
 static native long getReadAheadSize(long ptr, int fd); + + private static native int setUTimes( + long ptr, String path, long mtime_usec, long atime_usec, + long ctime_usec); + + private static native int compareChunkReplicas(long ptr, String path, StringBuffer md5sum); + + // private static native int getStripedType(long ptr, String path); + private static native void setFileAttributeRevalidateTime( + long ptr, int secs); + + private static native int chmod(long ptr, String path, int mode); + + private static native int chmodr(long ptr, String path, int mode); + + private static native int fchmod(long ptr, int fd, int mode); + + private static native int chowns( + long ptr, String path, String user, String group); + + private static native int chownsr( + long ptr, String path, String user, String group); + + private static native int chown( + long ptr, String path, long user, long group); + + private static native int chownr( + long ptr, String path, long user, long group); + + private static native int fchowns( + long ptr, int fd, String user, String group); + + private static native int fchown(long ptr, int fd, long user, long group); + + private static native int setEUserAndEGroup( + long ptr, long user, long group, long[] groups); + + private static native int stat(long ptr, String path, KfsFileAttr attr); + + private static native int lstat(long ptr, String path, KfsFileAttr attr); + + private static native String strerror(long ptr, int err); + + private static native boolean isnotfound(long ptr, int err); + + private static native int close(long ptr, int fd); + + private static native long seek(long ptr, int fd, long offset); + + private static native long tell(long ptr, int fd); + + private static native int setUMask(long ptr, int umask); + + private static native int getUMask(long ptr); + + private static native String createDelegationToken( + long ptr, boolean allowDelegationFlag, long validTime, + KfsDelegation result); + + private static native String renewDelegationToken( + long ptr, KfsDelegation token); + + private static native String cancelDelegationToken( + long ptr, KfsDelegation token); + + private static native String[] getStats(long ptr); + + static { + try { + System.loadLibrary("qfs_access"); + } catch (UnsatisfiedLinkError e) { + throw new RuntimeException( + "Unable to load qfs_access native library", e); + } + } + + protected KfsAccessBase(String configFn) throws IOException { + cPtr = initF(configFn); + if (cPtr == 0) { + throw new IOException("Unable to initialize KFS Client"); + } + } + + protected KfsAccessBase(String metaServerHost, + int metaServerPort) throws IOException { + cPtr = initS(metaServerHost, metaServerPort); + if (cPtr == 0) { + throw new IOException("Unable to initialize KFS Client"); + } + } + + // most calls wrap to a call on the KfsClient. For return values, + // see the comments in libkfsClient/KfsClient.h + // + final public int kfs_cd(String path) { + return cd(cPtr, path); + } + + // make the directory hierarchy for path + final public int kfs_mkdirs(String path) { + return mkdirs(cPtr, path, 0777); + } + + // make the directory hierarchy for path + final public int kfs_mkdirs(String path, int mode) { + return mkdirs(cPtr, path, mode); + } + + // make a single directory; the parent directory must already exist + final public int kfs_mkdir(String path, int mode) { + return mkdir(cPtr, path, mode); + } + + // remove the directory specified by path; remove will succeed only if path is empty.
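+ // (use kfs_rmdirs below to remove a directory together with its contents)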
+ final public int kfs_rmdir(String path) { + return rmdir(cPtr, path); + } + + // remove the directory tree specified by path, deleting the directory + // and everything underneath it + final public int kfs_rmdirs(String path) { + return rmdirs(cPtr, path); + } + + final public String[] kfs_readdir(String path) { + return kfs_readdir(path, false); + } + + final public String[] kfs_readdir(String path, boolean prefetchAttr) { + return readdir(cPtr, path, prefetchAttr); + } + + final public class DirectoryIterator { + + public long modificationTime; + public long attrChangeTime; + public long creationTime; + public long filesize; + public int replication; + public boolean isDirectory; + public int numStripes; + public int numRecoveryStripes; + public int striperType; + public int stripeSize; + public byte minSTier; + public byte maxSTier; + public String filename; + public long owner; + public long group; + public int mode; + public String ownerName; + public String groupName; + public long dirCount; + public long fileCount; + public long chunkCount; + public long fileId; + public int extAttrTypes; + public String extAttrs; + + private KfsInputChannel input; + private ByteBuffer buf; + private int limit; + private final CharsetDecoder decoder = Charset.forName("UTF-8") + .newDecoder() + .onMalformedInput(CodingErrorAction.REPLACE) + .onUnmappableCharacter(CodingErrorAction.REPLACE); + private long prevOwner; + private long prevGroup; + + public DirectoryIterator(String path) throws IOException { + final int fd = open(cPtr, path, "opendir", 0, 0, 0, 0, 0, 0); + kfs_retToIOException(fd, path); + input = null; + try { + input = new KfsInputChannel(KfsAccessBase.this, fd); + } finally { + if (input == null) { + KfsAccessBase.close(cPtr, fd); + } + } + } + + private String readString(int len) throws IOException { + if (len <= 0) { + return ""; + } + final int end = buf.position() + len; + buf.limit(end); + final String str = decoder.reset().decode(buf).toString(); + buf.position(end).limit(limit); + return str; + } + + private void skip(int len) throws IOException { + if (len > 0) { + buf.position(buf.position() + len); + } + } + + public boolean next() throws IOException { + for (;;) { + if (buf == null || !buf.hasRemaining()) { + if (input == null) { + return false; + } + buf = input.readNext(); + limit = buf == null ? 0 : buf.limit(); + if (limit <= 0) { + close(); + return false; + } + } + modificationTime = buf.getLong(); + attrChangeTime = buf.getLong(); + creationTime = buf.getLong(); + filesize = buf.getLong(); + replication = buf.getInt(); + final int nameLen = buf.getInt(); + isDirectory = buf.get() != 0; + numStripes = buf.getInt(); + numRecoveryStripes = buf.getInt(); + striperType = buf.getInt(); + stripeSize = buf.getInt(); + owner = buf.getInt(); + group = buf.getInt(); + mode = buf.getShort(); + fileId = buf.getLong(); + fileCount = isDirectory ? buf.getLong() : 0; + dirCount = isDirectory ? buf.getLong() : 0; + chunkCount = isDirectory ? 0 : buf.getLong(); + minSTier = buf.get(); + maxSTier = buf.get(); + final int onameLen = buf.getInt(); + final int gnameLen = buf.getInt(); + extAttrTypes = buf.getInt(); + final int exAtLen + = KfsFileAttr.KFS_FILE_ATTR_EXT_TYPE_NONE == extAttrTypes + ?
0 : buf.getInt(); + owner &= 0xFFFFFFFFL; + group &= 0xFFFFFFFFL; + mode &= 0xFFFF; + filename = readString(nameLen); + if (owner == prevOwner && ownerName != null) { + skip(onameLen); + } else { + prevOwner = owner; + ownerName = readString(onameLen); + } + if (group == prevGroup && groupName != null) { + skip(gnameLen); + } else { + prevGroup = group; + groupName = readString(gnameLen); + } + extAttrs = 0 < exAtLen ? readString(exAtLen) : null; + if (nameLen > 0) { + break; + } + } + return true; + } + + public void close() { + if (input == null) { + return; + } + buf = null; + try { + input.close(); + } catch (IOException ignored) { + } + input = null; + } + } + + final public KfsFileAttr[] kfs_readdirplus(String path) { + DirectoryIterator itr = null; + try { + itr = new DirectoryIterator(path); + final ArrayList ret = new ArrayList(); + while (itr.next()) { + KfsFileAttr entry = new KfsFileAttr(); + entry.modificationTime = itr.modificationTime; + entry.attrChangeTime = itr.attrChangeTime; + entry.creationTime = itr.creationTime; + entry.filesize = itr.filesize; + entry.replication = itr.replication; + entry.isDirectory = itr.isDirectory; + entry.filename = itr.filename; + entry.numStripes = itr.numStripes; + entry.numRecoveryStripes = itr.numRecoveryStripes; + entry.striperType = itr.striperType; + entry.stripeSize = itr.stripeSize; + entry.owner = itr.owner; + entry.group = itr.group; + entry.mode = itr.mode; + entry.ownerName = itr.ownerName; + entry.groupName = itr.groupName; + entry.fileId = itr.fileId; + entry.dirCount = itr.dirCount; + entry.fileCount = itr.fileCount; + entry.chunkCount = itr.chunkCount; + entry.minSTier = itr.minSTier; + entry.maxSTier = itr.maxSTier; + entry.extAttrTypes = itr.extAttrTypes; + entry.extAttrs = itr.extAttrs; + ret.add(entry); + } + return ret.toArray(new KfsFileAttr[0]); + } catch (IOException ex) { + // System.err.println("kfs_readdirplus " + ex); + } finally { + if (itr != null) { + itr.close(); + } + } + return null; + } + + final public KfsOutputChannel kfs_append(String path) { + return kfs_append(path, DEFAULT_APPEND_REPLICATION); + } + + final public KfsOutputChannel kfs_append(String path, int numReplicas) { + try { + return kfs_append_ex(path, numReplicas, 0666); + } catch (IOException ex) { + return null; + } + } + + final public KfsOutputChannel kfs_append_ex(String path, int numReplicas, + int mode) throws IOException { + final int fd = open(cPtr, path, "a", + numReplicas > 0 + ? 
 numReplicas : DEFAULT_APPEND_REPLICATION, + 0, 0, 0, KfsFileAttr.STRIPED_FILE_TYPE_NONE, mode); + kfs_retToIOException(fd, path); + KfsOutputChannel chan = null; + try { + final boolean append = true; + chan = new KfsOutputChannel(this, fd, append); + } finally { + if (chan == null) { + close(cPtr, fd); + } + } + return chan; + } + + final public KfsOutputChannel kfs_create(String path) { + return kfs_create(path, DEFAULT_REPLICATION); + } + + final public KfsOutputChannel kfs_create(String path, int numReplicas) { + return kfs_create(path, numReplicas, false); + } + + // if exclusive is specified, then create will succeed only if the + // file doesn't already exist + final public KfsOutputChannel kfs_create(String path, int numReplicas, + boolean exclusive) { + return kfs_create(path, numReplicas, exclusive, -1, -1); + } + + final public KfsOutputChannel kfs_create(String path, int numReplicas, + boolean exclusive, long bufferSize, long readAheadSize) { + try { + return kfs_create_ex(path, numReplicas, exclusive, + bufferSize, readAheadSize, 0666); + } catch (IOException ex) { + return null; + } + } + + final public KfsOutputChannel kfs_create_ex(String path, int numReplicas, + boolean exclusive, long bufferSize, long readAheadSize, + int mode) throws IOException { + final boolean forceStriperType = false; + return kfs_create_ex( + path, + DEFAULT_REPLICATION, // numReplicas, + exclusive, + bufferSize, + readAheadSize, + DEFAULT_NUM_STRIPES, + DEFAULT_NUM_RECOVERY_STRIPES, + DEFAULT_STRIPE_SIZE, + DEFAULT_STRIPER_TYPE, + forceStriperType, + mode + ); + } + + final public void kfs_close(int fd) throws IOException { + kfs_retToIOException(close(cPtr, fd)); + } + + final public KfsOutputChannel kfs_create_ex(String path, int numReplicas, + boolean exclusive, long bufferSize, long readAheadSize, + int numStripes, int numRecoveryStripes, int stripeSize, + int stripedType, boolean forceType, int mode) throws IOException { + int minSTier = 15; + int maxSTier = 15; + return kfs_create_ex(path, numReplicas, exclusive, bufferSize, + readAheadSize, numStripes, numRecoveryStripes, stripeSize, + stripedType, forceType, mode, minSTier, maxSTier); + } + + final public KfsOutputChannel kfs_create_ex(String path, int numReplicas, + boolean exclusive, long bufferSize, long readAheadSize, + int numStripes, int numRecoveryStripes, int stripeSize, + int stripedType, boolean forceType, int mode, int minSTier, + int maxSTier) throws IOException { + final int fd = create(cPtr, path, numReplicas, exclusive, + numStripes, numRecoveryStripes, stripeSize, stripedType, forceType, + mode, minSTier, maxSTier); + kfs_retToIOException(fd, path); + if (bufferSize >= 0) { + setIoBufferSize(cPtr, fd, bufferSize); + } + if (readAheadSize >= 0) { + setReadAheadSize(cPtr, fd, readAheadSize); + } + KfsOutputChannel chan = null; + try { + final boolean append = false; + chan = new KfsOutputChannel(this, fd, append); + } finally { + if (chan == null) { + close(cPtr, fd); + } + } + return chan; + } + + final public KfsOutputChannel kfs_create_ex(String path, boolean exclusive, + String createParams, int mode, + boolean forceTypeFlag) throws IOException { + return kfs_create_ex_fd(create2ex(cPtr, path, exclusive, createParams, + mode, forceTypeFlag), path); + } + + final public KfsOutputChannel kfs_create_ex(String path, boolean exclusive, + String createParams) throws IOException { + return kfs_create_ex_fd(create2(cPtr, path, exclusive, createParams), + path); + } + + private KfsOutputChannel kfs_create_ex_fd(int fd, + String path)
throws IOException { + kfs_retToIOException(fd, path); + KfsOutputChannel chan = null; + try { + final boolean append = false; + chan = new KfsOutputChannel(this, fd, append); + } finally { + if (chan == null) { + close(cPtr, fd); + } + } + return chan; + } + + private int kfs_open_ro(String path) { + return open(cPtr, path, "r", + DEFAULT_REPLICATION, + DEFAULT_NUM_STRIPES, + DEFAULT_NUM_RECOVERY_STRIPES, + DEFAULT_STRIPE_SIZE, + DEFAULT_STRIPER_TYPE, + 0 + ); + } + + final public KfsInputChannel kfs_open(String path) { + return kfs_open(path, -1, -1); + } + + final public KfsInputChannel kfs_open(String path, long bufferSize, + long readAheadSize) { + try { + return kfs_open_ex(path, bufferSize, readAheadSize); + } catch (IOException ex) { + return null; + } + } + + final public KfsInputChannel kfs_open_ex(String path, long bufferSize, + long readAheadSize) throws IOException { + final int fd = kfs_open_ro(path); + kfs_retToIOException(fd, path); + if (bufferSize >= 0) { + setIoBufferSize(cPtr, fd, bufferSize); + } + if (readAheadSize >= 0) { + setReadAheadSize(cPtr, fd, readAheadSize); + } + KfsInputChannel chan = null; + try { + chan = new KfsInputChannel(this, fd); + } finally { + if (chan == null) { + close(cPtr, fd); + } + } + return chan; + } + + final public int kfs_remove(String path) { + return remove(cPtr, path); + } + + final public int kfs_rename(String oldpath, String newpath) { + return rename(cPtr, oldpath, newpath, true); + } + + // if overwrite is turned off, rename will succeed only if newpath + // doesn't already exist + final public int kfs_rename(String oldpath, String newpath, + boolean overwrite) { + return rename(cPtr, oldpath, newpath, overwrite); + } + + final public int kfs_symlink(String target, String linkpath, int mode, + boolean overwrite) { + return symlink(cPtr, target, linkpath, mode, overwrite); + } + + final public boolean kfs_exists(String path) { + return exists(cPtr, path) == 1; + } + + final public boolean kfs_isFile(String path) { + return isFile(cPtr, path) == 1; + } + + final public boolean kfs_isDirectory(String path) { + return isDirectory(cPtr, path) == 1; + } + + final public long kfs_filesize(String path) { + return filesize(cPtr, path); + } + + // Given a starting byte offset and a length, return the location(s) + // of all the chunks that cover the region. + final public String[][] kfs_getDataLocation(String path, long start, + long len) { + return getDataLocation(cPtr, path, start, len); + } + + // Given a starting byte offset and a length, return the location(s) + // of all "chunk blocks" that cover the region. 
+ // The first entry always contains "chunk block" size in hex notation with + // leading 0 omitted, or if negative as status code, which can be converted + // into exceptions with kfs_retToIOException() + final public String[][] kfs_getBlocksLocation(String path, long start, + long len) { + final String[][] ret = getBlocksLocation(cPtr, path, start, len); + if (ret == null) { + throw new OutOfMemoryError(); + } + if (ret.length < 1 || ret[0].length != 1) { + throw new Error("getBlocksLocation internal error"); + } + return ret; + } + + // Return the degree of replication for this file + final public short kfs_getReplication(String path) { + return getReplication(cPtr, path); + } + + // Request a change in the degree of replication for this file + // Returns the value that was set by the server for this file + final public short kfs_setReplication(String path, int numReplicas) { + return setReplication(cPtr, path, numReplicas); + } + + final public long kfs_getModificationTime(String path) { + return getModificationTime(cPtr, path); + } + + final public int kfs_setModificationTime(String path, long time) { + return kfs_setUTimes(path, time * 1000, + SET_TIME_TIME_NOT_VALID, SET_TIME_TIME_NOT_VALID); + } + + final public int kfs_setUTimes(String path, long mtimeUsec, + long atimeUsec, long ctimeUsec) { + return setUTimes(cPtr, path, mtimeUsec, atimeUsec, ctimeUsec); + } + + final public boolean kfs_compareChunkReplicas( + String path, StringBuffer md5sum) throws IOException { + final int ret = compareChunkReplicas(cPtr, path, md5sum); + kfs_retToIOException(ret); + return ret == 0; + } + + final public long kfs_setDefaultIoBufferSize(long size) { + return setDefaultIoBufferSize(cPtr, size); + } + + final public long kfs_getDefaultIoBufferSize(long ptr) { + return getDefaultIoBufferSize(cPtr); + } + + final public long kfs_setDefaultReadAheadSize(long size) { + return setDefaultReadAheadSize(cPtr, size); + } + + final public long kfs_getDefaultReadAheadSize(long ptr) { + return getDefaultReadAheadSize(cPtr); + } + + final public long kfs_setIoBufferSize(int fd, long size) { + return setIoBufferSize(cPtr, fd, size); + } + + final public long kfs_getIoBufferSize(int fd) { + return getIoBufferSize(cPtr, fd); + } + + final public long kfs_setReadAheadSize(int fd, long size) { + return setReadAheadSize(cPtr, fd, size); + } + + final public long kfs_getReadAheadSize(int fd) { + return getReadAheadSize(cPtr, fd); + } + + final public void kfs_setFileAttributeRevalidateTime(int secs) { + setFileAttributeRevalidateTime(cPtr, secs); + } + + final public int kfs_chmod(String path, int mode) { + return chmod(cPtr, path, mode); + } + + final public int kfs_chmodr(String path, int mode) { + return chmodr(cPtr, path, mode); + } + + final public int kfs_chmod(int fd, int mode) { + return fchmod(cPtr, fd, mode); + } + + final public int kfs_chown(String path, String user, String group) { + return chowns(cPtr, path, user, group); + } + + final public int kfs_chownr(String path, String user, String group) { + return chownsr(cPtr, path, user, group); + } + + final public int kfs_chown(String path, long user, long group) { + return chown(cPtr, path, user, group); + } + + final public int kfs_chownr(String path, long user, long group) { + return chownr(cPtr, path, user, group); + } + + final public int kfs_chown(int fd, String user, String group) { + return fchowns(cPtr, fd, user, group); + } + + final public int kfs_chown(int fd, long user, long group) { + return fchown(cPtr, fd, user, group); + } + + final 
public int kfs_setEUserAndEGroup(long user, long group, long[] groups) { + return setEUserAndEGroup(cPtr, user, group, groups); + } + + final public int kfs_stat(String path, KfsFileAttr attr) { + return stat(cPtr, path, attr); + } + + final public int kfs_lstat(String path, KfsFileAttr attr) { + return lstat(cPtr, path, attr); + } + + final public void kfs_retToIOException(int ret) throws IOException { + kfs_retToIOException(ret, null); + } + + final public void kfs_retToIOException(int ret, + String path) throws IOException { + if (ret >= 0) { + return; + } + final String es = strerror(cPtr, ret); + if (es == null) { + throw new OutOfMemoryError(); + } + final String msg = path == null ? es : path + ": " + es; + if (isnotfound(cPtr, ret)) { + throw new FileNotFoundException(msg); + } + throw new IOException(msg); + } + + final public long kfs_seek(int fd, long offset) throws IOException { + final long ret = seek(cPtr, fd, offset); + if (ret < 0) { + kfs_retToIOException((int) ret); + } + return ret; + } + + final public long kfs_tell(int fd) throws IOException { + final long ret = tell(cPtr, fd); + if (ret < 0) { + kfs_retToIOException((int) ret); + } + return ret; + } + + final public int kfs_setUMask(int umask) throws IOException { + final int ret = setUMask(cPtr, umask); + if (ret < 0) { + kfs_retToIOException((int) ret); + } + return ret; + } + + final public int kfs_getUMask() throws IOException { + final int ret = getUMask(cPtr); + if (ret < 0) { + kfs_retToIOException((int) ret); + } + return ret; + } + + final public KfsDelegation kfs_createDelegationToken( + boolean allowDelegationFlag, long validTime) throws IOException { + final KfsDelegation result = new KfsDelegation(); + final String error = createDelegationToken( + cPtr, allowDelegationFlag, validTime, result); + if (error != null) { + throw new IOException(error); + } + if (result.key == null || result.token == null) { + throw new OutOfMemoryError(); + } + return result; + } + + final public void kfs_renewDelegationToken( + KfsDelegation token) throws IOException { + final String error = renewDelegationToken(cPtr, token); + if (error != null) { + throw new IOException(error); + } + if (token.key == null || token.token == null) { + throw new OutOfMemoryError(); + } + } + + final public void kfs_cancelDelegationToken( + KfsDelegation token) throws IOException { + final String error = cancelDelegationToken(cPtr, token); + if (error != null) { + throw new IOException(error); + } + } + + final public Map kfs_getStats() throws IOException { + final String[] stats = getStats(cPtr); + if (stats == null) { + throw new IOException("internal error: null stats array"); + } + if (stats.length % 2 != 0) { + throw new IOException( + "internal error: invalid stats array size: " + + stats.length); + } + final Map ret = new TreeMap(); + for (int i = 0; i < stats.length; i += 2) { + ret.put(stats[i], stats[i + 1]); + } + return ret; + } + + final protected void kfs_destroy() { + if (cPtr != 0) { + final long ptr = cPtr; + cPtr = 0; + destroy(ptr); + } + } + + final long getCPtr() { + return cPtr; + } +} diff --git a/src/java/qfs-access/src/main/java/com/quantcast/qfs/access/KfsInputChannel.java b/src/java/qfs-access/src/main/java/com/quantcast/qfs/access/KfsInputChannel.java index be846eefb..5da9c2a8c 100644 --- a/src/java/qfs-access/src/main/java/com/quantcast/qfs/access/KfsInputChannel.java +++ b/src/java/qfs-access/src/main/java/com/quantcast/qfs/access/KfsInputChannel.java @@ -1,11 +1,11 @@ /** * $Id$ * - * Created 2007/09/11 + * 
Created 2025/04/20 * - * @author: Sriram Rao (Kosmix Corp.) + * @author: Mike Ovsiannikov (Quantcast Corporation) * - * Copyright 2008-2012,2016 Quantcast Corporation. All rights reserved. + * Copyright 2025 Quantcast Corporation. All rights reserved. * Copyright 2007 Kosmix Corp. * * This file is part of Kosmos File System (KFS). @@ -21,186 +21,33 @@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. See the License for the specific language governing * permissions and limitations under the License. - * - * \brief An input channel that does buffered I/O. This is to reduce - * the overhead of JNI calls. + * + * \brief Input channel java 9 style cleanup. */ - package com.quantcast.qfs.access; import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.channels.ReadableByteChannel; - -/* A byte channel interface with seek support */ -final public class KfsInputChannel implements ReadableByteChannel, Positionable -{ - // To get to a byte-buffer from the C++ side as a pointer, need - // the buffer to be direct memory backed buffer. So, allocate one - // for reading/writing. - private ByteBuffer readBuffer; - private int kfsFd = -1; - private KfsAccess kfsAccess; - private boolean isReadAheadOff = false; - - private final static native - int read(long cPtr, int fd, ByteBuffer buf, int begin, int end); - - KfsInputChannel(KfsAccess ka, int fd) - { - readBuffer = BufferPool.getInstance().getBuffer(); - readBuffer.flip(); - - kfsFd = fd; - kfsAccess = ka; - } - - public synchronized boolean isOpen() - { - return kfsFd >= 0; - - } - - // Read/write from the specified fd. The basic model is: - // -- fill some data into a direct mapped byte buffer - // -- send/receive to the other side (Jave->C++ or vice-versa) - // - public synchronized int read(ByteBuffer dst) throws IOException - { - if (kfsFd < 0) { - throw new IOException("File closed"); - } - final int r0 = dst.remaining(); - - // While the dst buffer has space for more data, fill - while (dst.hasRemaining()) { - // Fill input buffer if it's empty - if (!readBuffer.hasRemaining()) { - readBuffer.clear(); - readDirect(readBuffer, dst.remaining()); - readBuffer.flip(); - - // If we failed to get anything, call that EOF - if (!readBuffer.hasRemaining()) { - break; - } - } - - // Save end of input buffer - final int lim = readBuffer.limit(); - - // If dst buffer can't contain all of input buffer, limit - // our copy size. - if (dst.remaining() < readBuffer.remaining()) { - readBuffer.limit(readBuffer.position() + dst.remaining()); - } - // Copy into dst buffer - dst.put(readBuffer); - - // Restore end of input buffer marker (maybe changed - // earlier) - readBuffer.limit(lim); - } +import java.lang.ref.Cleaner; - // If we copied anything into the dst buffer (or if there was - // no space available to do so), return the number of bytes - // copied. Otherwise return -1 to indicate EOF. - final int r1 = dst.remaining(); - if (r1 < r0 || r0 == 0) { - return r0 - r1; - } - return -1; - } +final public class KfsInputChannel extends KfsInputChannelBase { - ByteBuffer readNext() throws IOException - { - readBuffer.clear(); - readDirect(readBuffer, 0); - readBuffer.flip(); - return readBuffer; - } - - private void readDirect(ByteBuffer buf, int remRequestedBytes) throws IOException - { - if (!buf.isDirect()) { - throw new IllegalArgumentException("need direct buffer"); - } - final int pos = buf.position(); - final int end = (isReadAheadOff && remRequestedBytes > 0) ? 
- Math.min(buf.limit(), pos + remRequestedBytes) : buf.limit(); - final int sz = read(kfsAccess.getCPtr(), kfsFd, buf, pos, end); - kfsAccess.kfs_retToIOException(sz); - buf.position(pos + sz); - } - - // is modeled after the seek of Java's RandomAccessFile; offset is - // the offset from the beginning of the file. - public synchronized long seek(long offset) throws IOException - { - if (offset < 0) { - throw new IllegalArgumentException("seek(" + kfsFd + "," + offset + ")"); - } - if (kfsFd < 0) { - throw new IOException("File closed"); - } - readBuffer.clear(); - readBuffer.flip(); - return kfsAccess.kfs_seek(kfsFd, offset); - } + final private Cleaner.Cleanable cleanable; - public synchronized long tell() throws IOException - { - if (kfsFd < 0) { - throw new IOException("File closed"); - } - // we keep some data buffered; so, we ask the C++ side where - // we are in the file and offset that by the amount in our - // buffer - final long ret = kfsAccess.kfs_tell(kfsFd); - final int rem = readBuffer.remaining(); - if (ret < rem) { - throw new RuntimeException("KFS internal error: pos: " + ret + - " less than buffered: " + rem); - } - return ret - rem; + KfsInputChannel(KfsAccessBase ka, int fd) { + super(ka, fd); + cleanable = registerCleanup(); } - public synchronized void close() throws IOException - { - if (kfsFd < 0) { - return; - } - final int fd = kfsFd; - kfsFd = -1; - final KfsAccess ka = kfsAccess; - kfsAccess = null; - try { - ka.kfs_close(fd); - } finally { - BufferPool.getInstance().releaseBuffer(readBuffer); - readBuffer = null; - } + private Cleaner.Cleanable registerCleanup() { + return KfsAccess.registerCleanup(this, state); } - protected void finalize() throws Throwable - { + @Override + public void close() throws IOException { try { - if (kfsFd >= 0 && kfsAccess != null) { - final int fd = kfsFd; - kfsFd = -1; - final KfsAccess ka = kfsAccess; - kfsAccess = null; - ka.kfs_close(fd); - } + super.close(); } finally { - super.finalize(); - } - } - - public void setReadAheadSize(long readAheadSize) { - if(readAheadSize >= 0) { - kfsAccess.kfs_setReadAheadSize(kfsFd, readAheadSize); - isReadAheadOff = readAheadSize == 0; + cleanable.clean(); } } } diff --git a/src/java/qfs-access/src/main/java/com/quantcast/qfs/access/KfsInputChannelBase.java b/src/java/qfs-access/src/main/java/com/quantcast/qfs/access/KfsInputChannelBase.java new file mode 100644 index 000000000..b748fcc17 --- /dev/null +++ b/src/java/qfs-access/src/main/java/com/quantcast/qfs/access/KfsInputChannelBase.java @@ -0,0 +1,205 @@ +/** + * $Id$ + * + * Created 2007/09/11 + * + * @author: Sriram Rao (Kosmix Corp.) + * + * Copyright 2008-2012,2016 Quantcast Corporation. All rights reserved. + * Copyright 2007 Kosmix Corp. + * + * This file is part of Kosmos File System (KFS). + * + * Licensed under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. See the License for the specific language governing + * permissions and limitations under the License. + * + * \brief An input channel that does buffered I/O. This is to reduce + * the overhead of JNI calls. 
+ */ +package com.quantcast.qfs.access; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.channels.ReadableByteChannel; + +/* A byte channel interface with seek support */ +class KfsInputChannelBase implements ReadableByteChannel, Positionable { + + // To get to a byte-buffer from the C++ side as a pointer, need + // the buffer to be a direct memory backed buffer. So, allocate one + // for reading/writing. + final protected static class State implements Runnable { + + ByteBuffer readBuffer; + int kfsFd; + KfsAccessBase kfsAccess; + + State(KfsAccessBase ka, int fd) { + readBuffer = BufferPool.getInstance().getBuffer(); + readBuffer.flip(); + kfsFd = fd; + kfsAccess = ka; + } + + void releaseBuffer() { + if (readBuffer != null) { + BufferPool.getInstance().releaseBuffer(readBuffer); + readBuffer = null; + } + } + + void release() throws IOException { + if (kfsFd >= 0 && kfsAccess != null) { + final int fd = kfsFd; + kfsFd = -1; + final KfsAccessBase ka = kfsAccess; + kfsAccess = null; + try { + ka.kfs_close(fd); + } finally { + releaseBuffer(); + } + } + } + + @Override + public void run() { + try { + release(); + } catch (IOException ignored) { + // Ignore + } + } + } + protected final State state; + private boolean isReadAheadOff = false; + + private static native int read(long cPtr, int fd, ByteBuffer buf, int begin, int end); + + protected KfsInputChannelBase(KfsAccessBase ka, int fd) { + state = new State(ka, fd); + } + + final public synchronized boolean isOpen() { + return state.kfsFd >= 0; + } + + // Read/write from the specified fd. The basic model is: + // -- fill some data into a direct mapped byte buffer + // -- send/receive to the other side (Java->C++ or vice-versa) + // + final public synchronized int read(ByteBuffer dst) throws IOException { + if (state.kfsFd < 0) { + throw new IOException("File closed"); + } + final int r0 = dst.remaining(); + + // While the dst buffer has space for more data, fill + while (dst.hasRemaining()) { + // Fill input buffer if it's empty + if (!state.readBuffer.hasRemaining()) { + state.readBuffer.clear(); + readDirect(state.readBuffer, dst.remaining()); + state.readBuffer.flip(); + + // If we failed to get anything, call that EOF + if (!state.readBuffer.hasRemaining()) { + break; + } + } + + // Save end of input buffer + final int lim = state.readBuffer.limit(); + + // If dst buffer can't contain all of input buffer, limit + // our copy size. + if (dst.remaining() < state.readBuffer.remaining()) { + state.readBuffer.limit(state.readBuffer.position() + dst.remaining()); + } + // Copy into dst buffer + dst.put(state.readBuffer); + + // Restore end of input buffer marker (maybe changed + // earlier) + state.readBuffer.limit(lim); + } + + // If we copied anything into the dst buffer (or if there was + // no space available to do so), return the number of bytes + // copied. Otherwise return -1 to indicate EOF. + final int r1 = dst.remaining(); + if (r1 < r0 || r0 == 0) { + return r0 - r1; + } + return -1; + } + + final ByteBuffer readNext() throws IOException { + state.readBuffer.clear(); + readDirect(state.readBuffer, 0); + state.readBuffer.flip(); + return state.readBuffer; + } + + private void readDirect(ByteBuffer buf, int remRequestedBytes) throws IOException { + if (!buf.isDirect()) { + throw new IllegalArgumentException("need direct buffer"); + } + final int pos = buf.position(); + final int end = (isReadAheadOff && remRequestedBytes > 0) + ?
 Math.min(buf.limit(), pos + remRequestedBytes) : buf.limit(); + final int sz = read(state.kfsAccess.getCPtr(), state.kfsFd, buf, pos, end); + state.kfsAccess.kfs_retToIOException(sz); + buf.position(pos + sz); + } + + // is modeled after the seek of Java's RandomAccessFile; offset is + // the offset from the beginning of the file. + final public synchronized long seek(long offset) throws IOException { + if (offset < 0) { + throw new IllegalArgumentException("seek(" + state.kfsFd + "," + offset + ")"); + } + if (state.kfsFd < 0) { + throw new IOException("File closed"); + } + state.readBuffer.clear(); + state.readBuffer.flip(); + return state.kfsAccess.kfs_seek(state.kfsFd, offset); + } + + public synchronized long tell() throws IOException { + if (state.kfsFd < 0) { + throw new IOException("File closed"); + } + // we keep some data buffered; so, we ask the C++ side where + // we are in the file and offset that by the amount in our + // buffer + final long ret = state.kfsAccess.kfs_tell(state.kfsFd); + final int rem = state.readBuffer.remaining(); + if (ret < rem) { + throw new RuntimeException("KFS internal error: pos: " + ret + + " less than buffered: " + rem); + } + return ret - rem; + } + + public synchronized void close() throws IOException { + state.release(); + } + + final public void setReadAheadSize(long readAheadSize) { + if (readAheadSize >= 0) { + state.kfsAccess.kfs_setReadAheadSize(state.kfsFd, readAheadSize); + isReadAheadOff = readAheadSize == 0; + } + } +} diff --git a/src/java/qfs-access/src/main/java/com/quantcast/qfs/access/KfsOutputChannel.java b/src/java/qfs-access/src/main/java/com/quantcast/qfs/access/KfsOutputChannel.java index 7a6350914..c9e06fbd1 100644 --- a/src/java/qfs-access/src/main/java/com/quantcast/qfs/access/KfsOutputChannel.java +++ b/src/java/qfs-access/src/main/java/com/quantcast/qfs/access/KfsOutputChannel.java @@ -1,11 +1,11 @@ /** * $Id$ * - * Created 2007/09/11 + * Created 2025/04/20 * - * @author: Sriram Rao (Kosmix Corp.) + * @author: Mike Ovsiannikov (Quantcast Corporation) * - * Copyright 2008-2012,2016 Quantcast Corporation. All rights reserved. + * Copyright 2025 Quantcast Corporation. All rights reserved. * Copyright 2007 Kosmix Corp. * * This file is part of Kosmos File System (KFS). @@ -21,236 +21,33 @@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. See the License for the specific language governing * permissions and limitations under the License. - * - * \brief An output channel that does buffered I/O. This is to reduce - * the overhead of JNI calls. + * + * \brief Output channel java 9 style cleanup. */ - package com.quantcast.qfs.access; import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.channels.WritableByteChannel; - -public class KfsOutputChannel implements WritableByteChannel, Positionable -{ - // To get to a byte-buffer from the C++ side as a pointer, need - // the buffer to be direct memory backed buffer. So, allocate one - // for reading/writing.
- private ByteBuffer writeBuffer; - private int kfsFd = -1; - private KfsAccess kfsAccess; - private final boolean append; - private boolean returnBufferToPool; - - private final static native - int write(long ptr, int fd, ByteBuffer buf, int begin, int end); - - private final static native - int atomicRecordAppend(long ptr, int fd, ByteBuffer buf, int begin, int end); - - private final static native - int sync(long ptr, int fd); - - KfsOutputChannel(KfsAccess kfsAccess, int fd, boolean append) - { - this.writeBuffer = BufferPool.getInstance().getBuffer(); - this.returnBufferToPool = true; - this.writeBuffer.clear(); - this.append = append; - this.kfsFd = fd; - this.kfsAccess = kfsAccess; - } - - public synchronized boolean isOpen() - { - return kfsFd >= 0; - - } - - // Read/write from the specified fd. The basic model is: - // -- fill some data into a direct mapped byte buffer - // -- send/receive to the other side (Jave->C++ or vice-versa) - // - public synchronized int write(ByteBuffer src) throws IOException - { - if (kfsFd < 0) { - throw new IOException("File closed"); - } - final int r0 = src.remaining(); - // While the src buffer has data, copy it in and flush - while (src.hasRemaining()) { - if (writeBuffer.remaining() < (append ? r0 : 1)) { - syncSelf(); - } - if (append) { - final int spcAvail = writeBuffer.remaining(); - if (r0 > spcAvail) { - final int maxAppendSize = 64 << 10; - if (maxAppendSize < r0) { - throw new IOException( - r0 + " exceeds KFS append size limit of " + - maxAppendSize - ); - } - final ByteBuffer buf = ByteBuffer.allocateDirect( - (r0 + BufferPool.BUFFER_SIZE - 1) / - BufferPool.BUFFER_SIZE * BufferPool.BUFFER_SIZE - ); - releaseBuffer(); - writeBuffer = buf; - } - } - // Save end of input buffer - final int lim = src.limit(); - // Copy in as much data we have space - if (writeBuffer.remaining() < src.remaining()) { - if (append) { - throw new IOException("KFS internal append error" + - " buffer space is not sufficient"); - } - src.limit(src.position() + writeBuffer.remaining()); - } - writeBuffer.put(src); - // restore the limit to what it was - src.limit(lim); - } - return r0 - src.remaining(); - } - - private void writeDirect(ByteBuffer buf) throws IOException - { - if (! buf.isDirect()) { - throw new IllegalArgumentException("need direct buffer"); - } - final int pos = buf.position(); - final int last = buf.limit(); - if (pos < last) { - final int sz = append ? - atomicRecordAppend( - kfsAccess.getCPtr(), kfsFd, writeBuffer, pos, last) : - write( - kfsAccess.getCPtr(), kfsFd, buf, pos, last); - kfsAccess.kfs_retToIOException(sz); - if (pos + sz != last) { - throw new RuntimeException("KFS internal error:" + - (append ? 
"append" : "write") + "(" + - (last - pos) + ") != " + sz); - } - } - buf.clear(); - } - - /** @deprecated Use write() instead */ @Deprecated - public int atomicRecordAppend(ByteBuffer src) throws IOException - { - return write(src); - } +import java.lang.ref.Cleaner; - public synchronized int sync() throws IOException - { - if (kfsFd < 0) { - throw new IOException("File closed"); - } - if (append) { - syncSelf(); - } - return 0; - } +final public class KfsOutputChannel extends KfsOutputChannelBase { - private synchronized void syncSelf() throws IOException - { - // flush everything - writeBuffer.flip(); - boolean restore = true; - try { - writeDirect(writeBuffer); - restore = false; - } finally { - if (restore) { - writeBuffer.flip(); - } - } - } + final private Cleaner.Cleanable cleanable; - // is modeled after the seek of Java's RandomAccessFile; offset is - // the offset from the beginning of the file. - public synchronized long seek(long offset) throws IOException - { - if (kfsFd < 0) { - throw new IOException("File closed"); - } - if (offset < 0) { - throw new IllegalArgumentException("seek(" + kfsFd + ", " + offset + ")"); - } - syncSelf(); - return kfsAccess.kfs_seek(kfsFd, offset); + KfsOutputChannel(KfsAccessBase ka, int fd, boolean append) { + super(ka, fd, append); + cleanable = registerCleanup(); } - public synchronized long tell() throws IOException - { - if (kfsFd < 0) { - throw new IOException("File closed"); - } - // similar issue as read: the position at which we are writing - // needs to be offset by where the C++ code thinks we are and - // how much we have buffered - return kfsAccess.kfs_tell(kfsFd) + writeBuffer.remaining(); + private Cleaner.Cleanable registerCleanup() { + return KfsAccess.registerCleanup(this, state); } - public synchronized void close() throws IOException - { - if (kfsFd < 0) { - throw new IOException("File closed"); - } - IOException origEx = null; + @Override + public void close() throws IOException { try { - syncSelf(); - } catch (IOException ex) { - origEx = ex; + super.close(); } finally { - final int fd = kfsFd; - kfsFd = -1; - KfsAccess ka = kfsAccess; - kfsAccess = null; - try { - ka.kfs_close(fd); - } finally { - releaseBuffer(); - if (origEx != null) { - throw origEx; - } - } - } - } - - private void releaseBuffer() - { - if (returnBufferToPool) { - BufferPool.getInstance().releaseBuffer(writeBuffer); - } - writeBuffer = null; - returnBufferToPool = false; - } - - protected void finalize() throws Throwable - { - try { - if (kfsFd >= 0 && kfsAccess != null) { - final int fd = kfsFd; - kfsFd = -1; - KfsAccess ka = kfsAccess; - kfsAccess = null; - ka.kfs_close(fd); - } - } finally { - super.finalize(); - } - } - - public void setIoBufferSize(long bufferSize) { - if(bufferSize >= 0) { - kfsAccess.kfs_setIoBufferSize(kfsFd, bufferSize); + cleanable.clean(); } } } diff --git a/src/java/qfs-access/src/main/java/com/quantcast/qfs/access/KfsOutputChannelBase.java b/src/java/qfs-access/src/main/java/com/quantcast/qfs/access/KfsOutputChannelBase.java new file mode 100644 index 000000000..5e2923d89 --- /dev/null +++ b/src/java/qfs-access/src/main/java/com/quantcast/qfs/access/KfsOutputChannelBase.java @@ -0,0 +1,267 @@ +/** + * $Id$ + * + * Created 2007/09/11 + * + * @author: Sriram Rao (Kosmix Corp.) + * + * Copyright 2008-2012,2016 Quantcast Corporation. All rights reserved. + * Copyright 2007 Kosmix Corp. + * + * This file is part of Kosmos File System (KFS). 
+ * + * Licensed under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. See the License for the specific language governing + * permissions and limitations under the License. + * + * \brief An output channel that does buffered I/O. This is to reduce + * the overhead of JNI calls. + */ +package com.quantcast.qfs.access; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.channels.WritableByteChannel; + +class KfsOutputChannelBase implements WritableByteChannel, Positionable { + + // To get to a byte-buffer from the C++ side as a pointer, need + // the buffer to be a direct memory backed buffer. So, allocate one + // for reading/writing. + protected final static class State implements Runnable { + + ByteBuffer writeBuffer; + boolean returnBufferToPool; + int kfsFd; + KfsAccessBase kfsAccess; + + State(int fd, KfsAccessBase ka) { + writeBuffer = BufferPool.getInstance().getBuffer(); + returnBufferToPool = true; + writeBuffer.clear(); + kfsFd = fd; + kfsAccess = ka; + } + + void releaseBuffer() { + if (returnBufferToPool) { + BufferPool.getInstance().releaseBuffer(writeBuffer); + } + writeBuffer = null; + returnBufferToPool = false; + } + + void release() throws IOException { + if (kfsFd >= 0 && kfsAccess != null) { + final int fd = kfsFd; + kfsFd = -1; + KfsAccessBase ka = kfsAccess; + kfsAccess = null; + try { + ka.kfs_close(fd); + } finally { + releaseBuffer(); + } + } + } + + @Override + public void run() { + try { + release(); + } catch (IOException ignored) { + // Ignore + } + } + } + final protected State state; + final private boolean append; + + private static native int write( + long ptr, int fd, ByteBuffer buf, int begin, int end); + + private static native int atomicRecordAppend( + long ptr, int fd, ByteBuffer buf, int begin, int end); + + // private static native + // int sync(long ptr, int fd); + KfsOutputChannelBase(KfsAccessBase kfsAccess, int fd, boolean append) { + this.append = append; + this.state = new State(fd, kfsAccess); + } + + final public synchronized boolean isOpen() { + return state.kfsFd >= 0; + } + + // Read/write from the specified fd. The basic model is: + // -- fill some data into a direct mapped byte buffer + // -- send/receive to the other side (Java->C++ or vice-versa) + // + final public synchronized int write(ByteBuffer src) throws IOException { + if (state.kfsFd < 0) { + throw new IOException("File closed"); + } + final int r0 = src.remaining(); + // While the src buffer has data, copy it in and flush + while (src.hasRemaining()) { + if (state.writeBuffer.remaining() < (append ?
r0 : 1)) { + syncSelf(); + } + if (append) { + final int spcAvail = state.writeBuffer.remaining(); + if (r0 > spcAvail) { + final int maxAppendSize = 64 << 10; + if (maxAppendSize < r0) { + throw new IOException( + r0 + " exceeds KFS append size limit of " + + maxAppendSize + ); + } + final ByteBuffer buf = ByteBuffer.allocateDirect( + (r0 + BufferPool.BUFFER_SIZE - 1) + / BufferPool.BUFFER_SIZE * BufferPool.BUFFER_SIZE + ); + releaseBuffer(); + state.writeBuffer = buf; + } + } + // Save end of input buffer + final int lim = src.limit(); + // Copy in as much data we have space + if (state.writeBuffer.remaining() < src.remaining()) { + if (append) { + throw new IOException("KFS internal append error" + + " buffer space is not sufficient"); + } + src.limit(src.position() + state.writeBuffer.remaining()); + } + state.writeBuffer.put(src); + // restore the limit to what it was + src.limit(lim); + } + return r0 - src.remaining(); + } + + private void writeDirect(ByteBuffer buf) throws IOException { + if (!buf.isDirect()) { + throw new IllegalArgumentException("need direct buffer"); + } + final int pos = buf.position(); + final int last = buf.limit(); + if (pos < last) { + final int sz = append + ? atomicRecordAppend( + state.kfsAccess.getCPtr(), + state.kfsFd, state.writeBuffer, pos, last) + : write( + state.kfsAccess.getCPtr(), + state.kfsFd, buf, pos, last); + state.kfsAccess.kfs_retToIOException(sz); + if (pos + sz != last) { + throw new RuntimeException("KFS internal error:" + + (append ? "append" : "write") + "(" + + (last - pos) + ") != " + sz); + } + } + buf.clear(); + } + + /** + * @deprecated Use write() instead + */ + @Deprecated + final public int atomicRecordAppend(ByteBuffer src) throws IOException { + return write(src); + } + + final public synchronized int sync() throws IOException { + if (state.kfsFd < 0) { + throw new IOException("File closed"); + } + if (append) { + syncSelf(); + } + return 0; + } + + private synchronized void syncSelf() throws IOException { + // flush everything + state.writeBuffer.flip(); + boolean restore = true; + try { + writeDirect(state.writeBuffer); + restore = false; + } finally { + if (restore) { + state.writeBuffer.flip(); + } + } + } + + // is modeled after the seek of Java's RandomAccessFile; offset is + // the offset from the beginning of the file. 
+ final public synchronized long seek(long offset) throws IOException { + if (state.kfsFd < 0) { + throw new IOException("File closed"); + } + if (offset < 0) { + throw new IllegalArgumentException( + "seek(" + state.kfsFd + ", " + offset + ")"); + } + syncSelf(); + return state.kfsAccess.kfs_seek(state.kfsFd, offset); + } + + final public synchronized long tell() throws IOException { + if (state.kfsFd < 0) { + throw new IOException("File closed"); + } + // similar issue as read: the position at which we are writing + // needs to be offset by where the C++ code thinks we are and + // how much we have buffered + return state.kfsAccess.kfs_tell( + state.kfsFd) + state.writeBuffer.remaining(); + } + + public synchronized void close() throws IOException { + if (state.kfsFd < 0) { + return; + } + IOException origEx = null; + try { + syncSelf(); + } catch (IOException ex) { + origEx = ex; + } finally { + try { + state.release(); + } catch (IOException ex) { + if (origEx == null) { + origEx = ex; + } + } + } + if (origEx != null) { + throw origEx; + } + } + + private void releaseBuffer() { + state.releaseBuffer(); + } + + final public void setIoBufferSize(long bufferSize) { + if (bufferSize >= 0) { + state.kfsAccess.kfs_setIoBufferSize(state.kfsFd, bufferSize); + } + } +} diff --git a/src/java/qfs-access/src/main/java/com/quantcast/qfs/access/KfsTest.java b/src/java/qfs-access/src/main/java/com/quantcast/qfs/access/KfsTest.java index f4d243cd8..eab2c8f1e 100644 --- a/src/java/qfs-access/src/main/java/com/quantcast/qfs/access/KfsTest.java +++ b/src/java/qfs-access/src/main/java/com/quantcast/qfs/access/KfsTest.java @@ -26,16 +26,17 @@ package com.quantcast.qfs.access; -import java.io.*; -import java.net.*; -import java.util.Random; +import java.io.File; +import java.io.IOException; +import java.nio.ByteBuffer; import java.util.Date; import java.util.Iterator; import java.util.Map; -import java.nio.ByteBuffer; +import java.util.Random; public class KfsTest { + @SuppressWarnings("UseSpecificCatch") public static void main(String args[]) { if (args.length < 1) { System.out.println("Usage: KfsTest "); @@ -45,7 +46,7 @@ public static void main(String args[]) { int port = Integer.parseInt(args[1].trim()); KfsAccess kfsAccess = new KfsAccess(args[0], port); - String basedir = new String("jtest"); + String basedir = "jtest"; final String euidp = System.getProperty("kfs.euid"); final String egidp = System.getProperty("kfs.egid"); final long euid = (euidp != null && euidp.length() > 0) ? 
@@ -135,8 +136,8 @@ public static void main(String args[]) {
             throw new IOException("QFS doesn't think " + basedir +
                 " is a dir!");
         }
-        final String fname = new String("foo.1");
-        final String path = new String(basedir + "/" + fname);
+        final String fname = "foo.1";
+        final String path = basedir + "/" + fname;
         final KfsOutputChannel outputChannel = kfsAccess.kfs_create(path);
         long mTime = kfsAccess.kfs_getModificationTime(path);
@@ -150,8 +151,8 @@ public static void main(String args[]) {
         }
         System.out.println("Readdir returned: ");
-        for (int i = 0; i < entries.length; i++) {
-            System.out.println(entries[i]);
+        for (String entry : entries) {
+            System.out.println(entry);
         }
         final String absent = basedir + "/must not exist";
@@ -179,8 +180,8 @@ public static void main(String args[]) {
         outputChannel.sync();
         outputChannel.close();
-        final String symName = new String("foo.1.sym");
-        final String slPath = new String(basedir + "/" + symName);
+        final String symName = "foo.1.sym";
+        final String slPath = basedir + "/" + symName;
         boolean overwrite = false;
         kfsAccess.kfs_retToIOException(
             kfsAccess.kfs_symlink(path + ".1", slPath, 0777, overwrite),
@@ -229,12 +230,12 @@ public static void main(String args[]) {
             throw new IOException(basedir + ": kfs_readdirplus failed");
         }
         System.out.println("kfs_readdirplus returned: ");
-        for (int i = 0; i < fattr.length; i++) {
-            System.out.println(attrToString(fattr[i], "\n"));
+        for (KfsFileAttr fattr1 : fattr) {
+            System.out.println(attrToString(fattr1, "\n"));
         }
         if ((fattr = kfsAccess.kfs_readdirplus(absent)) != null) {
-            throw new IOException("kfs_readdirplus: " + fattr +
+            throw new IOException("kfs_readdirplus: " + (Object)fattr +
                 ": non null, size: " + fattr.length);
         }
@@ -248,8 +249,8 @@ public static void main(String args[]) {
         System.out.println("Block Locations:");
         for (int i = 0; i < locs.length; i++) {
             System.out.print("chunk " + i + " : ");
-            for (int j = 0; j < locs[i].length; j++) {
-                System.out.print(locs[i][j] + " ");
+            for (String loc : locs[i]) {
+                System.out.print(loc + " ");
             }
             System.out.println();
         }
@@ -268,8 +269,8 @@ public static void main(String args[]) {
         System.out.println("block size: " + blockSize);
         for (int i = 1; i < locs.length; i++) {
             System.out.print("chunk " + (i-1) + " : ");
-            for (int j = 0; j < locs[i].length; j++) {
-                System.out.print(locs[i][j] + " ");
+            for (String loc : locs[i]) {
+                System.out.print(loc + " ");
             }
             System.out.println();
         }
@@ -288,7 +289,7 @@ public static void main(String args[]) {
         System.out.println("stat: \n" + attrToString(attr, "\n"));
         // rename the file
-        String npath = new String(basedir + "/foo.2");
+        String npath = basedir + "/foo.2";
         kfsAccess.kfs_rename(path, npath);
         if (kfsAccess.kfs_exists(path)) {
@@ -332,6 +333,10 @@ public static void main(String args[]) {
         // read some bytes
         buf = new byte[128];
         res = inputChannel.read(ByteBuffer.wrap(buf, 0, 128));
+        if (res != 128) {
+            throw new IOException(
+                npath + ": was able to read only: " + res);
+        }
         s = new String(buf);
         for (int i = 0; i < 128; i++) {
@@ -392,14 +397,14 @@ public static void main(String args[]) {
             kfsAccess.kfs_retToIOException(kfsAccess.kfs_rmdir(basedir));
             System.out.println("All done...Test passed!");
         } catch (Exception e) {
-            e.printStackTrace();
+            e.printStackTrace(System.out);
             System.out.println(e.getMessage());
             System.out.println("Test failed");
             System.exit(1);
         }
     }
-    private static Random randGen = new Random(100);
+    private static final Random randGen = new Random(100);
     private static void generateData(char buf[], int
numBytes) { @@ -554,6 +559,10 @@ private static void testDisableReadAhead(KfsAccess kfsAccess, String baseDir) inputChannel.setReadAheadSize(0); final byte[] dstBuf = new byte[128]; res = inputChannel.read(ByteBuffer.wrap(dstBuf, 0, 128)); + if (res != 128) { + throw new IOException( + filePath + ": was able to read only: " + res); + } s = new String(dstBuf); for (int i = 0; i < 128; i++) { if (dataBuf[i] != s.charAt(i)) { @@ -568,6 +577,10 @@ private static void testDisableReadAhead(KfsAccess kfsAccess, String baseDir) filePath + "failed to seek to byte 512. Pos: " + pos); } res = inputChannel.read(ByteBuffer.wrap(dstBuf, 0, 128)); + if (res != 128) { + throw new IOException( + filePath + ": was able to read only: " + res); + } s = new String(dstBuf); for (int i = 0; i < 128; i++) { if (dataBuf[512+i] != s.charAt(i)) { @@ -584,6 +597,10 @@ private static void testDisableReadAhead(KfsAccess kfsAccess, String baseDir) } inputChannel.setReadAheadSize(1048576); res = inputChannel.read(ByteBuffer.wrap(dstBuf, 0, 128)); + if (res != 128) { + throw new IOException( + filePath + ": was able to read only: " + res); + } s = new String(dstBuf); for (int i = 0; i < 128; i++) { if (dataBuf[i] != s.charAt(i)) { diff --git a/src/test-scripts/run_endurance_mc.sh b/src/test-scripts/run_endurance_mc.sh index 94d2c6441..342d97560 100755 --- a/src/test-scripts/run_endurance_mc.sh +++ b/src/test-scripts/run_endurance_mc.sh @@ -26,36 +26,35 @@ # The logic below expects that 4 directories # /mnt/data{0-3}/ # are available and correspond to 4 physical disks. -# -# - +# +# -bdir=`pwd` +bdir=$(pwd) PATH="/sbin:/usr/sbin:$PATH" export PATH if [ -f ../../CMakeLists.txt ]; then - ssrcdir="`cd ../.. >/dev/null 2>&1 && pwd`" + ssrcdir="$(cd ../.. >/dev/null 2>&1 && pwd)" else - ssrcdir="`cd ../.. >/dev/null 2>&1 && pwd`" + ssrcdir="$(cd ../.. >/dev/null 2>&1 && pwd)" fi -srcdir="`dirname "$0"`" -srcdir="`cd "$srcdir/../.." >/dev/null 2>&1 && pwd`" +srcdir="$(dirname "$0")" +srcdir="$(cd "$srcdir/../.." >/dev/null 2>&1 && pwd)" testdirsprefix='/mnt/data' n=3 while [ $n -ge 0 ]; do [ -d "${testdirsprefix}$n" ] || break - n=`expr $n - 1` + n=$(expr $n - 1) done if [ $n -ge 0 ]; then - testdirsprefix="`basename "$0" .sh`/data" + testdirsprefix="$(basename "$0" .sh)/data" fi chunksdir='./chunks' metasrvchunkport=20100 -metasrvchunkporre="`expr $metasrvchunkport / 10`[0-9]" +metasrvchunkporre="$(expr $metasrvchunkport / 10)[0-9]" chunksrvport=30000 clustername='endurance-test' numchunksrv=3 @@ -88,45 +87,43 @@ myusevalgrind='' chunkdirerrsim=0 chunkdirerrsimall=0 -mkcerts=`dirname "$0"` -mkcerts="`cd "$mkcerts" && pwd`/qfsmkcerts.sh" +mkcerts=$(dirname "$0") +mkcerts="$(cd "$mkcerts" && pwd)/qfsmkcerts.sh" -clientuser=${clientuser-"`id -un`"} +clientuser=${clientuser-"$(id -un)"} chunkserverclithreads=${chunkserverclithreads-3} -objectstorebuffersize=${objectstorebuffersize-`expr 512 \* 1024`} +objectstorebuffersize=${objectstorebuffersize-$(expr 512 \* 1024)} cabundleurl='https://raw.githubusercontent.com/bagder/ca-bundle/master/ca-bundle.crt' cabundlefileos='/etc/pki/tls/certs/ca-bundle.crt' prevlogsdir='prev_logs' vrcount=0 -if openssl version | grep 'SSL 0\.' > /dev/null; then +if openssl version | grep 'SSL 0\.' 
>/dev/null; then auth=${auth-no} else auth=${auth-yes} fi if [ x"$SECONDS" = x ]; then - mystartseconds=`date -u '+%s'` + mystartseconds=$(date -u '+%s') else mystartseconds='' fi -now_seconds() -{ +now_seconds() { if [ x"$mystartseconds" = x ]; then echo $SECONDS else - echo $((`date -u '+%s'` - $mystartseconds)) + echo $(($(date -u '+%s') - $mystartseconds)) fi } -update_parameters() -{ +update_parameters() { clitestdir="${testdirsprefix}3/$USER/test/cli" metasrvdir="${testdirsprefix}3/$USER/test/meta" - wuiport=`expr $metasrvport + 50` - metavrport=`expr $metasrvport + 500` + wuiport=$(expr $metasrvport + 50) + metavrport=$(expr $metasrvport + 500) chunkrundirs="${testdirsprefix}[012]/$USER" chunkbin="$bdir/src/cc/chunk/chunkserver" @@ -137,23 +134,21 @@ update_parameters() metahost=$myhost clientprop="$clitestdir/client.prp" clientproprs="${clientprop}.rs" - certsdir="`dirname "$clitestdir"`/certs" + certsdir="$(dirname "$clitestdir")/certs" objectstoredir="${testdirsprefix}3/$USER/test/object_store" - cabundlefile="`dirname "$objectstoredir"`/ca-bundle.crt" + cabundlefile="$(dirname "$objectstoredir")/ca-bundle.crt" } -kill_all_proc() -{ - if [ x"`uname`" = x'Darwin' ]; then - { find "$@" -type f -print0 | xargs -0 lsof -nt -- | xargs kill -KILL ; } \ +kill_all_proc() { + if [ x"$(uname)" = x'Darwin' ]; then + { find "$@" -type f -print0 | xargs -0 lsof -nt -- | xargs kill -KILL; } \ >/dev/null 2>&1 else - { find "$@" -type f | xargs fuser | xargs kill -KILL ; } >/dev/null 2>&1 + { find "$@" -type f | xargs fuser | xargs kill -KILL; } >/dev/null 2>&1 fi } -retry_cmd() -{ +retry_cmd() { local trycnt=$1 shift local interval=$1 @@ -162,15 +157,14 @@ retry_cmd() "$@" && break [ $trycnt -le 0 ] && return 1 sleep $interval - trycnt=`expr $trycnt - 1` + trycnt=$(expr $trycnt - 1) done return 0 } -show_help() -{ +show_help() { echo \ -"Usage: $0 {-stop|-get-logs|-status}"' + "Usage: $0 {-stop|-get-logs|-status}"' -get-logs -- get names of all log files -stop -- stop (kill -9) all started processes -status [] -- get current status (tail) from the test logs, @@ -200,8 +194,7 @@ show_help() ' } -pre_run_cleanup() -{ +pre_run_cleanup() { [ $# -ne 1 ] && return 1 mkdir -p "$1" || return 1 rm -f "$1"/* 2>/dev/null @@ -210,14 +203,13 @@ pre_run_cleanup() return 0 } -run_with_valgrind() -{ +run_with_valgrind() { if [ x"$myusevalgrind" = x ]; then exec ${1+"$@"} else GLIBCPP_FORCE_NEW=1 \ - GLIBCXX_FORCE_NEW=1 \ - exec valgrind \ + GLIBCXX_FORCE_NEW=1 \ + exec valgrind \ -v \ --log-file=valgrind.log \ --leak-check=full \ @@ -253,14 +245,13 @@ while [ $# -gt 0 ]; do "$clitestdir/fanout/kfanout_test.log" \ "$clitestdir/sortmaster/sortmaster_endurance_test.log" \ "$clitestdir/cp/cptest-"*".log" \ - "$metasrvdir/$fscklog" \ - ; do + "$metasrvdir/$fscklog"; do [ -f "$n" ] || continue - echo "============== `basename "$n"` =================" + echo "============== $(basename "$n") =================" tail -n 7 "$n" done [ x"$2" = x ] && break - [ $2 -le 0 ] && break + [ $2 -le 0 ] && break sleep $2 echo done @@ -302,12 +293,12 @@ while [ $# -gt 0 ]; do elif [ x"$1" = x'-auth' ]; then shift if [ x"$1" = x'on' -o x"$1" = x'ON' -o \ - x"$1" = x'yes' -o x"$1" = x'YES' ]; then + x"$1" = x'yes' -o x"$1" = x'YES' ]; then auth='yes' else auth='no' fi - [ $# -gt 0 ] && shift + [ $# -gt 0 ] && shift elif [ x"$1" = x'-chunk-dir-err-sim-only' ]; then shift mconly='yes' @@ -357,7 +348,7 @@ while [ $# -gt 0 ]; do done if [ $excode -ne 0 ]; then - exit `expr $excode - 1` + exit $(expr $excode - 1) fi if [ $removetestdirs 
-ne 0 ]; then @@ -368,8 +359,8 @@ fi if [ x"$s3test" = x'yes' ]; then if [ x"$QFS_S3_ACCESS_KEY_ID" = x -o \ - x"$QFS_S3_SECRET_ACCESS_KEY" = x -o \ - x"$QFS_S3_BUCKET_NAME" = x ]; then + x"$QFS_S3_SECRET_ACCESS_KEY" = x -o \ + x"$QFS_S3_BUCKET_NAME" = x ]; then echo "environment variables QFS_S3_ACCESS_KEY_ID," \ "QFS_S3_SECRET_ACCESS_KEY," \ "and QFS_S3_BUCKET_NAME must be set accordintly" @@ -384,24 +375,24 @@ for n in "$chunkbin" "$metabin" "$fsckbin" "$adminbin"; do fi done -tdir="`dirname "$testdirsprefix"`" -if [ x"$tdir" = x'.' -a x"`basename "$testdirsprefix"`" = x'.' ]; then +tdir="$(dirname "$testdirsprefix")" +if [ x"$tdir" = x'.' -a x"$(basename "$testdirsprefix")" = x'.' ]; then testdirsprefix="$testdirsprefix/data" fi mkdir -p "$tdir" || exit -tdir="`cd "$tdir" > /dev/null && pwd`" +tdir="$(cd "$tdir" >/dev/null && pwd)" [ x"$tdir" = x ] && exit 1 -testdirsprefix="$tdir/`basename "$testdirsprefix"`" +testdirsprefix="$tdir/$(basename "$testdirsprefix")" for i in 0 1 2 3; do mkdir -p "${testdirsprefix}$i/$USER/test" done -eval `env | awk ' +eval $(env | awk ' BEGIN { FS="="; } /QFS_CLIENT_CONFIG/ { print "unexport " $1; print "unset " $1; - }'` + }') update_parameters @@ -411,11 +402,11 @@ cd "$metasrvdir" || exit if [ $chunkdirerrsimall -lt 0 ]; then cd "$metasrvdir" || exit if [ -f "$metasrvpid" ]; then - cat >> "$metasrvprop" << EOF + cat >>"$metasrvprop" <> "$clientprop" << EOF + cat >>"$clientprop" <> "$clientprop" << EOF + cat >>"$clientprop" <> "$clientprop" << EOF +cat >>"$clientprop" <> "$clientproprs" << EOF +cat >>"$clientproprs" </dev/null fi -ulimit -n `ulimit -Hn` || exit -if [ `ulimit -n` -le 1024 ]; then - echo "Insufficient open file descriptor limit: `ulimit -n`" - exit 1 +ulimit -n $(ulimit -Hn) || exit +mycurlimit=$(ulimit -n) || exit +if [ x"$mycurlimit" != x'unlimited' ]; then + if [ $(ulimit -n) -le 1024 ]; then + echo "Insufficient open file descriptor limit: $(ulimit -n)" + exit 1 + fi fi exec 0/dev/null`" ]; then - curl "$cabundleurl" > "$cabundlefile" || exit + if [ -x "$(which curl 2>/dev/null)" ]; then + curl "$cabundleurl" >"$cabundlefile" || exit else wget "$cabundleurl" -O "$cabundlefile" || exit fi @@ -531,39 +525,39 @@ else echo "Starting meta server $metahost:$metasrvport" ( - trap '' EXIT + trap '' EXIT - mkdir -p "$metasrvdir" - cd "$metasrvdir" || exit + mkdir -p "$metasrvdir" + cd "$metasrvdir" || exit - kill_all_proc "$metasrvdir" - mkdir -p kfscp || exit - mkdir -p kfslog || exit + kill_all_proc "$metasrvdir" + mkdir -p kfscp || exit + mkdir -p kfslog || exit - metaserverbin="`basename "$metabin"`" - rm -f "$metaserverbin" - cp "$metabin" . || exit + metaserverbin="$(basename "$metabin")" + rm -f "$metaserverbin" + cp "$metabin" . || exit - qfsfsckbin="`basename "$fsckbin"`" - rm -f "$qfsfsckbin" - cp "$fsckbin" . || exit + qfsfsckbin="$(basename "$fsckbin")" + rm -f "$qfsfsckbin" + cp "$fsckbin" . || exit - qfsadminbin="`basename "$adminbin"`" - rm -f "$qfsadminbin" - cp "$adminbin" . || exit + qfsadminbin="$(basename "$adminbin")" + rm -f "$qfsadminbin" + cp "$adminbin" . || exit - if [ -d "$webui" ]; then - wdir=`basename "$webui"` - rm -rf "$wdir" - cp -a "$webui" . 
|| exit - fi - metabinmd5=`openssl md5 < /dev/null 2>/dev/null | awk '{print $2}'` - mymd5=`openssl md5 < ./"$metaserverbin" 2>/dev/null | awk '{print $2}'` - metabinmd5="$metabinmd5 $mymd5" - mymd5=`echo test | openssl md5 2>/dev/null | awk '{print $2}'` - metabinmd5="$metabinmd5 $mymd5" + if [ -d "$webui" ]; then + wdir=$(basename "$webui") + rm -rf "$wdir" + cp -a "$webui" . || exit + fi + metabinmd5=$(openssl md5 /dev/null | awk '{print $2}') + mymd5=$(openssl md5 <./"$metaserverbin" 2>/dev/null | awk '{print $2}') + metabinmd5="$metabinmd5 $mymd5" + mymd5=$(echo test | openssl md5 2>/dev/null | awk '{print $2}') + metabinmd5="$metabinmd5 $mymd5" - cat > "$metasrvprop" << EOF + cat >"$metasrvprop" <> "$metasrvprop" << EOF + if [ x"$auth" = x'yes' ]; then + cat >>"$metasrvprop" <> "$metasrvprop" << EOF + if [ $chunkdirerrsimall -gt 0 ]; then + cat >>"$metasrvprop" <> "$metasrvprop.tmp" << EOF + if [ -f "kfscp/latest" ]; then + true + else + cp "$metasrvprop" "$metasrvprop.tmp" + cat >>"$metasrvprop.tmp" < "${metasrvout}" 2>&1; then - rm "$metasrvprop.tmp" - else - status=$? - cat "${metasrvout}" - exit $status + if ./"$metaserverbin" -c "$metasrvprop.tmp" >"${metasrvout}" 2>&1; then + rm "$metasrvprop.tmp" + else + status=$? + cat "${metasrvout}" + exit $status + fi fi - fi - if [ $vrcount -gt 2 ]; then - echo "Setting up VR with $vrcount nodes" - filesystemid=`awk ' + if [ $vrcount -gt 2 ]; then + echo "Setting up VR with $vrcount nodes" + filesystemid=$(awk ' BEGIN{FS="/";} { if ($1 == "filesysteminfo") { @@ -704,18 +698,18 @@ EOF exit; } } - ' kfscp/latest` - if [ x"$filesystemid" = x ]; then - echo "Failed to determine files system id in kfscp/latest" - exit 1 - fi - cat >> "$metasrvprop" << EOF + ' kfscp/latest) + if [ x"$filesystemid" = x ]; then + echo "Failed to determine files system id in kfscp/latest" + exit 1 + fi + cat >>"$metasrvprop" <> "$metasrvprop" << EOF + if [ x"$auth" = x'yes' ]; then + cat >>"$metasrvprop" <> "$vrdir/$metasrvprop" << EOF + i=$vrcount + myhostname='' # `hostname` + while [ $i -gt 0 ]; do + i=$(expr $i - 1) + if [ $i -gt 0 ]; then + vrdir="vr$i" + mkdir -p "$vrdir/kfscp" "$vrdir/kfslog" || exit + cp "$metasrvprop" "$vrdir/" || exit + else + vrdir='.' + fi + port=$(expr $metavrport + $i) + cport=$(expr $metasrvport + $i) + csport=$(expr $metasrvchunkport + $i) + cat >>"$vrdir/$metasrvprop" <> "$vrdir/$metasrvprop" << EOF + if [ x = x"$myhostname" ]; then + cat >>"$vrdir/$metasrvprop" <> "$vrdir/$metasrvprop" << EOF + else + cat >>"$vrdir/$metasrvprop" < "${metasrvout}" 2>&1 & - mpid=$! - echo $mpid > "$metasrvpid" - kill -0 $mpid - ) || exit - i=`expr $i + 1` - vrdir="vr$i" - cd "$mbdir" || exit - done - adminclientprop=qfsadmin.prp - cat > "$adminclientprop" << EOF + i=0 + vrdir='.' + mbdir="$(pwd)" + while [ $i -lt $vrcount ]; do + cd "$vrdir" || exit + if [ $i -gt 0 ]; then + pre_run_cleanup "$prevlogsdir" || exit + fi + rm -rf "$metasrvlog" + ( + trap '' HUP EXIT + run_with_valgrind \ + "$mbdir/$metaserverbin" "$metasrvprop" "$metasrvlog" \ + >"${metasrvout}" 2>&1 & + mpid=$! 
+ echo $mpid >"$metasrvpid" + kill -0 $mpid + ) || exit + i=$(expr $i + 1) + vrdir="vr$i" + cd "$mbdir" || exit + done + adminclientprop=qfsadmin.prp + cat >"$adminclientprop" <> "$adminclientprop" << EOF + if [ x"$auth" = x'yes' ]; then + cat >>"$adminclientprop" <> "$vrdir/$metasrvprop" << EOF + vr_reconfiguration || + exit + echo "VR status:" + ./"$qfsadminbin" \ + -f "$adminclientprop" \ + -s "$metahost" \ + -p "$metasrvport" \ + vr_get_status || + exit + fi + # Enable error simulation after VR is configured, as otherwise + # configuration might fail or take long time. + if [ x"$errsim" = x'yes' ]; then + i=0 + vrdir='.' + cat >>"$vrdir/$metasrvprop" <> "$vrdir/$metasrvprop" << EOF + while [ $i -lt $vrcount ]; do + cat >>"$vrdir/$metasrvprop" <> "$metasrvprop" << EOF + kill -HUP $(cat "$vrdir/$metasrvpid") || exit 1 + i=$(expr $i + 1) + vrdir="vr$i" + done + fi + else + if [ x"$errsim" = x'yes' ]; then + cat >>"$metasrvprop" <> "$metasrvprop" << EOF + fi + cat >>"$metasrvprop" < "${metasrvout}" 2>&1 & - echo $! > "$metasrvpid" - ) || exit - fi - - fsckfailures=0 - fsckruns=0 - while true; do - sleep `awk 'BEGIN{printf("%.0f\n", rand() * 100); exit;}'` - if ./"$qfsfsckbin" -A 1 -c kfscp; then - fsckstatus="OK" - else - fsckstatus="FAILED" - fsckfailures=`expr $fsckfailures + 1` + ( + trap '' HUP EXIT + run_with_valgrind \ + "$(pwd)/$metaserverbin" \ + "$metasrvprop" "$metasrvlog" >"${metasrvout}" 2>&1 & + echo $! >"$metasrvpid" + ) || exit fi - fsckruns=`expr $fsckruns + 1` - echo "==== RUN: $fsckruns FAILURES: $fsckfailures STATUS: $fsckstatus ===" - done > "$fscklog" 2>&1 & - echo $! > "$fsckpid" - - if [ -d "$wdir" ]; then - cd "$wdir" || exit - unset headerscs - for h in \ + + fsckfailures=0 + fsckruns=0 + while true; do + sleep $(awk 'BEGIN{printf("%.0f\n", rand() * 100); exit;}') + if ./"$qfsfsckbin" -A 1 -c kfscp; then + fsckstatus="OK" + else + fsckstatus="FAILED" + fsckfailures=$(expr $fsckfailures + 1) + fi + fsckruns=$(expr $fsckruns + 1) + echo "==== RUN: $fsckruns FAILURES: $fsckfailures STATUS: $fsckstatus ===" + done >"$fscklog" 2>&1 & + echo $! >"$fsckpid" + + if [ -d "$wdir" ]; then + cd "$wdir" || exit + unset headerscs + for h in \ D-Timer-overrun-count \ D-Timer-overrun-sec \ XMeta-location \ @@ -921,12 +915,11 @@ EOF D-Disk-read-errors \ D-Disk-write-errors \ Num-wr-drives \ - Num-writable-chunks \ - ; do - headerscs="${headerscs-}${headerscs+&}${h}" - done - unset headerscsdirs - for h in \ + Num-writable-chunks; do + headerscs="${headerscs-}${headerscs+&}${h}" + done + unset headerscsdirs + for h in \ Chunks \ Dev-id \ Read-bytes \ @@ -950,11 +943,10 @@ EOF D-Write-time-microsec \ Write-timeout \ Chunk-server \ - Chunk-dir \ - ; do - headerscsdirs="${headerscsdirs-}${headerscsdirs+&}${h}" - done - cat > "$wuiconf" << EOF + Chunk-dir; do + headerscsdirs="${headerscsdirs-}${headerscsdirs+&}${h}" + done + cat >"$wuiconf" < "$wuilog" 2>&1 & - echo $! > "$wuipid" - kill -0 `cat "$wuipid"` || { - echo "Failed to start meta server web UI" - exit 1 - } - fi - exit 0 + rm -f *.log* + trap '' HUP INT + ./qfsstatus.py "$wuiconf" >"$wuilog" 2>&1 & + echo $! 
>"$wuipid" + kill -0 $(cat "$wuipid") || { + echo "Failed to start meta server web UI" + exit 1 + } + fi + exit 0 ) || { kill_all_proc "$metasrvdir" echo "Failed to start meta server" @@ -1000,9 +992,9 @@ EOF chunksrvdir="$n/test/chunk" mkdir -p "$chunksrvdir" kill_all_proc "$chunksrvdir" - rm -f "$chunksrvdir/`basename "$chunkbin"`" + rm -f "$chunksrvdir/$(basename "$chunkbin")" cp "$chunkbin" "$chunksrvdir" || exit - e=`expr $i + $numchunksrv` + e=$(expr $i + $numchunksrv) while [ $i -lt $e ]; do dir="$chunksrvdir/$i" mkdir -p "$dir" || exit @@ -1011,7 +1003,7 @@ EOF mkdir -p "kfschunk" || exit pre_run_cleanup "$prevlogsdir" || exit ) || exit - cat > "$dir/$chunksrvprop" << EOF + cat >"$dir/$chunksrvprop" <> "$dir/$chunksrvprop" << EOF + if [ $(expr $i - $chunksrvport) -lt $chunkdirerrsim ]; then + cat >>"$dir/$chunksrvprop" <> "$dir/$chunksrvprop" << EOF + cat >>"$dir/$chunksrvprop" <> "$dir/$chunksrvprop" << EOF + cat >>"$dir/$chunksrvprop" <> "$dir/$chunksrvprop" << EOF + cat >>"$dir/$chunksrvprop" <> "$dir/$chunksrvprop" << EOF + cat >>"$dir/$chunksrvprop" <> "$dir/$chunksrvprop" << EOF + if [ x"$s3test" = x'yes' ]; then + cat >>"$dir/$chunksrvprop" <> "$dir/$chunksrvprop" << EOF + else + cat >>"$dir/$chunksrvprop" <> "$dir/$chunksrvprop" << EOF + fi + if [ $vrcount -gt 2 ]; then + cat >>"$dir/$chunksrvprop" < "${chunksrvout}" 2>&1 & - echo $! > "$chunksrvpid" + trap '' EXIT + cd "$dir" || exit + echo "Starting chunk server $i" + trap '' HUP INT + run_with_valgrind \ + ../chunkserver "$chunksrvprop" "$chunksrvlog" \ + >"${chunksrvout}" 2>&1 & + echo $! >"$chunksrvpid" ) || exit - i=`expr $i + 1` + i=$(expr $i + 1) done - rack=`expr $rack + 1` + rack=$(expr $rack + 1) done fi @@ -1133,13 +1125,13 @@ trap 'kill_all_proc "$metasrvdir" $chunkrundirs' EXIT rm -rf 'devtools' cp -a "$bdir/src/cc/devtools" . || exit cp "$srcdir/src/test-scripts/cptest.sh" . || exit - cdirp=`pwd` + cdirp=$(pwd) PATH="${cdirp}/devtools:${PATH}" export PATH meta="-s $metahost -p $metasrvport" export meta - kfstools="`pwd`/tools" + kfstools="$(pwd)/tools" export kfstools cpfromkfsopts="-T $cstimeout -R $csretry" export cpfromkfsopts @@ -1161,12 +1153,12 @@ trap 'kill_all_proc "$metasrvdir" $chunkrundirs' EXIT QFS_CLIENT_CONFIG="FILE:${clientproprs}" export QFS_CLIENT_CONFIG fi - start=`now_seconds` + start=$(now_seconds) while ./cptest.sh "$suf"; do - echo "$suf test passed. $((`now_seconds` - $start)) sec, `date`" - start=`now_seconds` - done > "cptest-$suf.log" 2>&1 & - echo $! > "cptest-$suf.pid" + echo "$suf test passed. $(($(now_seconds) - $start)) sec, $(date)" + start=$(now_seconds) + done >"cptest-$suf.log" 2>&1 & + echo $! >"cptest-$suf.pid" done exit 0 ) || exit 1 @@ -1197,13 +1189,13 @@ fi cp -a "$bdir/src/cc/fanout" . || exit rm -rf 'devtools' cp -a "$bdir/src/cc/devtools" . || exit - cp "$ssrcdir/src/cc/fanout/kfanout_test.sh" . || exit + cp "$ssrcdir/src/cc/fanout/kfanout_test.sh" . || exit if [ x"$csretry" != x -a $csretry -gt 0 ]; then foretry="-y $csretry" else foretry='' fi - cdirp=`pwd` + cdirp=$(pwd) PATH="${cdirp}/fanout:${cdirp}/tools:${cdirp}/devtools:${PATH}" export PATH @@ -1219,8 +1211,8 @@ fi -test-runs 100000 \ -kfanout-extra-opts "-U 1 -c $cstimeout -q 5 $foretry -P 3" \ -cpfromkfs-extra-opts "-R $csretry" \ - > kfanout_test.log 2>&1 & - echo $! > kfanout_test.pid + >kfanout_test.log 2>&1 & + echo $! 
>kfanout_test.pid exit 0 ) || exit @@ -1237,9 +1229,9 @@ else fi if [ x"$auth" = x'yes' ]; then - smauthconf="$clitestdir/sortmasterauth.prp" - export smauthconf - cat > "$smauthconf" << EOF + smauthconf="$clitestdir/sortmasterauth.prp" + export smauthconf + cat >"$smauthconf" < sortmaster_endurance_test.log 2>&1 & - echo $! > endurance_test.pid || exit + ./endurance_test.sh >sortmaster_endurance_test.log 2>&1 & + echo $! >endurance_test.pid || exit exit 0 ) || exit 1 diff --git a/travis/script.sh b/travis/script.sh index 8d5278d16..f2c1128fc 100755 --- a/travis/script.sh +++ b/travis/script.sh @@ -48,7 +48,7 @@ done DEPS_CENTOS=$DEPS_CENTOS$DEPS_CENTOS_PRIOR_TO_9 DEPS_CENTOS8=$DEPS_CENTOS8$DEPS_CENTOS_PRIOR_TO_9 -MYMVN_URL='https://dlcdn.apache.org/maven/maven-3/3.9.5/binaries/apache-maven-3.9.5-bin.tar.gz' +MYMVN_URL='https://dlcdn.apache.org/maven/maven-3/3.9.11/binaries/apache-maven-3.9.11-bin.tar.gz' MYTMPDIR='.tmp' MYCODECOV="$MYTMPDIR/codecov.sh" @@ -208,13 +208,14 @@ build_ubuntu() { else MYDEPS=$DEPS_UBUNTU fi + APT_GET_CMD="apt-get${UBUNTU_APT_OPTIONS:+ ${UBUNTU_APT_OPTIONS}}" $MYSUDO apt-get update $MYSUDO /bin/bash -c \ - "DEBIAN_FRONTEND='noninteractive' apt-get install -y gnupg" + "DEBIAN_FRONTEND='noninteractive' $APT_GET_CMD install -y gnupg" $MYSUDO apt-key update - $MYSUDO apt-get update + $MYSUDO $APT_GET_CMD update $MYSUDO /bin/bash -c \ - "DEBIAN_FRONTEND='noninteractive' apt-get install -y $MYDEPS" + "DEBIAN_FRONTEND='noninteractive' $APT_GET_CMD install -y $MYDEPS" if [ x"$1" = x'18.04' -o x"$1" = x'20.04' -o x"$1" = x'22.04' \ -o x"$1" = x'24.04' \ -o x"$1" = x'd10' -o x"$1" = x'd11' -o x"$1" = x'd12' ]; then @@ -238,10 +239,13 @@ build_ubuntu32() { } build_debian() { + unset UBUNTU_APT_OPTIONS build_ubuntu "d$1" } build_centos() { + YUM_OPTS= + YUM_UPDATE_FLAG=0 if [ x"$1" = x'5' ]; then # Centos 5 EOL, use vault for now. $MYSUDO sed -i 's/enabled=1/enabled=0/' \ @@ -266,25 +270,29 @@ build_centos() { /etc/yum.repos.d/*.repo $MYSUDO sed -i 's/#\(baseurl.*\)mirror.centos.org\/centos\/\$releasever\//\1vault.centos.org\/7.9.2009\//' \ /etc/yum.repos.d/*.repo - $MYSUDO yum update -y + YUM_UPDATE_FLAG=1 elif [ x"$1" = x'8' ]; then # Centos 8 EOL, use vault for now. 
$MYSUDO sed -i 's/mirrorlist/#mirrorlist/' \
    /etc/yum.repos.d/*.repo
$MYSUDO sed -i 's/#\(baseurl.*\)mirror.centos.org\/\$contentdir\//\1vault.centos.org\//' \
    /etc/yum.repos.d/*.repo
-        $MYSUDO yum update -y
+        YUM_UPDATE_FLAG=1
    else
-        $MYSUDO yum update -y
+        if [ x"$1" = x'9' -o x"$1" = x'2023' ]; then
+            YUM_OPTS=--nobest
+        fi
+        YUM_UPDATE_FLAG=1
+    fi
+    if [ $YUM_UPDATE_FLAG -eq 1 ]; then
+        $MYSUDO yum clean all
+        $MYSUDO yum makecache
+        $MYSUDO yum update -y $YUM_OPTS
    fi
    if [ -f "$MYCENTOSEPEL_RPM" ]; then
        $MYSUDO rpm -Uvh "$MYCENTOSEPEL_RPM"
    fi
-    if [ x"$1" = x'9' -o x"$1" = x'2023' ]; then
-        YUM_OPTS=--nobest
-    else
-        YUM_OPTS=
-    fi
+    $MYSUDO yum makecache
    eval MYDEPS='${DEPS_CENTOS'"$1"'-$DEPS_CENTOS}'
    $MYSUDO yum install -y $YUM_OPTS $MYDEPS
    MYPATH=$PATH
@@ -402,8 +410,10 @@ if [ x"$BUILD_OS_NAME" = x'linux' ]; then
            fi
        fi
        docker run --rm --dns=8.8.8.8 -t -v "$MYSRCD:$MYSRCD" -w "$MYSRCD" \
+            ${UBUNTU_APT_OPTIONS:+-e UBUNTU_APT_OPTIONS="$UBUNTU_APT_OPTIONS"} \
            "$DOCKER_IMAGE_PREFIX$DISTRO:$VER" \
-            /bin/bash ./travis/script.sh build "$DISTRO" "$VER" "$BTYPE" "$BUSER"
+            /bin/bash ./travis/script.sh \
+                build "$DISTRO" "$VER" "$BTYPE" "$BUSER"
    fi
elif [ x"$BUILD_OS_NAME" = x'osx' ]; then
    set_build_type "$BTYPE"
diff --git a/wiki/Administrator's-Guide.md b/wiki/Administrator's-Guide.md
index 97f472dd0..09d22e867 100644
--- a/wiki/Administrator's-Guide.md
+++ b/wiki/Administrator's-Guide.md
@@ -1,5 +1,7 @@
-Metaserver
-==========
+# QFS Administrator's Guide
+
+## Metaserver
+
The metaserver is responsible for storing all global QFS file system
information, tracking chunk locations, and coordinating chunk
replication/recovery. This section will discuss basic metaserver
@@ -10,29 +12,33 @@ The metaserver configuration is normally stored in a file called
configurations. For the complete set of configuration parameters see the
[[Configuration Reference]].

-Running
--------
-`metaserver /path/to/MetaServer.prp`
+### Running Meta Server
+
+```sh
+metaserver /path/to/MetaServer.prp
+```

To initialize the file system by creating an initial, empty checkpoint and
log segment, the -c command line option can be used:

-`metaserver -c /path/to/MetaServer.prp`
+```sh
+metaserver -c /path/to/MetaServer.prp
+```

The -c option should **not** be used when running the meta server with an
existing QFS file system, in order to prevent data loss in case the meta
server starts with no "latest" checkpoint file.

-Checkpoint and Transaction Log Pruning
---------------------------------------
+### Checkpoint and Transaction Log Pruning
+
The directories that store metaserver checkpoints (*metaServer.cpDir*) and
transaction logs (*metaServer.logDir*) are pruned periodically by the meta
server; otherwise they will fill up and run out of space. Pruning parameters
are described in meta server [annotated configuration
file](https://github.com/quantcast/qfs/blob/master/conf/MetaServer.prp)
section "Meta data (checkpoint and trasaction log) store."

Checkpoint and log pruning scripts required by prior versions are now
obsolete and have been removed.

-Creating Backups
-----------------
+### Creating Backups
+
From time to time the *metaServer.cpDir* and *metaServer.logDir* should be
backed up, as they can be used to restore a file system which has had a
catastrophic failure. A backup consists of:
@@ -45,27 +51,35 @@ files.
Given the following configuration:

-    metaServer.cpDir = /home/qfs0/state/checkpoint
-    metaServer.logDir = /home/qfs0/state/transactions
+```properties
+metaServer.cpDir = /home/qfs0/state/checkpoint
+metaServer.logDir = /home/qfs0/state/transactions
+```

A possible solution would be to periodically do the following:

-    tar --exclude '*.tmp.??????' -czf /foo/bar/qfs0-backup-`date +%d-%H.tar.gz` checkpoint transactions -C /home/qfs0/state
+```sh
+tar --exclude '*.tmp.??????' \
+    -czf /foo/bar/qfs0-backup-"$(date +%d-%H.tar.gz)" \
+    -C /home/qfs0/state checkpoint transactions
+```

**Note**: this simple script includes all checkpoint files, which is
inefficient; only the latest checkpoint file is required for the backup.

-QFS meta data backup script is available [here](https://github.com/quantcast/qfs/blob/master/scripts/qfs_backup).
+[QFS meta data backup script is available here](https://github.com/quantcast/qfs/blob/master/scripts/qfs_backup).
+
+### Restoring from Backups

-Restoring Backups
------------------
To restore a backup, it need only be extracted to the appropriate
*metaServer.cpDir* and *metaServer.logDir* directories of a fresh metaserver
head node. Using the configuration from the previous example:

-    cd /home/qfs0/state && tar -xzf /foo/bar/qfs0-backup-31-23.tar.gz
+```sh
+cd /home/qfs0/state && tar -xzf /foo/bar/qfs0-backup-31-23.tar.gz
+```

Once the metaserver is started, it will read the latest checkpoint into memory
and replay any transaction logs. Files that were allocated since the backup
@@ -79,8 +93,8 @@ modifications since the backup will be lost.
**Note:** The location of the *metaServer.cpDir* and *metaServer.logDir*
should not change.

-Meta Server Replication (VR)
-----------------------------
+### Meta Server Replication (VR)
+
Meta server replication provides fault tolerance at the meta server level.
The file system can be configured with multiple meta server nodes, in order
to solve the single point of failure problem.
@@ -95,51 +109,44 @@ requests. The problem description can be found in the VR paper.

Configuring a replicated meta server group / cluster consists of the
following steps.

-0. Decide on the number of meta server nodes N. The minimum N is 3. The
-maximum number of tolerated nodes failures is N - (N/2 + 1). QFS clients and
-chunk servers automatically re-connect to newly elected primary node in the
-case when prior primary node becomes unavailable due to node or network
-connectivity failure(s).
-
-1. Assign node IDs. The node ID must be non negative 63 bit integer. Initial
-set of nodes must have node with ID 0. The node with lowest ID is elected as a
-primaryf. The remaining active nodes are assigned backup status. Node's
-"primary order" (32 bit signed integer) can be used to change primary
-election. The node with the smallest primary order becomes primary. Node's
-primary order takes precedence over node ID. The node ID breaks tie in case when
-primary orders are equal. In initial configuration all nodes primary order
-must be set to 0. Node primary order is VR configuration parameter, it can be
-changed with qfsadmin vr_reconfiguration command.
-
-**Note that the node ID must be unique, and should never be re-used. Non unique
-IDs withing the same meta server group / file system can result file system
-loss.**
-
-2. Configure meta data fetch / sync, and log receiver listeners. The relevant
-parameters are described in "Meta meta data initial fetch / synchronization"
-section of the meta server annotated configuration file.
-
-3. Copy or create new file system checkpoint and transaction log segments
-on/to the node with ID 0, and ensure that other nodes have empty checkpoint
-and transaction log directories. Specify file system ID in the meta server
-configuration file with metaServer.metaDataSync.fileSystemId parameter.
-File system ID can be obtained from the beginning of the checkpoint
-file: the first number on the line with the "filesysteminfo/fsid/" prefix.
-
-4. Create DNS record with the list of meta server nodes IP addresses, or add
-meta server nodes IP addresses to the client configuration file by using
-client.metaServerNodes parameter.
-Chunk servers, similarly to QFS client, need to be configured with the meta
-server DNS name, and / or list of meta server nodes network locations.
-Please see chunkServer.meta.nodes parameter description in chunk server
-configuration file.
-
-5. Start meta server on all meta server nodes. The nodes with non 0 IDs should
-fetch checkpoint and transaction log from node with ID 0.
-
-6. Use qfsadmin vr_reconfiguration command to configure replication. VR
-configuration stored in the checkpoint and transaction log, and replicated
-onto all meta server nodes.
+1. Decide on the number of meta server nodes N. The minimum N is 3. The
+   maximum number of tolerated node failures is N - (N/2 + 1). QFS clients
+   and chunk servers automatically re-connect to the newly elected primary
+   node when the prior primary node becomes unavailable due to node or
+   network connectivity failure(s).
+2. Assign node IDs. The node ID must be a non-negative 63-bit integer. The
+   initial set of nodes must include a node with ID 0. The node with the
+   lowest ID is elected as the primary. The remaining active nodes are
+   assigned backup status. A node's "primary order" (a 32-bit signed integer)
+   can be used to change the primary election. The node with the smallest
+   primary order becomes the primary. A node's primary order takes precedence
+   over its node ID. The node ID breaks the tie when primary orders are
+   equal. In the initial configuration, the primary order of all nodes must
+   be set to 0. The node primary order is a VR configuration parameter; it
+   can be changed with the qfsadmin vr_reconfiguration command.
+   **Note that the node ID must be unique, and should never be re-used.
+   Non-unique IDs within the same meta server group / file system can result
+   in file system loss.**
+3. Configure meta data fetch / sync, and log receiver listeners. The relevant
+   parameters are described in the "Meta meta data initial fetch /
+   synchronization" section of the meta server annotated configuration file.
+4. Copy or create new file system checkpoint and transaction log segments
+   on/to the node with ID 0, and ensure that other nodes have empty
+   checkpoint and transaction log directories. Specify the file system ID in
+   the meta server configuration file with the
+   metaServer.metaDataSync.fileSystemId parameter. The file system ID can be
+   obtained from the beginning of the checkpoint file: the first number on
+   the line with the "filesysteminfo/fsid/" prefix.
+5. Create a DNS record with the list of meta server node IP addresses, or add
+   the meta server node IP addresses to the client configuration file using
+   the client.metaServerNodes parameter.
+   Chunk servers, like the QFS client, need to be configured with the meta
+   server DNS name and / or the list of meta server node network locations.
+   Please see the chunkServer.meta.nodes parameter description in the chunk
+   server configuration file.
+6. Start the meta server on all meta server nodes. The nodes with non-zero
+   IDs should fetch the checkpoint and transaction log from the node with
+   ID 0.
+7. Use the `qfsadmin vr_reconfiguration` command to configure replication.
+   The VR configuration is stored in the checkpoint and transaction log, and
+   replicated onto all meta server nodes.

The first step is to add all nodes with their respective transaction log
listener network addresses [locations]; the second is to activate the nodes. Meta
@@ -167,50 +174,58 @@ locations (IP address and port).

Please note that the DNS name that lists all meta server nodes is to be used
with the qfsadmin commands in the example below.

-For example:
-------------
+### Configuring VR example
+
Add node 0 to VR configuration:

-    qfsadmin -f qfsadmin.cfg \
-        -s <meta server host> \
-        -p <meta server port> \
-        -F op-type=add-node \
-        -F arg-count=1 \
-        -F node-id=0 \
-        -F args='node0-ip-address node0-log-listener-port-number' \
-        vr_reconfiguration
+```sh
+qfsadmin -f qfsadmin.cfg \
+    -s meta-server-host \
+    -p meta-server-port \
+    -F op-type=add-node \
+    -F arg-count=1 \
+    -F node-id=0 \
+    -F args='node0-ip-address node0-log-listener-port-number' \
+    vr_reconfiguration
+```

Add node 1 to VR configuration:

-    qfsadmin -f qfsadmin.cfg \
-        -s <meta server host> \
-        -p <meta server port> \
-        -F op-type=add-node \
-        -F arg-count=1 \
-        -F node-id=1 \
-        -F args='node1-ip-address node1-log-listener-port-number' \
-        vr_reconfiguration
+```sh
+qfsadmin -f qfsadmin.cfg \
+    -s meta-server-host \
+    -p meta-server-port \
+    -F op-type=add-node \
+    -F arg-count=1 \
+    -F node-id=1 \
+    -F args='node1-ip-address node1-log-listener-port-number' \
+    vr_reconfiguration
+```

Add node 2 to VR configuration:

-    qfsadmin -f qfsadmin.cfg \
-        -s <meta server host> \
-        -p <meta server port> \
-        -F op-type=add-node \
-        -F arg-count=1 \
-        -F node-id=2 \
-        -F args='node1-ip-address node2-log-listener-port-number' \
-        vr_reconfiguration
+```sh
+qfsadmin -f qfsadmin.cfg \
+    -s meta-server-host \
+    -p meta-server-port \
+    -F op-type=add-node \
+    -F arg-count=1 \
+    -F node-id=2 \
+    -F args='node2-ip-address node2-log-listener-port-number' \
+    vr_reconfiguration
+```

Activate nodes:

-    qfsadmin -f qfsadmin.cfg \
-        -s <meta server host> \
-        -p <meta server port> \
-        -F op-type=activate-nodes \
-        -F arg-count=3 \
-        -F args='0 1 2' \
-        vr_reconfiguration
+```sh
+qfsadmin -f qfsadmin.cfg \
+    -s meta-server-host \
+    -p meta-server-port \
+    -F op-type=activate-nodes \
+    -F arg-count=3 \
+    -F args='0 1 2' \
+    vr_reconfiguration
+```

Once VR is configured, changing the VR configuration does not require file
system downtime. New meta server nodes can be added to the configuration,
@@ -227,98 +242,102 @@ With default parameters system switch over time (new primary election, and
chunk servers and clients connecting to the new primary) should be around
10 seconds.

-VR Status
----------
+### VR Status
+
`qfsadmin vr_get_status` can be used to query the VR status of the file
system, or the status of a specific meta server node (with the -n parameter).

For example:

-    qfsadmin -s sfsb0.sea1.qc -p 30000 vr_get_status
+```sh
+qfsadmin -s meta-server-host -p meta-server-port vr_get_status
+```

The output will look like the following. The node state will be primary (the
status might be backup when querying a specific node), if everything is OK.
- vr.nodeId: 0 - vr.status: 0 - vr.active: 1 - vr.state: primary - vr.primaryId: 0 - vr.epoch: 5 - vr.view: 71 - vr.log: 5 71 120444929 - vr.commit: 5 71 120444929 - vr.lastViewEnd: 5 69 1258 - vr.quorum: 2 - vr.ignoreInvalidVrState: 0 - vr.fileSystemId: 160517748448112759 - vr.clusterKey: mrs-kfs-sort-b - vr.metaMd5: b6f2a25365e37c75b7fdf12914d2392d - vr.viewChangeReason: restart, node: 2 - vr.viewChangeStartTime: 1491775660 - vr.currentTime: 1491877576 - - logTransmitter.channel.0.location: 10.6.46.38 30200 - logTransmitter.channel.0.id: 2 - logTransmitter.channel.0.receivedId: 2 - logTransmitter.channel.0.primaryId: 0 - logTransmitter.channel.0.active: 1 - logTransmitter.channel.0.ack: 5 71 120444929 - logTransmitter.channel.0.sent: 5 71 120444929 - logTransmitter.channel.1.location: 10.6.34.1 30200 - logTransmitter.channel.1.id: 1 - logTransmitter.channel.1.receivedId: 1 - logTransmitter.channel.1.primaryId: 0 - logTransmitter.channel.1.active: 1 - logTransmitter.channel.1.ack: 5 71 120444929 - logTransmitter.channel.1.sent: 5 71 120444929 - logTransmitter.channel.2.location: 10.6.47.1 30200 - logTransmitter.channel.2.id: 0 - logTransmitter.channel.2.receivedId: 0 - logTransmitter.channel.2.primaryId: 0 - logTransmitter.channel.2.active: 1 - logTransmitter.channel.2.ack: 5 71 120444929 - logTransmitter.channel.2.sent: 5 71 120444929 - - logTransmitter.activeUpNodesCount: 3 - logTransmitter.activeUpChannelsCount: 3 - - configuration.primaryTimeout: 4 - configuration.backupTimeout: 8 - configuration.changeViewMaxLogDistance: 65536 - configuration.maxListenersPerNode: 16 - configuration.node.0.id: 0 - configuration.node.0.flags: 2 - configuration.node.0.active: 1 - configuration.node.0.primaryOrder: 0 - configuration.node.0.listener: 10.6.47.1 30200 - configuration.node.1.id: 1 - configuration.node.1.flags: 2 - configuration.node.1.active: 1 - configuration.node.1.primaryOrder: 0 - configuration.node.1.listener: 10.6.34.1 30200 - configuration.node.2.id: 2 - configuration.node.2.flags: 2 - configuration.node.2.active: 1 - configuration.node.2.primaryOrder: 0 - configuration.node.2.listener: 10.6.46.38 30200 +```console +vr.nodeId: 0 +vr.status: 0 +vr.active: 1 +vr.state: primary +vr.primaryId: 0 +vr.epoch: 5 +vr.view: 71 +vr.log: 5 71 120444929 +vr.commit: 5 71 120444929 +vr.lastViewEnd: 5 69 1258 +vr.quorum: 2 +vr.ignoreInvalidVrState: 0 +vr.fileSystemId: 160517748448112759 +vr.clusterKey: mrs-kfs-sort-b +vr.metaMd5: b6f2a25365e37c75b7fdf12914d2392d +vr.viewChangeReason: restart, node: 2 +vr.viewChangeStartTime: 1491775660 +vr.currentTime: 1491877576 + +logTransmitter.channel.0.location: 10.6.46.38 30200 +logTransmitter.channel.0.id: 2 +logTransmitter.channel.0.receivedId: 2 +logTransmitter.channel.0.primaryId: 0 +logTransmitter.channel.0.active: 1 +logTransmitter.channel.0.ack: 5 71 120444929 +logTransmitter.channel.0.sent: 5 71 120444929 +logTransmitter.channel.1.location: 10.6.34.1 30200 +logTransmitter.channel.1.id: 1 +logTransmitter.channel.1.receivedId: 1 +logTransmitter.channel.1.primaryId: 0 +logTransmitter.channel.1.active: 1 +logTransmitter.channel.1.ack: 5 71 120444929 +logTransmitter.channel.1.sent: 5 71 120444929 +logTransmitter.channel.2.location: 10.6.47.1 30200 +logTransmitter.channel.2.id: 0 +logTransmitter.channel.2.receivedId: 0 +logTransmitter.channel.2.primaryId: 0 +logTransmitter.channel.2.active: 1 +logTransmitter.channel.2.ack: 5 71 120444929 +logTransmitter.channel.2.sent: 5 71 120444929 + +logTransmitter.activeUpNodesCount: 3 
+logTransmitter.activeUpChannelsCount: 3 + +configuration.primaryTimeout: 4 +configuration.backupTimeout: 8 +configuration.changeViewMaxLogDistance: 65536 +configuration.maxListenersPerNode: 16 +configuration.node.0.id: 0 +configuration.node.0.flags: 2 +configuration.node.0.active: 1 +configuration.node.0.primaryOrder: 0 +configuration.node.0.listener: 10.6.47.1 30200 +configuration.node.1.id: 1 +configuration.node.1.flags: 2 +configuration.node.1.active: 1 +configuration.node.1.primaryOrder: 0 +configuration.node.1.listener: 10.6.34.1 30200 +configuration.node.2.id: 2 +configuration.node.2.flags: 2 +configuration.node.2.active: 1 +configuration.node.2.primaryOrder: 0 +configuration.node.2.listener: 10.6.46.38 30200 +``` Meta server web UI can also be used to obtain VR status. -Removing VR configuration --------------------------- +### Removing VR configuration + VR (meta server replication) configuration stored in checkpoint and transaction logs. It is not possible to remove / reset VR configuration at run time, i.e. without downtime while servicing requests. The following two meta server command line options allow to clear VR configuration or inactivate all meta server nodes: + 1. -clear-vr-config -- append an entry to the end of the transaction log -to clear VR configuration, and exit + to clear VR configuration, and exit 2. -vr-inactivate-all-nodes -- append an entry to the end of the transaction log -to inactivate all VR nodes, and exit + to inactivate all VR nodes, and exit +### Recovering File System by Truncating Transaction Log -Recovering File System by Truncating Transaction Log ----------------------------------------------------- The [`qfsmetalogtruncate.sh`](https://github.com/quantcast/qfs/blob/master/scripts/qfsmetalogtruncate.sh) can be used to recover file system in case if the meta server state diverges between run time and replay due to a hardware malfunction (for example memory @@ -328,7 +347,9 @@ itself as fail stop ("panic") on all backup nodes. In such a case the backup nodes will exit, and the primary might still be up. The backup nodes would have error message in the trace log similar to the following: - 01-01-2022 01:23:45.678 ERROR - (Replay.cc:2298) error block seq: 11381:4203867266:c/3 1 8323a816/a5f956e40/5b15bf44/0/3 1 8323aa10/116/2c75/56b5fa23 +```console +01-01-2022 01:23:45.678 ERROR - (Replay.cc:2298) error block seq: 11381:4203867266:c/3 1 8323a816/a5f956e40/5b15bf44/0/3 1 8323aa10/116/2c75/56b5fa23 +``` The characters between `c/` and the next `/` symbol represent log commit sequence number in hex. In the above example the commit sequence number is @@ -340,10 +361,12 @@ sequence number shown in the error message. For example given the sequence number in the error message the above the following can be executed on each and every relevant meta server node: - qfsmetalogtruncate.sh \ - -l /home/qfs0/state/transactions \ - -c /home/qfs0/state/checkpoint \ - -s '3 1 8323a816' +```sh +qfsmetalogtruncate.sh + -l /home/qfs0/state/transactions + -c /home/qfs0/state/checkpoint + -s '3 1 8323a816' +``` By default the script creates a backup of both meta server transaction log and checkpoint directories. The backup is done by adding current unix time to the @@ -353,9 +376,8 @@ into the backup directories. Backup can be turned off by adding `-b` option. If the scrips runs as root, it will preserve original files and directories ownership (user and group). 
+### Hitless Meta Server Version Upgrade

-Hitless Meta Server Version Upgrade
------------------------------------
Unless specifically mentioned in release notes, QFS meta server versions are
backward compatible. Backward compatibility allows performing a "hitless" (no
downtime) upgrade of the meta nodes in VR configuration. However hitless
@@ -382,19 +404,18 @@ to minimize the time window where quorum of nodes is running new version and
the older version is running on the remaining nodes.

Examples:
+
1. Upgrade meta server in VR configuration of 3 nodes with ids 0, 1, 2, primary
-order is 0 for all nodes. Quorum is 3/2+1 = 2. Restart node with ID 2, wait
-until it is operational and joins VR. Repeat the prior step for node with ID 1.
-Upgrade and restart node 0.
+   order is 0 for all nodes. Quorum is 3/2+1 = 2. Restart node with ID 2, wait
+   until it is operational and joins VR. Repeat the prior step for node with ID 1.
+   Upgrade and restart node 0.
2. Upgrade meta server in VR configuration of 5 nodes with ids 0, 1, 2, 3, 4,
-primary order is 0 for all nodes. Quorum is 5/2+1 = 3. Restart node with ID 4,
-wait until it is operation and joins VR. Repeat the prior step for node with ID
-3, then for node with ID 2. Upgrade and restart nodes 1 and 0 simultaneously.
+   primary order is 0 for all nodes. Quorum is 5/2+1 = 3. Restart node with ID 4,
+   wait until it is operational and joins VR. Repeat the prior step for node with ID
+   3, then for node with ID 2. Upgrade and restart nodes 1 and 0 simultaneously.

+### File System Integrity (`qfsfsck`)
-
-File System Integrity (`qfsfsck`)
---------------------------------- 
The `qfsfsck` tool can be employed in three ways:

- Verify the integrity of a running file system by identifying lost files and/or
@@ -403,46 +424,50 @@ The `qfsfsck` tool can be employed in three ways:
- Check the integrity of a file system archive/backup (checkpoint plus a set of
  transaction logs).

-Running File System Integrity Verification
-------------------------------------------
+### Running File System Integrity Verification
+
In order to verify the integrity of a running file system by identifying lost
files or files with chunk placement problems, run:

-    qfsfsck -m metaServer.hostname -p metaServer.port
+```sh
+qfsfsck -m metaServer.hostname -p metaServer.port
+```

The output will look something like this if everything is okay:

-    Lost files total: 0
-    Directories: 280938
-    Directories reachable: 280938 100%
-    Directory reachable max depth: 14
-    Files: 1848149
-    Files reachable: 1848149 100%
-    Files reachable with recovery: 1811022 97.9911%
-    Files reachable striped: 34801 1.88302%
-    Files reachable sum of logical sizes: 37202811695550
-    1 Files reachable lost: 0 0%
-    2 Files reachable lost if server down: 0 0%
-    3 Files reachable lost if rack down: 0 0%
-    4 Files reachable abandoned: 0 0%
-    5 Files reachable ok: 1848149 100%
-    File reachable max size: 4011606632
-    File reachable max chunks: 128
-    File reachable max replication: 3
-    Chunks: 19497647
-    Chunks reachable: 19497647 100%
-    Chunks reachable lost: 0 0%
-    Chunks reachable no rack assigned: 0 0%
-    Chunks reachable over replicated: 0 0%
-    Chunks reachable under replicated: 0 0%
-    Chunks reachable replicas: 22715209 116.502%
-    Chunk reachable max replicas: 3
-    Recovery blocks reachable: 1858706
-    Recovery blocks reachable partial: 0 0%
-    Fsck run time: 6.45906 sec.
- Files: [fsck_state size replication type stripes recovery_stripes - stripe_size chunk_count mtime path] - Filesystem is HEALTHY +```console +Lost files total: 0 +Directories: 280938 +Directories reachable: 280938 100% +Directory reachable max depth: 14 +Files: 1848149 +Files reachable: 1848149 100% +Files reachable with recovery: 1811022 97.9911% +Files reachable striped: 34801 1.88302% +Files reachable sum of logical sizes: 37202811695550 +1 Files reachable lost: 0 0% +2 Files reachable lost if server down: 0 0% +3 Files reachable lost if rack down: 0 0% +4 Files reachable abandoned: 0 0% +5 Files reachable ok: 1848149 100% +File reachable max size: 4011606632 +File reachable max chunks: 128 +File reachable max replication: 3 +Chunks: 19497647 +Chunks reachable: 19497647 100% +Chunks reachable lost: 0 0% +Chunks reachable no rack assigned: 0 0% +Chunks reachable over replicated: 0 0% +Chunks reachable under replicated: 0 0% +Chunks reachable replicas: 22715209 116.502% +Chunk reachable max replicas: 3 +Recovery blocks reachable: 1858706 +Recovery blocks reachable partial: 0 0% +Fsck run time: 6.45906 sec. +Files: [fsck_state size replication type stripes recovery_stripes +stripe_size chunk_count mtime path] +Filesystem is HEALTHY +``` When there are lost or abandoned files, and/or files with placement problems, they will be placed in one of the four categories listed below. The number @@ -461,14 +486,16 @@ other attributes, followed by the header line seen below: The `fsck_state` identifies which problem category a given file is in. For example: - 1 64517075 2 2 128 0 65536 128 2012-09-22T11:36:40.597073Z /qfs/ops/jarcache/paramDB_9f68d84fac11ecfeab876844e1b71e91.sqlite.gz - 3 56433403 2 2 128 0 65536 128 2011-10-05T15:02:28.057320Z /qfs/ops/jarcache/paramDB_7912225a0775efa45e02cf0a5bb5a130.sqlite.gz - 3 55521703 2 2 128 0 65536 128 2012-08-28T15:02:07.791657Z /qfs/ops/jarcache/paramDB_f0c557f0bb36ac0375c9a8c95c0a51f8.sqlite.gz +```console +1 64517075 2 2 128 0 65536 128 2012-09-22T11:36:40.597073Z /qfs/ops/jarcache/paramDB_9f68d84fac11ecfeab876844e1b71e91.sqlite.gz +3 56433403 2 2 128 0 65536 128 2011-10-05T15:02:28.057320Z /qfs/ops/jarcache/paramDB_7912225a0775efa45e02cf0a5bb5a130.sqlite.gz +3 55521703 2 2 128 0 65536 128 2012-08-28T15:02:07.791657Z /qfs/ops/jarcache/paramDB_f0c557f0bb36ac0375c9a8c95c0a51f8.sqlite.gz +``` means there is one completely lost file, and two other files that could be lost after the failure of a single rack. -Active Checkpoint and Transaction Logs --------------------------------------- +### Active Checkpoint and Transaction Logs + In order to validate the checkpoint and transaction logs of a running metaserver, the *metaServer.checkpoint.lockFileName* parameter must be configured in the metaserver (as it is used to synchronize access to the @@ -481,39 +508,49 @@ make sure there is enough memory available on the head node to do this. 
To run this check: - qfsfsck -L metaServer.checkpoint.lockFileName -l metaServer.logDir -c metaServer.cpDir +```sh +qfsfsck -L metaServer.checkpoint.lockFileName -l metaServer.logDir -c metaServer.cpDir +``` + +### File System Check Example -Example -------- Given the following configuration: - metaServer.checkpoint.lockFileName /home/qfs0/run/ckpt.lock - metaServer.cpDir = /home/qfs0/state/checkpoint - metaServer.logDir = /home/qfs0/state/transactions +```console +metaServer.checkpoint.lockFileName /home/qfs0/run/ckpt.lock +metaServer.cpDir = /home/qfs0/state/checkpoint +metaServer.logDir = /home/qfs0/state/transactions +``` the check would be executed like so: - qfsfsck -L /home/qfs0/run/ckpt.lock -l /home/qfs0/state/transactions -c /home/qfs0/state/checkpoint +```sh +qfsfsck -L /home/qfs0/run/ckpt.lock \ + -l /home/qfs0/state/transactions \ + -c /home/qfs0/state/checkpoint +``` If everything is okay, the output will look something like this: - 09-25-2012 20:39:01.894 INFO - (restore.cc:97) restoring from checkpoint of 2012-09-25T20:00:26.971544Z - 09-25-2012 20:39:01.894 INFO - (replay.cc:63) open log file: /home/qfs0/state/transactions/log.55710 - 09-25-2012 20:39:24.010 INFO - (restore.cc:97) restoring from checkpoint of 2012-09-25T20:03:09.383993Z - 09-25-2012 20:39:24.010 INFO - (replay.cc:63) open log file: /home/qfs0/state/transactions/log.55710 - 09-25-2012 20:39:24.010 INFO - (replay.cc:559) log time: 2012-09-25T20:00:24.161876Z - 09-25-2012 20:39:24.010 INFO - (replay.cc:559) log time: 2012-09-25T20:09:43.533466Z - 09-25-2012 20:39:24.010 INFO - (replay.cc:63) open log file: /home/qfs0/state/transactions/log.55711 - 09-25-2012 20:39:24.011 INFO - (replay.cc:559) log time: 2012-09-25T20:09:43.533721Z - 09-25-2012 20:39:24.011 INFO - (replay.cc:559) log time: 2012-09-25T20:19:43.829361Z - 09-25-2012 20:39:24.011 INFO - (replay.cc:63) open log file: /home/qfs0/state/transactions/log.55712 - 09-25-2012 20:39:24.011 INFO - (replay.cc:559) log time: 2012-09-25T20:19:43.829674Z - 09-25-2012 20:39:24.012 INFO - (replay.cc:559) log time: 2012-09-25T20:29:44.712673Z +```console +09-25-2012 20:39:01.894 INFO - (restore.cc:97) restoring from checkpoint of 2012-09-25T20:00:26.971544Z +09-25-2012 20:39:01.894 INFO - (replay.cc:63) open log file: /home/qfs0/state/transactions/log.55710 +09-25-2012 20:39:24.010 INFO - (restore.cc:97) restoring from checkpoint of 2012-09-25T20:03:09.383993Z +09-25-2012 20:39:24.010 INFO - (replay.cc:63) open log file: /home/qfs0/state/transactions/log.55710 +09-25-2012 20:39:24.010 INFO - (replay.cc:559) log time: 2012-09-25T20:00:24.161876Z +09-25-2012 20:39:24.010 INFO - (replay.cc:559) log time: 2012-09-25T20:09:43.533466Z +09-25-2012 20:39:24.010 INFO - (replay.cc:63) open log file: /home/qfs0/state/transactions/log.55711 +09-25-2012 20:39:24.011 INFO - (replay.cc:559) log time: 2012-09-25T20:09:43.533721Z +09-25-2012 20:39:24.011 INFO - (replay.cc:559) log time: 2012-09-25T20:19:43.829361Z +09-25-2012 20:39:24.011 INFO - (replay.cc:63) open log file: /home/qfs0/state/transactions/log.55712 +09-25-2012 20:39:24.011 INFO - (replay.cc:559) log time: 2012-09-25T20:19:43.829674Z +09-25-2012 20:39:24.012 INFO - (replay.cc:559) log time: 2012-09-25T20:29:44.712673Z +``` otherwise `qfsfsck` will exit in error. 
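+Because `qfsfsck` exits with an error status when validation fails, the check
+is easy to automate. A minimal sketch, using the paths from the example above
+(the alerting step is a placeholder to adapt to your environment):
+
+```sh
+# verify the checkpoint and transaction logs of a running metaserver
+if ! qfsfsck -L /home/qfs0/run/ckpt.lock \
+    -l /home/qfs0/state/transactions \
+    -c /home/qfs0/state/checkpoint; then
+    echo "qfs0 checkpoint verification failed" >&2
+    # e.g. notify the on-call rotation here
+fi
+```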
-Object Store (S3) File System Integrity Verification (`qfsobjstorefsck`)
--------------------------------------------------------------------------
+### Object Store (S3) File System Integrity Verification (`qfsobjstorefsck`)
+
 The `qfsobjstorefsck` tool can be used to verify the object store (S3) block
 inventory. Object store fsck loads the checkpoint, replays transaction logs, then reads
@@ -530,8 +567,8 @@
 In other words, the correct procedure to check a "live" file system is to copy /
 save the checkpoint and transaction logs, then create the list of object store
 blocks, then run this tool.

-File System Archive
--------------------
+### File System Archive
+
 Checking a file system image backup is very similar to that of checking a
 running metaserver's checkpoint and transaction logs, except that no lock file
 (*metaServer.checkpoint.lockFileName*) is required. The backup must be
@@ -539,42 +576,48 @@
 extracted to the same set of paths from which it was archived. Therefore it
 should be extracted to the location specified by *metaServer.cpDir* and
 *metaServer.logDir* of its associated metaserver.

-Example
--------
+### Verification Example
+
 Given the following configuration:

-    metaServer.cpDir = /home/qfs0/state/checkpoint
-    metaServer.logDir = /home/qfs0/state/transactions
+```console
+metaServer.cpDir = /home/qfs0/state/checkpoint
+metaServer.logDir = /home/qfs0/state/transactions
+```

 and an archive located at: `/foo/bar/qfs0-backup-31-23.tar.gz` created from
 `/home/qfs0/state`

 The following commands can be used to verify the backup:

-    mkdir -p /home/qfs0/state
-    cd /home/qfs0/state
-    tar -xzf /foo/bar/qfs0-backup-31-23.tar.gz
-    qfsfsck -l /home/qfs0/state/transactions -c /home/qfs0/state/checkpoint
+```sh
+mkdir -p /home/qfs0/state &&
+cd /home/qfs0/state &&
+tar -xzf /foo/bar/qfs0-backup-31-23.tar.gz &&
+qfsfsck -l /home/qfs0/state/transactions -c /home/qfs0/state/checkpoint
+```

 If everything is okay, the output will look something like this:

-    09-25-2012 20:39:01.894 INFO - (restore.cc:97) restoring from checkpoint of 2012-09-25T20:00:26.971544Z
-    09-25-2012 20:39:01.894 INFO - (replay.cc:63) open log file: /home/qfs0/state/transactions/log.55710
-    09-25-2012 20:39:24.010 INFO - (restore.cc:97) restoring from checkpoint of 2012-09-25T20:03:09.383993Z
-    09-25-2012 20:39:24.010 INFO - (replay.cc:63) open log file: /home/qfs0/state/transactions/log.55710
-    09-25-2012 20:39:24.010 INFO - (replay.cc:559) log time: 2012-09-25T20:00:24.161876Z
-    09-25-2012 20:39:24.010 INFO - (replay.cc:559) log time: 2012-09-25T20:09:43.533466Z
-    09-25-2012 20:39:24.010 INFO - (replay.cc:63) open log file: /home/qfs0/state/transactions/log.55711
-    09-25-2012 20:39:24.011 INFO - (replay.cc:559) log time: 2012-09-25T20:09:43.533721Z
-    09-25-2012 20:39:24.011 INFO - (replay.cc:559) log time: 2012-09-25T20:19:43.829361Z
-    09-25-2012 20:39:24.011 INFO - (replay.cc:63) open log file: /home/qfs0/state/transactions/log.55712
-    09-25-2012 20:39:24.011 INFO - (replay.cc:559) log time: 2012-09-25T20:19:43.829674Z
-    09-25-2012 20:39:24.012 INFO - (replay.cc:559) log time: 2012-09-25T20:29:44.712673Z
+```console
+09-25-2012 20:39:01.894 INFO - (restore.cc:97) restoring from checkpoint of 2012-09-25T20:00:26.971544Z
+09-25-2012 20:39:01.894 INFO - (replay.cc:63) open log file: /home/qfs0/state/transactions/log.55710
+09-25-2012 20:39:24.010 INFO - (restore.cc:97) restoring from checkpoint of 2012-09-25T20:03:09.383993Z
+09-25-2012 20:39:24.010 INFO - (replay.cc:63) open log file: /home/qfs0/state/transactions/log.55710
+09-25-2012 20:39:24.010 INFO - (replay.cc:559) log time: 2012-09-25T20:00:24.161876Z
+09-25-2012 20:39:24.010 INFO - (replay.cc:559) log time: 2012-09-25T20:09:43.533466Z
+09-25-2012 20:39:24.010 INFO - (replay.cc:63) open log file: /home/qfs0/state/transactions/log.55711
+09-25-2012 20:39:24.011 INFO - (replay.cc:559) log time: 2012-09-25T20:09:43.533721Z
+09-25-2012 20:39:24.011 INFO - (replay.cc:559) log time: 2012-09-25T20:19:43.829361Z
+09-25-2012 20:39:24.011 INFO - (replay.cc:63) open log file: /home/qfs0/state/transactions/log.55712
+09-25-2012 20:39:24.011 INFO - (replay.cc:559) log time: 2012-09-25T20:19:43.829674Z
+09-25-2012 20:39:24.012 INFO - (replay.cc:559) log time: 2012-09-25T20:29:44.712673Z
+```

 otherwise `qfsfsck` will exit in error.

-WORM Mode
----------
+### WORM Mode
+
 WORM (or Write Once, Read Many) is a special file system mode which makes it
 impossible to delete files from the file system. This feature is useful for
 protecting critical data from deletion. The `qfstoggleworm` tool is used to
@@ -582,11 +625,15 @@
 turn WORM mode on and off.

 To turn WORM mode on, do the following:

-`qfstoggleworm -s metaServer.host -p metaServer.port -t 1`
+```sh
+qfstoggleworm -s metaServer.host -p metaServer.port -t 1
+```

 Likewise, to turn WORM mode off, do the following:

-`qfstoggleworm -s metaServer.host -p metaServer.port -t 0`
+```sh
+qfstoggleworm -s metaServer.host -p metaServer.port -t 0
+```

 When a QFS instance is running in WORM mode, a file can only be created if it
 ends with a `.tmp` suffix. Once stored in the file system, it can then be
@@ -595,22 +642,24 @@
 instance running in WORM mode, it would have to be created as `foo.bar.tmp`
 then moved into place as `foo.bar`. Once moved, the file cannot be deleted or
 modified, unless WORM mode is disabled.

-Web Reporting Interface
-=======================
+### Web Reporting Interface
+
 The QFS web interface `qfsstatus.py` provides a rich set of real-time
 information, which can be used to monitor file system instances.

-Configuration
--------------
+### Configuration
+
 The web server configuration is normally stored in a file called `webUI.cfg`.
 See the [[Configuration Reference]] for a complete set of web UI configuration
 parameters. Also the sample servers used in the examples include a typical web
 UI configuration. Running:

-`qfsstatus.py /path/to/webUI.cfg`
+```sh
+qfsstatus.py /path/to/webUI.cfg
+```
+
+### Interface

-
-Interface
----------
 The following sample image of the web reporting interface is for a QFS instance
 with 1 metaserver and 1 chunk server, configured with three chunk directories.
 The host file system size is ~18 GB, out of which ~10 GB is used (not by QFS)
@@ -620,21 +669,21 @@
 and ~8 GB is available.

 The following table describes some UI elements:

-| vega:20000 | The metaserver host name and port number. |
-| ---------- | ----------------------------------------- |
-| Chunk Servers Status | Opens a page with chunk servers statistics. One could select various chunk server parameters to be displayed, the refresh interval and delta. |
-| Metaserver Status | Opens a page with metaserver statistics. One could select various metaserver parameters to be displayed, the refresh interval and delta. |
-| Total space | Total space of the host file system(s) where QFS stores chunks. |
-| Used space | Space used by QFS. |
-| Free space | Available space in the host file system(s) where QFS stores chunks. When the free space becomes less than a % threshold (given by a metaserver configuration value) the metaserver stops using this chunk directory for chunk placement.|
-| WORM mode | Status of the write-once-read-many mode. |
-| Nodes | Number of nodes in different states in the file system. |
-| Replications | Number of chunks in various states of replication. |
-| Allocations | File system-wide count of QFS clients, chunk servers, and so on. |
-| Allocations b+tree | Internal b+tree counters. In this example, root directory + dumpster directory make up the 2 in fattr. |
-| Chunk placement candidates | Out of all chunk servers, how many are used for chunk placement, which are assigned racks. |
-| Disks | Number of disks in the file system. **Note**: our recommendation is to use one chunk directory per physical disk. |
-| All Nodes | Table of one row per chunk server, describing a summary for each chunk server. | PING

+| vega:20000                 | The metaserver host name and port number. |
+| -------------------------- | ----------------------------------------- |
+| Chunk Servers Status       | Opens a page with chunk server statistics. One could select various chunk server parameters to be displayed, the refresh interval and delta. |
+| Metaserver Status          | Opens a page with metaserver statistics. One could select various metaserver parameters to be displayed, the refresh interval and delta. |
+| Total space                | Total space of the host file system(s) where QFS stores chunks. |
+| Used space                 | Space used by QFS. |
+| Free space                 | Available space in the host file system(s) where QFS stores chunks. When the free space becomes less than a % threshold (given by a metaserver configuration value) the metaserver stops using this chunk directory for chunk placement. |
+| WORM mode                  | Status of the write-once-read-many mode. |
+| Nodes                      | Number of nodes in different states in the file system. |
+| Replications               | Number of chunks in various states of replication. |
+| Allocations                | File system-wide count of QFS clients, chunk servers, and so on. |
+| Allocations b+tree         | Internal b+tree counters. In this example, the root directory + dumpster directory make up the 2 in fattr. |
+| Chunk placement candidates | Out of all chunk servers, how many are used for chunk placement, and which are assigned racks. |
+| Disks                      | Number of disks in the file system. **Note**: our recommendation is to use one chunk directory per physical disk. |
+| All Nodes                  | Table of one row per chunk server, describing a summary for each chunk server. |

 A metaserver or chunk server ping can be used to dump the file system status.
 All of the information presented by the web interface is available via a ping.
@@ -642,42 +691,51 @@
 This makes it fairly easy to build automation around a QFS file system.

 You can use `qfsping` to ping a metaserver:

-`qfsping -m -s metaServer.hostname -p metaServer.portg`
+```sh
+qfsping -m -s metaServer.hostname -p metaServer.port
+```

 The command is similar for a chunk server ping:

-`qfsping -c -s chunkServer.hostname -p chunkServer.port`
+```sh
+qfsping -c -s chunkServer.hostname -p chunkServer.port
+```

 Parsing the output of a ping is beyond the scope of this document, but the
 Python web interface `qfsstatus.py` provides an example of this and more.
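+For simple liveness monitoring it is often enough to check whether the ping
+succeeds at all. A hedged sketch, assuming `qfsping` exits with a non-zero
+status when the metaserver cannot be reached:
+
+```sh
+# basic reachability check; adapt hostname and port to your deployment
+if qfsping -m -s metaServer.hostname -p metaServer.port > /dev/null 2>&1; then
+    echo "metaserver is up"
+else
+    echo "metaserver is not responding"
+fi
+```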
-Ugrading from previous release (`logcompactor`)
------------------------------------------------
+### Upgrading from a Previous Release (`logcompactor`)
+
 The recommended procedure is to use the logcompactor from the previous release
 to create a checkpoint and possibly a single log segment, then use the
 logcompactor from the new release to convert the file system metadata into the
 new format.

-`logcomactor -l state/transactions -c state/checkpoint -T state/transactions_new -C state/checkpoint_new`
+```sh
+logcompactor -l state/transactions -c state/checkpoint \
+    -T state/transactions_new -C state/checkpoint_new
+```
+
+## Chunk Server

-Chunk Server
-============
 The chunk server is the workhorse of the QFS file system and is responsible for
 storing and retrieving file chunk data. This section will discuss basic chunk
 server administration.

-Configuration
--------------
+### Chunk Server Configuration
+
 The chunk server configuration is normally stored in a file called
 `ChunkServer.prp`. The [[Deployment Guide]] includes several minimal sample
 configurations. For the complete set of configuration parameters see the
 [[Configuration Reference]].

-Running
--------
-`chunkserver /path/to/ChunkServer.prp`
+### Running Chunk Server
+
+```sh
+chunkserver /path/to/ChunkServer.prp
+```
+
+### Hibernation

-Hibernation
------------
 Hibernation is used to temporarily take a chunk server offline, such as for
 maintenance of the physical server. When a chunk server is hibernated, the
 metaserver will not actively attempt to re-replicate or recover chunks hosted by
@@ -687,37 +745,44 @@
 chunks will be passively recovered if they're necessary to fulfill a request.

 This feature is useful in preventing replication/recovery storms when
 performing node or rack level maintenance.

-qfshibernate
-------------
+### qfshibernate
+
 The `qfshibernate` tool is used to hibernate a chunk server:

-`qfshibernate -m chunkServer.metaServer.hostname -p chunkServer.metaServer.port -c chunkServer.hostname -d chunkServer.clientPort -s delay (in seconds)`
+```sh
+qfshibernate -m chunkServer.metaServer.hostname -p chunkServer.metaServer.port \
+    -c chunkServer.hostname -d chunkServer.clientPort -s delay # (in seconds)
+```
+
+### Chunk Server Hibernate Example

-Example
--------
 Given the following chunk server configuration:

-    chunkServer.metaServer.hostname = 192.168.1.1
-    chunkServer.metaServer.port = 10000
+```properties
+chunkServer.metaServer.hostname = 192.168.1.1
+chunkServer.metaServer.port = 10000
+```

 To hibernate a chunk server at 192.168.10.20 (*chunkServer.hostname*) running
 on a client port of 1635 (*chunkServer.clientPort*) for 30 minutes, one would
 execute the following command:

-    qfshibernate -m 192.168.1.1 -p 10000 -c 192.168.10.20 -d 1635 -s 1800
+```sh
+qfshibernate -m 192.168.1.1 -p 10000 -c 192.168.10.20 -d 1635 -s 1800
+```

 This would instruct the metaserver at 192.168.1.1:10000 to hibernate the chunk
 server at 192.168.10.20:1635 for 1800 seconds or 30 minutes. Upon hibernation
 the chunk server will exit.

-Notes
------
+### Notes
+
 - Running qfshibernate again with the same chunk server will update the
   hibernation window.
 - The longer the hibernation window, the greater the likelihood of data loss. A
   window of no more than an hour is recommended for this reason.

-Evacuation
-----------
+### Evacuation
+
 Evacuation can be used to permanently or temporarily retire a chunk server
 volume. It is recommended that evacuation be used instead of hibernation if the
 expected down time exceeds one hour.
@@ -728,21 +793,25 @@
 safely remove all chunks from each chunk directory where the *evacuate* file is
 present. Once a chunk directory is evacuated, the chunk server will rename the
 *evacuate* file to *evacuate.done*.

-Example
--------
+### Example
+
 To evacuate a chunk server with the following chunk directories configured:

-`chunkServer.chunkDir /mnt/data0/chunks /mnt/data1/chunks /mnt/data2/chunks`
+```properties
+chunkServer.chunkDir /mnt/data0/chunks /mnt/data1/chunks /mnt/data2/chunks
+```

 one could use the following script:

-    1.!/bin/bash
-    for data in /mnt/data*; do
-        chunkdir=$data/chunks
-        if [ -e $chunkdir ]; then
-            touch $chunkdir/evacuate
-        fi
-    done
+```sh
+#!/bin/bash
+# drop an evacuate marker file into each configured chunk directory
+for data in /mnt/data*; do
+    chunkdir=$data/chunks
+    if [ -e $chunkdir ]; then
+        touch $chunkdir/evacuate
+    fi
+done
+```

 This will cause the chunk server to evacuate all chunks from
 `/mnt/data0/chunks`, `/mnt/data1/chunks`, and `/mnt/data2/chunks`. As each chunk
@@ -751,7 +820,9 @@ directory is evacuated, the chunk server will rename its *evacuate* file to

 To check the status of the evacuation:

-`cd /mnt && find -name evacuate.done | wc -l`
+```sh
+cd /mnt && find . -name evacuate.done | wc -l
+```

 Once the count returned equals 3, all chunk directories have been evacuated and
 it's safe to stop the chunk server.

@@ -759,29 +830,28 @@ it's safe to stop the chunk server.

 **Note**: the metaserver web UI will also list all chunk server evacuations and
 their status.

-Client Tools
-============
+## Client Tools
+
 QFS includes a set of client tools to make it easy to access the file system.
 This section describes those tools.

-| Tool | Purpose | Notes |
-| ---- | ------- | ----- |
-|`cpfromqfs`| Copy files from QFS to a local file system or to stdout | Supported options: skipping holes, setting of write buffer size, start and end offsets of source file, read ahead size, op retry count, retry delay and retry timeouts, partial sparse file support. See `./cpfromqfs -h` for more.|
-|`cptoqfs`| Copy files from a local file system or stdin to QFS | Supported options: setting replication factor, data and recovery stripe counts, stripes size, input buffer size, QFS write buffer size, truncate/delete target files, create exclusive mode, append mode, op retry count, retry delay and retry timeouts. See `./cptoqfs -h` for more.|
-|`qfscat`| Output the contents of file(s) to stdout | See `./qfscat -h` for more information.|
-|`qfsput`| Reads from stdin and writes to a given QFS file |See `./qfsput -h` for more information.|
-|`qfsdataverify`| Verify the replication data of a given file in QFS| The `-c` option compares the checksums of all replicas. The `-d` option verifies that all N copies of each chunk are identical. Note that for files with replication 1, this tool performs **no** verification. See `./qfsdataverify -h` for more.|
-|`qfsfileenum`| Prints the sizes and locations of the chunks for the given file| See `./qfsfileenum -h` for more information.|
-|`qfsping`| Send a ping to metaserver or chunk server | Doing a metaserver ping returns list of chunk servers that are up and down. It also returns the usage stats of each up chunk server.\\Doing a chunk server ping returns a the chunk server stats. See `./qfsping -h` for more.|
-|`qfshibernate`| Hibernates a chunk server for the given number of seconds | See `./qfshibernate -h` for more information.|
-|`qfsshell`| Opens a simple client shell to execute QFS commands | By default this opens an interactive shell. One can bypss the interactive shell and execute commands directly by using the `-q` option. See `./qfsshell -h` for more.|
-|`qfsstats`|Reports qfs statistics | The `-n` option is used to control the interval between reports. The RPC stats are also reported if the `-t` option is used. See `./qfsstats -h` for more.|
-|`qfstoggleworm`|Set the WORM (write once read many) mode of the file system | |
-
-Related Documents
-=================
+| Tool            | Purpose | Notes |
+| --------------- | ------- | ----- |
+| `cpfromqfs`     | Copy files from QFS to a local file system or to stdout | Supported options: skipping holes, setting of write buffer size, start and end offsets of the source file, read ahead size, op retry count, retry delay and retry timeouts, partial sparse file support. See `./cpfromqfs -h` for more. |
+| `cptoqfs`       | Copy files from a local file system or stdin to QFS | Supported options: setting the replication factor, data and recovery stripe counts, stripe size, input buffer size, QFS write buffer size, truncate/delete target files, create exclusive mode, append mode, op retry count, retry delay and retry timeouts. See `./cptoqfs -h` for more. |
+| `qfscat`        | Output the contents of file(s) to stdout | See `./qfscat -h` for more information. |
+| `qfsput`        | Reads from stdin and writes to a given QFS file | See `./qfsput -h` for more information. |
+| `qfsdataverify` | Verify the replication data of a given file in QFS | The `-c` option compares the checksums of all replicas. The `-d` option verifies that all N copies of each chunk are identical. Note that for files with replication 1, this tool performs **no** verification. See `./qfsdataverify -h` for more. |
+| `qfsfileenum`   | Prints the sizes and locations of the chunks for the given file | See `./qfsfileenum -h` for more information. |
+| `qfsping`       | Send a ping to a metaserver or chunk server | A metaserver ping returns a list of chunk servers that are up and down, along with the usage stats of each up chunk server. A chunk server ping returns the chunk server stats. See `./qfsping -h` for more. |
+| `qfshibernate`  | Hibernates a chunk server for the given number of seconds | See `./qfshibernate -h` for more information. |
+| `qfsshell`      | Opens a simple client shell to execute QFS commands | By default this opens an interactive shell. One can bypass the interactive shell and execute commands directly by using the `-q` option. See `./qfsshell -h` for more. |
+| `qfsstats`      | Reports QFS statistics | The `-n` option is used to control the interval between reports. The RPC stats are also reported if the `-t` option is used. See `./qfsstats -h` for more. |
+| `qfstoggleworm` | Set the WORM (write once read many) mode of the file system | |
+
+## Related Documents
+
 - [[Deployment Guide]]
 - [[Configuration Reference]]

 ![Quantcast](//pixel.quantserve.com/pixel/p-9fYuixa7g_Hm2.gif?labels=opensource.qfs.wiki)
diff --git a/wiki/Binary-Distributions.md b/wiki/Binary-Distributions.md
index b6ecc7cb9..6c772112d 100644
--- a/wiki/Binary-Distributions.md
+++ b/wiki/Binary-Distributions.md
@@ -6,7 +6,7 @@ QFS](//github.com/quantcast/qfs#trying-qfs) for how to use them.
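+For example, to fetch and unpack one of the tarballs linked in the table below
+(here the Debian 10 build of release 2.2.7, using the URL from the link list at
+the bottom of this page):
+
+```sh
+curl -O https://s3.amazonaws.com/quantcast-qfs/qfs-debian-10-2.2.7-x86_64.tgz
+tar -xzf qfs-debian-10-2.2.7-x86_64.tgz
+```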
| Release | Ubuntu 14.04 (Trusty Tahr) | Ubuntu 16.04 LTS (Xenial Xerus) | Ubuntu 18.04 LTS (Bionic Beaver) | Ubuntu 20.04.2.0 LTS (Focal Fossa) | Ubuntu 22.04 LTS (Jammy Jellyfish) | Ubuntu 24.04 LTS x86_64 | Ubuntu 24.04 LTS arm64 | Debian 9 Stretch | Debian 10 Buster | Debian 11 Bullseye | Debian 12 Bookworm x86_64 | Debian 12 Bookworm arm64 | CentOS 6 | CentOS 7 | CentOS 8 | Rocky Linux 9 x86_64 | Rocky Linux 9 arm64 | Amazon Linux 2023 x86_64 | Amazon Linux 2023 arm64 | Mac OS X | Mac OS X arm64 | | ---------------- | ------------------------------ | ------------------------------- | -------------------------------- | ---------------------------------- | ---------------------------------- | ------------------------------ | ------------------------------------ | ------------------------- | --------------------------- | --------------------------- | --------------------------- | --------------------------------- | -------------------------- | -------------------------- | -------------------------- | ------------------------- | ------------------------------- | ----------------------------- | ----------------------------------- | ------------------------ | ------------------------------ | -| [master][master] | [download][master,ubuntu14.04] | [download][master,ubuntu16.04] | [download][master,ubuntu18.04] | [download][master,ubuntu20.04] | [download][master,ubuntu22.04] | [download][master,ubuntu24.04] | [download][master,ubuntu24.04-arm64] | End of life | [download][master,debian10] | [download][master,debian11] | [download][master,debian12] | [download][master,debian12-arm64] | [download][master,centos6] | [download][master,centos7] | [download][master,centos8] | [download][master,rocky9] | [download][master,rocky9-arm64] | [download][master,amazon2023] | [download][master,amazon2023-arm64] | [download][master,macos] | [download][master,macos-arm64] | +| [master][master] | [download][master,ubuntu14.04] | [download][master,ubuntu16.04] | [download][master,ubuntu18.04] | [download][master,ubuntu20.04] | [download][master,ubuntu22.04] | [download][master,ubuntu24.04] | [download][master,ubuntu24.04-arm64] | End of life | End of life | [download][master,debian11] | [download][master,debian12] | [download][master,debian12-arm64] | [download][master,centos6] | [download][master,centos7] | [download][master,centos8] | [download][master,rocky9] | [download][master,rocky9-arm64] | [download][master,amazon2023] | [download][master,amazon2023-arm64] | [download][master,macos] | [download][master,macos-arm64] | | [2.2.7][2.2.7] | [download][2.2.7,ubuntu14.04] | [download][2.2.7,ubuntu16.04] | [download][2.2.7,ubuntu18.04] | [download][2.2.7,ubuntu20.04] | [download][2.2.7,ubuntu22.04] | [download][2.2.7,ubuntu24.04] | | End of life | [download][2.2.7,debian10] | [download][2.2.7,debian11] | [download][2.2.7,debian12] | | [download][2.2.7,centos6] | [download][2.2.7,centos7] | [download][2.2.7,centos8] | [download][2.2.7,rocky9] | | | | [download][2.2.7,macos] | [download][2.2.7,macos-arm64] | | [2.2.6][2.2.6] | [download][2.2.6,ubuntu14.04] | [download][2.2.6,ubuntu16.04] | [download][2.2.6,ubuntu18.04] | [download][2.2.6,ubuntu20.04] | [download][2.2.6,ubuntu22.04] | | | End of life | [download][2.2.6,debian10] | | | | [download][2.2.6,centos6] | [download][2.2.6,centos7] | [download][2.2.6,centos8] | | | | | [download][2.2.6,macos] | | | [2.2.5][2.2.5] | [download][2.2.5,ubuntu14.04] | [download][2.2.5,ubuntu16.04] | [download][2.2.5,ubuntu18.04] | [download][2.2.5,ubuntu20.04] | | | | 
[download][2.2.5,debian9] | [download][2.2.5,debian10] | | | | [download][2.2.5,centos6] | [download][2.2.5,centos7] | [download][2.2.5,centos8] | | | | | [download][2.2.5,macos] | |
@@ -121,7 +121,6 @@ QFS](//github.com/quantcast/qfs#trying-qfs) for how to use them.
 [2.1.2,debian9]: https://s3.amazonaws.com/quantcast-qfs/qfs-debian-9-2.1.2-x86_64.tgz
 [2.1.1,debian9]: https://s3.amazonaws.com/quantcast-qfs/qfs-debian-9-2.1.1-x86_64.tgz
 [2.1.0,debian9]: https://s3.amazonaws.com/quantcast-qfs/qfs-debian-9-2.1.0-x86_64.tgz
-[master,debian10]: https://s3.amazonaws.com/quantcast-qfs/qfs-debian-10-master-x86_64.tgz
 [2.2.7,debian10]: https://s3.amazonaws.com/quantcast-qfs/qfs-debian-10-2.2.7-x86_64.tgz
 [2.2.6,debian10]: https://s3.amazonaws.com/quantcast-qfs/qfs-debian-10-2.2.6-x86_64.tgz
 [2.2.5,debian10]: https://s3.amazonaws.com/quantcast-qfs/qfs-debian-10-2.2.5-x86_64.tgz
diff --git a/wiki/Code-Contribution-Policy.md b/wiki/Code-Contribution-Policy.md
index 5e351dd0a..532ce9269 100644
--- a/wiki/Code-Contribution-Policy.md
+++ b/wiki/Code-Contribution-Policy.md
@@ -1,52 +1,54 @@
+# Code Contribution Policy
+
 Thank you for your interest in contributing to qfs! We welcome code
 contributions to qfs. Please make them in the form of pull requests from your
 fork, and bear in mind the following guidelines and suggestions.

-Before Starting Work
---------------------
+## Before Starting Work
+
 Before starting your work, make sure to check our [issue tracker][it] for
 tickets regarding your suggested change. It's possible someone could have
 already reported your bug or suggestion and the community has already discussed
 it in detail.

 If your code change is complex, please consider communicating with the QFS community
-at qfs-devel@googlegroups.com to get feedback before submitting the pull
+at <qfs-devel@googlegroups.com> to get feedback before submitting the pull
 request. This will save you time and effort in case there are design
 discussions to be had.

-Submitting Changes
-------------------
+## Submitting Changes
+
 Please submit code changes as [pull requests][pr]. A separate pull request
 should be submitted for each separate issue. In addition, for each commit you
 make, ensure the following:

-  - Each commit deals with only a single idea.
-  - Commit messages have a summary line that begins with its scope (eg: `fuse:
+- Each commit deals with only a single idea.
+- Commit messages have a summary line that begins with its scope (eg: `fuse:
   fix compiler warning`). Depending on the complexity of the change, a detailed
   description of the commit should follow in the next lines.
-  - If you have multiple small commits, consider [rebasing][rebase] to
+- If you have multiple small commits, consider [rebasing][rebase] to
   consolidate them into a single commit.

 Furthermore, when making your pull request, ensure the following:

-  - Your change is based on the current master branch (rebase if necessary).
-  - The code builds properly and tests all pass
-  - You have added appropriate unit and integration tests for your
-    changes/fixes. See the [[Developer-Documentation]] for more information on
+- Your change is based on the current master branch (rebase if necessary).
+- The code builds properly and all tests pass.
+- You have added appropriate unit and integration tests for your
+  changes/fixes. See the [[Developer Documentation]] for more information on
   writing unit and integration tests.
-  - Code changes are well documented and readable.
You should update the +- Code changes are well documented and readable. You should update the documentation within the wiki directory if necessary. - - Your changes conform to the style guide listed below. - - In the pull request comments, please describe the code change and any tests +- Your changes conform to the style guide listed below. +- In the pull request comments, please describe the code change and any tests you’ve done. Pull requests will be reviewed and accepted (or responded to) by Quantcast as soon as possible. -Style Guide ------------ +## Style Guide + QFS is written in C++. We try to follow the style guide below as much as possible. QFS has evolved over time, so there may be existing source files that do not adhere to this style guide, but all new code follows it and we're in the @@ -57,7 +59,8 @@ Any new C++ code added should conform to the style and rules outlined below. **Golden Rule**: functions, variables, and file names should be descriptive. -### General guidlines +### General Guidelines + - Files should have line lengths of 80 characters. - No trailing white spaces. - No tabs; use spaces. Indent 4 spaces at a time. @@ -85,20 +88,24 @@ Any new C++ code added should conform to the style and rules outlined below. - Classes should have public, protected, and private sections in that order. ### Typedefs and Class Names + - Typedefs: Camel case (eg: `MyTimeType`) - Class/Struct names: Camel case (eg: `class ChunkServer`) ### File names + - Class-implementation files: UpperCase (eg: `KfsClient.cc/h`) - Non-class-implementation files, with main: lower\_case (eg: `server_main.cc`) - Non-class-implementation files, w/o main: lower_case (eg: `kfsops.cc/h`) ### Methods + - Global: Please do not add global methods. - Namespace Scope: Camel case (eg: `GetSampleSeq()`) - Class method: Camel case (eg: `GetClassName()`) ### Variables + - Local variables: Use camel case, e.g. `camelCase` - Class member: Use camel case with an `m` prefix, e.g. `mMemberName` - Static variable: Use camel case with an `s` prefix, e.g. `sReaderCount` diff --git a/wiki/Configuration-Reference.md b/wiki/Configuration-Reference.md index e1eedfefb..8a4f4e546 100644 --- a/wiki/Configuration-Reference.md +++ b/wiki/Configuration-Reference.md @@ -1,74 +1,64 @@ -Configuration Reference -======================= +# Configuration Reference + +## Client Tool -Client Tool ------------ The following parameters may be included in the configuration file passed to the -qfs client tool (e.g. `bin/tools/qfs`). +QFS client tool (e.g. `bin/tools/qfs`). 
+
+| parameter                | default                    | description |
+| ------------------------ | -------------------------- | ----------- |
+| fs.msgLogWriter.logLevel | INFO                       | trace log level, one of the following: DEBUG, INFO, NOTICE, ERROR, CRIT, ALERT, FATAL |
+| fs.trash.minPathDepth    | 5                          | files or directories whose path depth is less than this value will not be moved into the trash unless dfs.force.remove=true |
+| fs.trash.current         | Current                    | the name of the current trash checkpoint directory |
+| fs.trash.homesPrefix     | /user                      | home directories prefix |
+| fs.trash.interval        | 60                         | interval in seconds between emptier runs |
+| fs.trash.trash           | .Trash                     | the name of the trash directory |
+| dfs.force.remove         | false                      | see fs.trash.minPathDepth |
+| fs.euser                 | process effective id       | numeric effective user id |
+| fs.egroup                | process effective group id | numeric effective group id |

-| parameter | default | description |
-| --------- | ------- | ----------- |
-| fs.msgLogWriter.logLevel | INFO | trace log level, one of the following: DEBUG, INFO, NOTICE, ERROR, CRIT, ALERT, FATAL
-| fs.trash.minPathDepth | 5 | file or directories that path depth less than this value will not be moved into trash unless dfs.force.remove=true
-| fs.trash.current | Current | the name of the current trash checkpoint directory
-| fs.trash.homesPrefix | /user | home directories prefix
-| fs.trash.interval | 60 | interval in seconds between emptier runs
-| fs.trash.trash | .Trash | the name of the trash directory
-| dfs.force.remove | false | see fs.trash.minPathDepth
-| fs.euser | process effective id | Numeric effective user id
-| fs.egroup | process effective group id | Numeric effective group id

-Metaserver
-----------
-An annotated configuration file for metaserver can be found
-[here](https://github.com/quantcast/qfs/blob/master/conf/MetaServer.prp).
-The configuration file includes a description and a default value for each
+## Metaserver
+
+See the [metaserver annotated configuration file](https://github.com/quantcast/qfs/blob/master/conf/MetaServer.prp).
+The configuration file includes a description and a default value for each
 metaserver configuration parameter.

 The metaserver configuration file is organized into two sections of parameters.
 The parameters in the first section are static in the sense that they cannot be
-modified at run time but require a metaserver restart for the changes
-to take effect. On the other hand, the parameters in the second section
-can be changed and start to take effect at run time by editing the configuration
-file and sending the metaserver process a SIGHUP signal.
-Please look for the line in the metaserver configuration file
-that starts with "The parameters below this line" to be able to tell
-the different sections.
+modified at run time but require a metaserver restart for the changes to take
+effect. On the other hand, the parameters in the second section can be changed
+and start to take effect at run time by editing the configuration file and
+sending the metaserver process a SIGHUP signal. Please look for the line in the
+metaserver configuration file that starts with "The parameters below this line"
+to be able to tell the different sections.

 **NOTE:** In order to restore a parameter to its default at run time, the
 default value must be explicitly set in the configuration file. In other words,
 commenting out the parameter will not have any effect until a restart.
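+For example, after editing a parameter from the second section, the running
+metaserver can be told to re-read its configuration with a SIGHUP (the `pidof`
+lookup is illustrative; if the metaserver runs under a service manager, use its
+reload mechanism instead):
+
+```sh
+kill -HUP $(pidof metaserver)
+```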
-Chunk Server ------------- +## Chunk Server -An annotated configuration file for chunk server can be found -[here](https://github.com/quantcast/qfs/blob/master/conf/ChunkServer.prp). -The configuration file includes a description and a default value for each +[Chunk server annotated configuration](https://github.com/quantcast/qfs/blob/master/conf/ChunkServer.prp). +The configuration file includes a description and a default value for each chunk server configuration parameter. -Chunk server parameters are categorized into two. -The first set of parameters are static in the sense -that a chunk server restart is required for any changes on them -to take effect. These parameters are defined in chunk server -configuration file only. The second set of parameters -are the ones which can be changed dynamically at run time. -These parameters can be defined both in chunk server -and metaserver configuration file. Configuration parameters -in the metaserver configuration file take precedence over the parameters -defined in chunk server configuration files, i.e. the -values defined in chunk server configuration file are overridden -dynamically by the metaserver during the initial handshake. -Please look for the line in metaserver configuration file -that reads "Chunk servers configuration parameters." to see -the list of these parameters. +Chunk server parameters are categorized into two. The first set of parameters +are static in the sense that a chunk server restart is required for any changes +on them to take effect. These parameters are defined in chunk server +configuration file only. The second set of parameters are the ones which can be +changed dynamically at run time. These parameters can be defined both in chunk +server and metaserver configuration file. Configuration parameters in the +metaserver configuration file take precedence over the parameters defined in +chunk server configuration files, i.e. the values defined in chunk server +configuration file are overridden dynamically by the metaserver during the +initial handshake. Please look for the line in metaserver configuration file +that reads "Chunk servers configuration parameters." to see the list of these +parameters. -Web Monitor ------------ +## Web Monitor -An annotated configuration file for the web reporting interface can be -found [here](https://github.com/quantcast/qfs/blob/master/webui/server.conf). +[Web reporting interface annotated configuration file](https://github.com/quantcast/qfs/blob/master/webui/server.conf). **NOTE:** One should not have to modify the settings in the `[chunk]` section of the web server configuration file. diff --git a/wiki/Deployment-Guide.md b/wiki/Deployment-Guide.md index 5fedd4ffe..065373e91 100644 --- a/wiki/Deployment-Guide.md +++ b/wiki/Deployment-Guide.md @@ -1,3 +1,5 @@ +# QFS Deployment Guide + The Quantcast File System (QFS) is designed to run on a cluster of nodes built with commodity hardware, which includes desktop grade disk drives and 1- to 10-gigabit ethernet interconnects. The design is focused on fault tolerance, @@ -12,8 +14,8 @@ encoding must be specified; QFS does not impose a default method. The deployments discussed in this document maximize fault tolerance for when either method is most commonly used. 
-Components ----------- +## Components + QFS consists of several software components: - **metaserver**: The brains of the operation, the *metaserver* hosts the file @@ -25,8 +27,8 @@ QFS consists of several software components: ![Communication Flows](images/Deployment-Guide/qfs-communication-flows.png) -Metaserver ---------- +## Metaserver + The metaserver stores the file system image in memory. This includes: - All file metadata including directory structure, file names, mtime, etc ... @@ -51,8 +53,8 @@ file system status including (but not limited to): - Capacity information. - General file system health information. -Chunk Server ------------- +## Chunk Server + The chunk server is the workhorse of QFS. All QFS client I/O operations go through the chunk servers, which store data chunks of up to 64 megabytes in size on their host file systems. The chunk servers also replicate and recover chunks @@ -69,8 +71,8 @@ servers use XFS direct I/O which bypasses the OS buffer cache to improve performance. The XFS file space reservation feature is used to minimize fragmentation. -Clients -------- +## Clients + QFS ships with several client tools: - A Java interface for Hadoop @@ -81,9 +83,10 @@ The client communicates with the metaserver to create files, modify file attributes, and to retrieve file-to-chunk mapping information, which it then uses to access file chunks via the chunk servers. -Simple Cluster --------------- +## Simple Cluster + ### Using Chunk Replication + The simplest cluster configuration is replication mode. When a file is created with replication *k*, chunks will be replicated *k* times. The downside to this is that you also will need *k* times as much storage. In the example below, a @@ -106,50 +109,55 @@ chunk replica. The *metaServer.rackPrefixes* [[configuration parameter|Configuration-Reference]] is used to organize placement groups. 
-#### Configuration +#### Simple Cluster Configuration -##### MetaServer.prp +##### Simple Cluster MetaServer.prp - # port used by clients to connect to the metaserver - metaServer.clientPort = 20000 +```properties +# port used by clients to connect to the metaserver +metaServer.clientPort = 20000 - # port used by chunk servers to connect to the metaserver - metaServer.chunkServerPort = 30000 +# port used by chunk servers to connect to the metaserver +metaServer.chunkServerPort = 30000 - # chunk placement groups by IP address or first three octets - metaServer.rackPrefixes = 192.168.1.1 1 192.168.1.2 2 192.168.1.3 3 +# chunk placement groups by IP address or first three octets +metaServer.rackPrefixes = 192.168.1.1 1 192.168.1.2 2 192.168.1.3 3 - # create new file system if no transaction logs or checkpoints are found - metaServer.createEmptyFs = 1 +# create new file system if no transaction logs or checkpoints are found +metaServer.createEmptyFs = 1 - # location to write transaction logs - metaServer.logDir = /home/qfsm/transaction_logs +# location to write transaction logs +metaServer.logDir = /home/qfsm/transaction_logs - # location to write checkpoints, this needs be pruned periodically - metaServer.cpDir = /home/qfsm/checkpoint +# location to write checkpoints, this needs be pruned periodically +metaServer.cpDir = /home/qfsm/checkpoint - # unique cluster id - metaServer.clusterKey = my-fs-unique-identifier +# unique cluster id +metaServer.clusterKey = my-fs-unique-identifier +``` -##### ChunkServer.prp +##### Simple Cluster ChunkServer.prp + +```properties +# address of the metaserver, host names should not be used +chunkServer.metaServer.hostname 192.168.0.1 - # address of the metaserver, host names should not be used - chunkServer.metaServer.hostname 192.168.0.1 +# metaserver port for chunk server to use +chunkServer.metaServer.port = 30000 - # metaserver port for chunk server to use - chunkServer.metaServer.port = 30000 +# chunk server client listener port +chunkServer.clientPort = 22000 - # chunk server client listener port - chunkServer.clientPort = 22000 +# locations to store chunk data, independent spindles should be +# used +chunkServer.chunkDir = /mnt/data0 /mnt/data1 - # locations to store chunk data, independent spindles should be - used - chunkServer.chunkDir = /mnt/data0 /mnt/data1 +# unique cluster id +chunkServer.clusterKey = my-fs-unique-identifier +``` - # unique cluster id - chunkServer.clusterKey = my-fs-unique-identifier +##### Simple Cluster Notes -##### Notes - DNS based host names are not supported; instead, IPv4 addresses should be used. - The metaserver checkpoint directory (*metaServer.cpDir*) needs to be @@ -161,6 +169,7 @@ placement groups. and *clusterKey*. ### Using Reed-Solomon Encoding + Using Reed-Solomon encoding for fault tolerance is far more space efficient than replication. Rather than writing multiple copies of each chunk, the client library generates parity information that allows the chunk to be reconstructed @@ -178,11 +187,12 @@ the client sets the fault tolerance of each file individually when it is created. 
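+The encoding is chosen by the client when a file is created. As a sketch, the
+`qfs` client tool can pass create parameters through the `fs.createParams`
+setting; the tuple shown here (replication, data stripes, recovery stripes,
+stripe size, striper type) is an assumption to verify against `qfs -h` for
+your build:
+
+```sh
+# copy a local file into QFS as a Reed-Solomon 6+3 striped file
+qfs -fs qfs://192.168.0.1:20000 \
+    -D fs.createParams=1,6,3,65536,2 \
+    -cp /tmp/localfile /qfs/rs-encoded-file
+```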
#### Reed-Solomon \

-| Value | Description |
-| ----- | ----------- |
-| k | replication factor, normally 1 with n 3, and more than 1 with n == 0 |
-| m | data stripe count, valid range is 1 to 64 with n == 3, and 255 with n == 0 |
-| n | recovery stripe count, only 0 and 3 are presently valid, with n == 0 pure striping -- no recovery |
+
+| Value | Description |
+| ----- | ----------------------------------------------------------------------------------------------- |
+| k     | replication factor, normally 1 with n == 3, and more than 1 with n == 0 |
+| m     | data stripe count, valid range is 1 to 64 with n == 3, and 255 with n == 0 |
+| n     | recovery stripe count, only 0 and 3 are presently valid, with n == 0 pure striping -- no recovery |

 Currently the only extensively tested encodings are *\* and replication 3
 (expressed as *\*). QFS supports increasing the number
@@ -193,23 +203,27 @@
 This means that the same number of recovery stripes are responsible for a
 larger number of data stripes.

 #### Replication vs Reed-Solomon
+
 The currently supported RS encoding is *\*; this uses only 50% more
 space than the chunk data, but allows for the loss of three chunks. By
 comparison, chunk replication with a factor of three (*\*), uses 200% more
 space than the chunk data and only allows for the loss of 2 chunks.

-| encoding | file size | space used | fault tolerance |
-| -------- | --------- | ---------- | --------------- |
-| replication 3 \ | 6 MB | 24 MB | up to 3 chunks |
-| \ | 6 MB | 9 MB | up to 3 chunks |
+| encoding         | file size | space used | fault tolerance |
+| ---------------- | --------- | ---------- | --------------- |
+| replication 3 \  | 6 MB      | 24 MB      | up to 3 chunks  |
+| \                | 6 MB      | 9 MB       | up to 3 chunks  |

 **Some useful formulas:**

-    disk usage = file size x ((data stripes + recovery stripes) / data stripes) x replication)
-    effective capacity = raw capacity x (data stripes / ((data stripes + recovery stripes) x replication))
+```text
+disk usage = file size x ((data stripes + recovery stripes) / data stripes) x replication
+effective capacity = raw capacity x (data stripes / ((data stripes + recovery stripes) x replication))
+```
+
+#### Reed-Solomon Cluster Layout

-#### Layout
 A minimum of 9 chunk servers is required for ideal fault tolerant chunk
 placement. This is because the supported Reed-Solomon encoding, *\*, has
 a block size of 9 (data stripe count 6 + recovery stripe count 3). As in
@@ -240,48 +254,54 @@
 encoding, some files may suffer data loss with even a single machine failure.
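+As a quick worked check of the space formulas above, consider the 6 MB file
+from the table, written with 6 data stripes, 3 recovery stripes, and
+replication 1:
+
+```text
+disk usage = 6 MB x ((6 + 3) / 6) x 1 = 9 MB
+effective capacity = raw capacity x (6 / ((6 + 3) x 1)) = 2/3 of raw capacity
+```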
![Reed Solomon encoding](images/Deployment-Guide/qfs-cluster-in-a-rack-rs.png) -#### Configuration -##### MetaServer.prp +#### Reed-Solomon Cluster Configuration - # port used by clients to connect to the metaserver - metaServer.clientPort 20000 +##### Reed-Solomon Cluster MetaServer.prp - # port used by chunk servers to connect to metaserver - metaServer.chunkServerPort = 30000 +```properties +# port used by clients to connect to the metaserver +metaServer.clientPort 20000 - # chunk placement groups by IP address or first three octets - metaServer.rackPrefixes = 192.168.1.1 1 192.168.1.2 2 192.168.1.3 3 192.168.1.4 4 192.168.1.5 5 192.168.1.6 6 192.168.1.7 7 192.168.1.8 8 192.168.1.9 9 +# port used by chunk servers to connect to metaserver +metaServer.chunkServerPort = 30000 - # create new file system if no transaction logs or checkpoints are found - metaServer.createEmptyFs = 1 +# chunk placement groups by IP address or first three octets +metaServer.rackPrefixes = 192.168.1.1 1 192.168.1.2 2 192.168.1.3 3 192.168.1.4 4 192.168.1.5 5 192.168.1.6 6 192.168.1.7 7 192.168.1.8 8 192.168.1.9 9 - # location to write transaction logs - metaServer.logDir = /home/qfsm/transaction_logs +# create new file system if no transaction logs or checkpoints are found +metaServer.createEmptyFs = 1 - # location to write checkpoints, this needs be pruned periodically - metaServer.cpDir = /home/qfsm/checkpoint +# location to write transaction logs +metaServer.logDir = /home/qfsm/transaction_logs - # unique cluster id - metaServer.clusterKey = my-fs-unique-identifier +# location to write checkpoints, this needs be pruned periodically +metaServer.cpDir = /home/qfsm/checkpoint -##### ChunkServer.prp +# unique cluster id +metaServer.clusterKey = my-fs-unique-identifier +``` - # IP address of the metaserver, host names should not be used - chunkServer.metaServer.hostname 192.168.0.1 +##### Reed-Solomon Cluster ChunkServer.prp - # metaserver port for chunk server to use - chunkServer.metaServer.port = 30000 +```properties +# IP address of the metaserver, host names should not be used +chunkServer.metaServer.hostname 192.168.0.1 - # chunk server client listener port - chunkServer.clientPort = 22000 +# metaserver port for chunk server to use +chunkServer.metaServer.port = 30000 - # locations to store chunk data, independent spindles should be used - chunkServer.chunkDir = /mnt/data0 /mnt/data1 +# chunk server client listener port +chunkServer.clientPort = 22000 - # unique cluster id - chunkServer.clusterKey = my-fs-unique-identifier +# locations to store chunk data, independent spindles should be used +chunkServer.chunkDir = /mnt/data0 /mnt/data1 + +# unique cluster id +chunkServer.clusterKey = my-fs-unique-identifier +``` + +#### Reed-Solomon Cluster Notes -#### Notes - DNS based host names are not supported, instead IPv4 addresses should be used. - The metaserver checkpoint directory (*metaServer.cpDir*) needs to be periodically pruned. Each checkpoint file will be approximately the size of @@ -291,6 +311,8 @@ encoding, some files may suffer data loss with even a single machine failure. servers for multiple QFS file systems, as long as each has its own metaserver and *clusterKey*. Advanced Cluster +## Large Scale Cluster + Here we discuss a larger scale QFS deployment, organized into racks with dedicated networking and power. 
Each rack hosts 22 chunk server nodes: @@ -300,7 +322,8 @@ There is also a head node rack to host the metaserver: ![Metaserver rack](images/Deployment-Guide/qfs-meta-server-rack.png) -### Layout +### Large Scale Cluster Layout + Racks are natural failure groups, as at any given time they could have isolated network or power failures. As such, racks make perfect chunk placement groups. As discussed earlier, the supported encoding is *\*, with 6 data and @@ -321,63 +344,84 @@ without a single file being lost. The thing to remember, however, is that drives are failing all the time; in all probability the system would only tolerate one or two racks out of service. Configuration -##### MetaServer.prp +### Large Cluster Configuration - # port used by clients to connect to the metaserver - metaServer.clientPort 20000 +#### MetaServer.prp - # port used by chunk servers to connect to the metaserver - metaServer.chunkServerPort = 30000 +```properties +# port used by clients to connect to the metaserver +metaServer.clientPort 20000 - # chunk placement groups by IP address or first three octets - metaServer.rackPrefixes = 192.168.1 1 192.168.2 2 192.168.3 3 192.168.4 4 192.168.5 5 192.168.6 6 192.168.7 7 192.168.8 8 192.168.9 9 +# port used by chunk servers to connect to the metaserver +metaServer.chunkServerPort = 30000 - # create new file system if no transaction logs or checkpoints are found - metaServer.createEmptyFs = 1 +# chunk placement groups by IP address or first three octets +metaServer.rackPrefixes = 192.168.1 1 192.168.2 2 192.168.3 3 192.168.4 4 192.168.5 5 192.168.6 6 192.168.7 7 192.168.8 8 192.168.9 9 - # location to write transaction logs - metaServer.logDir = /home/qfsm/transaction_logs +# create new file system if no transaction logs or checkpoints are found +metaServer.createEmptyFs = 1 - # location to write checkpoints, this needs be pruned periodically - metaServer.cpDir = /home/qfsm/checkpoint +# location to write transaction logs +metaServer.logDir = /home/qfsm/transaction_logs - # unique cluster id - metaServer.clusterKey = my-fs-unique-identifier +# location to write checkpoints, this needs be pruned periodically +metaServer.cpDir = /home/qfsm/checkpoint + +# unique cluster id +metaServer.clusterKey = my-fs-unique-identifier +``` ##### ChunkServer.prp - # address of the metaserver, host names should not be used - chunkServer.metaServer.hostname 192.168.0.1 +```properties +# address of the metaserver, host names should not be used +chunkServer.metaServer.hostname 192.168.0.1 + +# metaserver port for chunk server to use +chunkServer.metaServer.port = 30000 - # metaserver port for chunk server to use - chunkServer.metaServer.port = 30000 +# chunk server client listener port +chunkServer.clientPort = 22000 - # chunk server client listener port - chunkServer.clientPort = 22000 +# locations to store chunk data, independent spindles should be used +chunkServer.chunkDir = /mnt/data0 /mnt/data1 - # locations to store chunk data, independent spindles should be used - chunkServer.chunkDir = /mnt/data0 /mnt/data1 +# unique cluster id +chunkServer.clusterKey = my-fs-unique-identifier +``` - # unique cluster id - chunkServer.clusterKey = my-fs-unique-identifier +## FUSE -FUSE ----- You can use the `qfs_fuse` binary directly or via /etc/fstab. 1. Direct usage: - - Mount using `$ sudo ./qfs_fuse :20000 /mnt/qfs -o allow_other,ro` - - Unmount using `$ sudo umount /mnt/qfs` -1. 
Editing /etc/fstab to mount automatically at startup: - - Create a symlink to qfs\_fuse `$ ln -s /sbin/mount.qfs` - - Add the following line to /etc/fstab:`:20000 /mnt/qfs qfs ro,allow_other 0 0` + - Mount using + + ```sh + sudo ./qfs_fuse :20000 /mnt/qfs -o allow_other,ro + ``` + + - Unmount using + + ```sh + sudo umount /mnt/qfs + ``` + +2. Editing /etc/fstab to mount automatically at startup: + - Create a symlink to qfs_fuse + + ```sh + ln -s /sbin/mount.qfs + ``` + + - Add the following line to /etc/fstab: + `:20000 /mnt/qfs qfs ro,allow_other 0 0` Due to licensing issues, you can include FUSE only if it is licensed under LGPL or any other license that is compatible with Apache 2.0 license. +## Best Practices -Best Practices --------------- - Use a reliable service manager for both the meta and chunk servers such as [daemontools](http://cr.yp.to/daemontools.html). daemontools has the added benefit of log service management. @@ -396,8 +440,8 @@ Best Practices - Do not locate your metaserver head node in the same rack as your chunk servers. -Related Documents ------------------ +## Related Documents + - [[Administrator's Guide]] - [[Configuration Reference]] diff --git a/wiki/Developer-Documentation.md b/wiki/Developer-Documentation.md index d796c84c5..e7acedbe8 100644 --- a/wiki/Developer-Documentation.md +++ b/wiki/Developer-Documentation.md @@ -1,32 +1,34 @@ -Prerequisites -============= +# Developer Documentation + +## Prerequisites + To compile and run QFS you need to have the following software packages installed in your development system. -| RHEL/CentOS | Debian/Ubuntu | OS X | Cygwin | Notes | -|------------------|----------------------|---------|--------------------|-----------------------------------------------------------------------------------------------------------------------------------------------| -| `gcc-g++` | `gcc` | | `gcc-g++` | | -| `make` | `make` | | `make` | | -| `git` | `git` | | `git` | version 1.7.10 or higher | -| `cmake` | `cmake` | `cmake` | `cmake` | version 2.8.4 or higher | -| `maven` | `maven2` | `maven` | | version 3.0.3 or higher | -| `boost-devel` | `libboost-regex-dev` | `boost` | `libboost-devel` | version 1.3.4 or higher (for mac, may need to install boost with `'-no_single'` option if only the `/opt/local/lib/*-mt.dylib` are installed) | -| `krb5-devel` | `libkrb5-dev` | | `libkrb5-devel` | | -| `openssl-devel` | `libssl-dev` | | `openssl-devel` | | -| `python-devel` | `python-dev` | | | for python bindings | -| `fuse-devel` | `libfuse-dev` | | | for FUSE bindings | -| `java-openjdk` | `default-jdk` | | | for java access | -| `java-devel` | | | | for java access | -| `libuuid-devel` | | | | | -| | | `Xcode` | | | -| | | | `bzip2` | | -| | | | `autoconf` | | -| | | | `automake` | | -| | | | `libstdc++6-devel` | | - -Repository Organization -======================= -``` +| RHEL/CentOS | Debian/Ubuntu | OS X | Cygwin | Notes | +| --------------- | -------------------- | ------- | ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------- | +| `gcc-g++` | `gcc` | | `gcc-g++` | | +| `make` | `make` | | `make` | | +| `git` | `git` | | `git` | version 1.7.10 or higher | +| `cmake` | `cmake` | `cmake` | `cmake` | version 2.8.4 or higher | +| `maven` | `maven2` | `maven` | | version 3.0.3 or higher | +| `boost-devel` | `libboost-regex-dev` | `boost` | `libboost-devel` | version 1.3.4 or higher (for mac, may need to install boost with `'-no_single'` 
option if only the `/opt/local/lib/*-mt.dylib` are installed) | +| `krb5-devel` | `libkrb5-dev` | | `libkrb5-devel` | | +| `openssl-devel` | `libssl-dev` | | `openssl-devel` | | +| `python-devel` | `python-dev` | | | for python bindings | +| `fuse-devel` | `libfuse-dev` | | | for FUSE bindings | +| `java-openjdk` | `default-jdk` | | | for java access | +| `java-devel` | | | | for java access | +| `libuuid-devel` | | | | | +| | | `Xcode` | | | +| | | | `bzip2` | | +| | | | `autoconf` | | +| | | | `automake` | | +| | | | `libstdc++6-devel` | | + +## Repository Organization + +```text QFS top-level directory │ ├──── benchmarks @@ -73,8 +75,8 @@ Repository Organization └──── test-scripts (Scripts to test QFS servers and components) ``` -Compiling The Source -==================== +## Compiling The Source + The top-level Makefile automatically compiles QFS and generates the server binaries, client tools and client libraries. This section has information that gives you greater control over the compile process. This section also provides @@ -90,8 +92,8 @@ files in QFS. The mode of execution of a QFS client application is as follows: client library will fail-over to another chunk server that has the data; this fail-over is transparent to the application. -Compiling the C++ Code ------------------- +## Compiling the C++ Code + Compiling the C++ code produces the metaserver, chunkserver, client, and admin tool binaries. It also produces the C++ client library. We use **`cmake`** to build the C++ code. You can use the top-level Makefile as a wrapper around cmake @@ -101,6 +103,7 @@ Once the build is complete, you will find the build artifacts in the `build/debug` directory. ### Types of Builds + The default build type is a release build. You can execute a debug build by running `make BUILD_TYPE=debug CMAKE_OPTIONS=`. The build artifacts for this build will be available in the `build/debug` directory. @@ -112,18 +115,20 @@ compile flags of "-O2 -g"). Having binaries with debugging info in production simplifies debugging should a problem arise in a production environment. ### Verbose Build Output + To build with verbose output, use the environment variable `VERBOSE=true` to build. For example, use `VERBOSE=true make`. ### `make test` Targets + To run qfs tests, use `make test`. This will ensure that all core functionality is intact. Note that this test invokes the metaserver and chunk servers locally and performs various checks, it may take a couple of minutes to complete. If you are running this from a partition that is nearly full, the test may fail. Please refer to `maxSpaceUtilizationThreshold` in [[Configuration Reference]]. - ### Developing a C++ client + To develop a c++ client, see the sample code in the `examples/cc/qfssample_main.cc` file. The QFS client library API is defined in `src/cc/libclient/KfsClient.h`. @@ -134,8 +139,8 @@ dependencies. Note that default build will contain libraries built with the recommended that you use the libraries built with "Release" option with your applications. -Compiling Java Side -------------------- +## Compiling Java Side + Compile the Java code to get the QFS access jar (which contains the wrapper calls to native C++ via JNI; this allows Java apps to access files stored in QFS) and the Apache Hadoop QFS plugin jar. The Apache Hadoop QFS plugin includes @@ -146,6 +151,7 @@ Apache Maven is used to build Java jars. Use the top-level makefile (`make java`) to build the java jars. 
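+As a quick sketch of that workflow (an illustration only; it assumes a JDK and
+Apache Maven are on the `PATH`, and that you start at the top of the QFS
+source tree):
+
+```sh
+# Build the QFS access jar and the Hadoop plugin jars through the
+# top-level Makefile wrapper around Maven.
+make java
+
+# The access jar used by the Java example below should then appear
+# under build/qfs-access/.
+ls build/qfs-access/qfs-access*.jar
+```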
### Developing a Java Client + For Java applications, we use the JNI interface to get at the C++ QFS client library code from Java applications. One should refer to the Java client example at `examples/java/QfsSample.java`. The QFS Java client library API is @@ -158,28 +164,33 @@ addition, to execute the client, `build/release/lib` should be in the `LD_LIBRARY_PATH` (or `DYLD_LIBRARY_PATH`, if it is Mac OS X). To build, - $ cd ~/code/qfs/examples/java - $ qfsjar=`echo ../../build/qfs-access/qfs-access*.jar` - $ javac -classpath "$qfsjar" QfsSample.java +```sh +cd ~/code/qfs/examples/java && +qfsjar=`echo ../../build/qfs-access/qfs-access*.jar` && +javac -classpath "$qfsjar" QfsSample.java +``` To execute, - $ libdir="`cd ../../build/release/lib && pwd`" - $ export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${libdir}" - $ qfsjar=`echo ../../build/qfs-access/qfs-access*.jar` - $ java -Djava.library.path="$libdir" -classpath ".:$qfsjar" QfsSample 127.0.0.1 20000 +```sh +libdir="`cd ../../build/release/lib && pwd`" && +export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${libdir}" && +qfsjar=`echo ../../build/qfs-access/qfs-access*.jar` && +java -Djava.library.path="$libdir" -classpath ".:$qfsjar" QfsSample 127.0.0.1 20000 +``` + +## Compiling Python Side  -### Compiling Python Side ------------------------------------- Python applications can access QFS by using the python extension module. This section describes how to build and install the python extension module. To build the python module, use the command `make python`. If build succeeds, then python wheel `build/release/python-qfs/dist/qfs*.whl` will be created at the end. ### Developing a Python Client -Python applications use the python QFS extension module `qfs`. The example program -`examples/python/qfssample.py` illustrates how to write a Python client for QFS. -The module requires python 3.6 or later version. + +Python applications use the python QFS extension module `qfs`. The example +program `examples/python/qfssample.py` illustrates how to write a Python client +for QFS. The module requires Python 3.6 or later.  All required QFS libraries are included with QFS python module. Relative run linker paths are used in QFS shared libraries, therefore additional run time @@ -193,5 +204,3 @@ then run the example: `python examples/python/qfssample.py examples/python/qfssample.cfg` ![Quantcast](//pixel.quantserve.com/pixel/p-9fYuixa7g_Hm2.gif?labels=opensource.qfs.wiki) - -[gt]: https://github.com/google/googletest diff --git a/wiki/External-Resources.md b/wiki/External-Resources.md index de3188e9b..bd76d3d5d 100644 --- a/wiki/External-Resources.md +++ b/wiki/External-Resources.md @@ -1,6 +1,10 @@ -This page provides external links to the interesting work/resources that the community put together using QFS. -We believe that these links can be useful to everybody who want to get on-hands experience with QFS and learn more about it. -We thank the contributors of this list. Please create a pull request if you'd like to see a link to your work as well. +# QFS External Resources + +This page provides external links to interesting work and resources that the +community has put together using QFS. We believe these links can be useful to +everybody who wants to get hands-on experience with QFS and learn more about it. +We thank the contributors of this list. Please create a pull request if you'd +like to see a link to your work as well.
* [QFS on the ODROID XU4 Cluster](http://diybigdata.net/2016/07/quantcast-file-system-and-spark-on-odroid-cluster/) * [Notes on QFS design in Chinese](http://blog.csdn.net/lpstudy/article/details/51457250) diff --git a/wiki/Home.md b/wiki/Home.md index 8cf706191..73d4415f3 100644 --- a/wiki/Home.md +++ b/wiki/Home.md @@ -1,5 +1,5 @@ -Welcome to the QFS Wiki -================================================================================ +# Welcome to the QFS Wiki + Quantcast File System (QFS) is a high-performance, fault-tolerant, distributed file system developed to support MapReduce processing, or other applications reading and writing large files sequentially. This wiki provides various pieces @@ -28,8 +28,8 @@ pages is available on the right sidebar. - [[External Resources]] - [[QFS on S3]] -Updating the QFS Wiki ---------------------- +## Updating the QFS Wiki + The community is encouraged to update the QFS wiki as they wish. However, public editing has been turned off on the github wiki itself. Instead, the wiki documents are mirrored in the [`wiki`][wiki] directory of the QFS source code. diff --git a/wiki/Introduction-To-QFS.md b/wiki/Introduction-To-QFS.md index 6fc580197..3aa87f94a 100644 --- a/wiki/Introduction-To-QFS.md +++ b/wiki/Introduction-To-QFS.md @@ -1,5 +1,7 @@ -Background ----------- +# Introduction to QFS + +## Background + Hadoop and other batch processing frameworks run best against a file system designed for their data access pattern: sequential reads and sequential writes of large files of at least tens of megabytes, and often gigabytes. @@ -16,6 +18,7 @@ experimental as well as commercial projects. ![QFS Architecture](images/Architecture/qfs_architecture.png) QFS consists of 3 components: + - **Metaserver:** A central metadata server that manages the file system's directory structure and mappings of files to physical storage. - **Chunk Server:** The distributed component of the distributed file system. @@ -31,8 +34,8 @@ STL, and Boost libraries. The server components have been used in production on 64-bit x86 architectures running Linux CentOS 5 and 6, and the client library has been tested on CentOS 5 and 6, OSX 10.X, Cygwin, and Debian/Ubuntu. -QFS Features ------------- +## QFS Features + - **Incremental Scalability:** Chunk Servers can be added to the system in an incremental fashion. When a chunk server is added, it establishes a connection to the metaserver and becomes part of the system. No metaserver restarts are @@ -122,8 +125,8 @@ QFS Features node and communication failures by automatically determining set of connected and usable meta server nodes, and switching over to it. -Key Known Issues And Limitations --------------------------------- +## Key Known Issues And Limitations + - The maximum value for a file's degree of replication is 64 (assuming resources exist). - The metaserver currently does not replicate chunks whenever files become "hot" @@ -141,8 +144,8 @@ Key Known Issues And Limitations does not attempt to buffer the data to allow simultaneous read/write access to the same chunk. -References ----------- +## References + QFS builds upon some of the ideas outlined in the Google File System (GFS) paper [(SOSP 2003)](http://static.googleusercontent.com/external_content/untrusted_dlcp/research.google.com/en/us/archive/gfs-sosp2003.pdf). 
diff --git a/wiki/Migration-Guide.md b/wiki/Migration-Guide.md index 0bd0c2c4a..27dc805f5 100644 --- a/wiki/Migration-Guide.md +++ b/wiki/Migration-Guide.md @@ -1,23 +1,32 @@ +# QFS Migration Guide + This page provides information on how to use QFS with Apache Hadoop. -Obtaining the Hadoop QFS plugin -------------------------------- +## Obtaining the Hadoop QFS plugin + With QFS 1.0.1, we have simplified the integration of QFS with Apache Hadoop deployment. One only needs to copy the Hadoop QFS plugin jar and set the java library path in order to use QFS as the backing store for Hadoop. You can obtain the Hadoop QFS plugin jar in one of two ways: + - If there is a [[QFS binary tarball|Binary-Distributions]] for your platform, then the tarball already has the jars and native libraries. - Obtain the QFS tarball, say, $QFSTAR.tgz - $ tar -xvzf $QFSTAR.tgz && cd $QFSTAR - $ ls -1 lib/hadoop*.jar + Obtain the QFS tarball, say, `$QFSTAR.tgz` + + ```sh + tar -xvzf "$QFSTAR".tgz && cd "$QFSTAR" && + ls -1 lib/hadoop*.jar + ``` + + ```console lib/hadoop-0.23.4-qfs-1.0.1.jar lib/hadoop-1.0.2-qfs-1.0.1.jar lib/hadoop-1.0.4-qfs-1.0.1.jar lib/hadoop-1.1.0-qfs-1.0.1.jar lib/hadoop-2.0.2-alpha-qfs-1.0.1.jar + ``` - If there is no pre-built tarball for your platform, you could obtain the QFS source and build the tarball yourself. As long as you have the pre-requisite @@ -25,17 +34,20 @@ You can obtain the Hadoop QFS plugin jar in one of two ways: to run `make tarball` from the QFS source directory, and this will produce the $QFSTAR.tgz in the build directory. -Using the Hadoop QFS plugin with your Hadoop deployment -------------------------------------------------------- +## Using the Hadoop QFS plugin with your Hadoop deployment + When the Hadoop QFS jar is in your class path and the QFS native libraries are loadable, accessing QFS from Hadoop is as simple as, - $ cd ${HADOOP_HOME} - $ bin/hadoop fs -Dfs.qfs.impl=com.quantcast.qfs.hadoop.QuantcastFileSystem \ - -Dfs.default.name=qfs://localhost:20000 \ - -Dfs.qfs.metaServerHost=localhost \ - -Dfs.qfs.metaServerPort=20000 \ - -ls / +```sh +cd "${HADOOP_HOME}" && +bin/hadoop fs \ + -Dfs.qfs.impl=com.quantcast.qfs.hadoop.QuantcastFileSystem \ + -Dfs.default.name=qfs://localhost:20000 \ + -Dfs.qfs.metaServerHost=localhost \ + -Dfs.qfs.metaServerPort=20000 \ + -ls / +``` In the example above, the sample QFS metaserver (see [[Getting Started|Home]]) listens on port 20000 on `localhost`. @@ -57,35 +69,40 @@ setting `fs.qfs.impl` to `com.quantcast.qfs.hadoop.QuantcastFileSystem` and `fs.default.name` (or `fs.defaultFS`, in newer Hadoop versions) to `qfs://:`, as shown in the example above. -Migrating HDFS Data to QFS --------------------------- +## Migrating HDFS Data to QFS + If you have existing data in HDFS that you want to copy to QFS in order to use QFS as your backing store, you could run a distributed copy provided by Apache Hadoop. In the following example, `namehost:8020` is the host name and port number of the namenode of an HDFS instance and `metahost:20000` is the -corresponding location of a QFS metaserver.\\ - - $ cd ${HADOOP_HOME} - $ bin/hadoop distcp -Dfs.qfs.impl=com.quantcast.qfs.hadoop.QuantcastFileSystem \ - -Dfs.default.name=qfs://localhost:20000 \ - -Dfs.qfs.metaServerHost=localhost \ - -Dfs.qfs.metaServerPort=20000 \ - hdfs://localhost:8020/hdfs_dir/70MFile qfs://localhost:20000/qfs_dir/70Mcopy +corresponding location of a QFS metaserver. 
+ +```sh +cd "${HADOOP_HOME}" && +bin/hadoop distcp \ + -Dfs.qfs.impl=com.quantcast.qfs.hadoop.QuantcastFileSystem \ + -Dfs.default.name=qfs://localhost:20000 \ + -Dfs.qfs.metaServerHost=localhost \ + -Dfs.qfs.metaServerPort=20000 \ + hdfs://localhost:8020/hdfs_dir/70MFile qfs://localhost:20000/qfs_dir/70Mcopy +``` Note that this is a map-reduce job, and therefore there should be job trackers and task trackers available to do the distibuted copy. -Submitting Jobs that use QFS ----------------------------- +## Submitting Jobs that use QFS + If you want to submit a job to Apache Hadoop that would use QFS, you could follow this example: - $ cd ${HADOOP_HOME} - $ bin/hadoop jar hadoop-examples-1.0.3.jar randomwriter \ - -Dfs.qfs.impl=com.quantcast.qfs.hadoop.QuantcastFileSystem \ - -Dfs.default.name=qfs://metahost:20000 \ - -Dfs.qfs.metaServerHost=metahost \ - -Dfs.qfs.metaServerPort=20000 \ - /tmp/randomOut +```sh +cd "${HADOOP_HOME}" && +bin/hadoop jar hadoop-examples-1.0.3.jar randomwriter \ + -Dfs.qfs.impl=com.quantcast.qfs.hadoop.QuantcastFileSystem \ + -Dfs.default.name=qfs://metahost:20000 \ + -Dfs.qfs.metaServerHost=metahost \ + -Dfs.qfs.metaServerPort=20000 \ + /tmp/randomOut +``` ![Quantcast](//pixel.quantserve.com/pixel/p-9fYuixa7g_Hm2.gif?labels=opensource.qfs.wiki) diff --git a/wiki/Performance-Comparison-to-HDFS.md b/wiki/Performance-Comparison-to-HDFS.md index 070f032a4..b32884f59 100644 --- a/wiki/Performance-Comparison-to-HDFS.md +++ b/wiki/Performance-Comparison-to-HDFS.md @@ -1,18 +1,20 @@ +# Performance Comparison to HDFS + First off, QFS gives you 50% space reduction over 3-way replication in HDFS, through erasure coding. At the same time it increases the number of simultaneous failures tolerated from 2 to 3. Moreover, it's significantly faster at scale, for the workload we tested on. -| Full Benchmark Results | MStress Benchmark Results | -| ---------------------- | ------------------------- | +| Full Benchmark Results | MStress Benchmark Results | +| ------------------------------------------------------------------ | --------------------------------------------------------------------- | | ![Full Benchmark Results](images/Benchmarking/write-read-sort.png) | ![MStress Benchmark Results](images/Benchmarking/mstress-results.png) | -| Aggregate network bandwidth during one round of tests | -| ----------------------------------------------------- | +| Aggregate network bandwidth during one round of tests | +| -------------------------------------------------------------------------------- | | ![Cluster behavior during write/read tests](images/Benchmarking/throughputs.png) | -Read and Write Throughput Benchmarks ------------------------------------- +## Read and Write Throughput Benchmarks + We ran **read** and **write** benchmarks on 20 TB of uncompressed data in two configurations: Hadoop+HDFS and Hadoop+QFS. In each configuration we ran two tests, a 20 TB **read** test and a 20 TB **write** test. We used replication 3 @@ -32,8 +34,8 @@ At that point Hadoop+QFS ran faster as workers read concurrently from 6 chunk servers due to striping, whereas Hadoop+HDFS read from a single chunk server and was bound by the speed of a single disk. -Metaserver Benchmarks ---------------------- +## Metaserver Benchmarks + The QFS metaserver (name node in HDFS) is the core of the file system. Its responsiveness and scalability affect the throughput and scalability of the whole cluster. At Quantcast we see production metaservers handling up to 100,000 @@ -46,8 +48,8 @@ whereas QFS does not. 
QFS had better throughput overall, and particularly so for **create** and **stat** operations. It completed the same workload faster and with less CPU consumption. -Small Print ------------ +## Small Print + Although we did use HDFS for years, we know QFS better. We don't claim that the results are representative for QFS or HDFS performance in general. These are simply the results that we measured in good faith on our production platform. diff --git a/wiki/QFS-Client-Reference.md b/wiki/QFS-Client-Reference.md index 9ccfb0c91..b338444fe 100644 --- a/wiki/QFS-Client-Reference.md +++ b/wiki/QFS-Client-Reference.md @@ -1,3 +1,5 @@ +# QFS Client Reference + This page provides additional information on QFS best practices and how to use the QFS C++/Java Client API and command line tools. @@ -67,7 +69,9 @@ of tiers. Tier settings can be specified in metaserver configuration file by `chunkServer.storageTierPrefixes` field. For example, - chunkServer.storageTierPrefixes = /mnt/ram 10 /mnt/flash 11 +```properties +chunkServer.storageTierPrefixes = /mnt/ram 10 /mnt/flash 11 +``` tells the metaserver that the devices that are mounted to /mnt/ram and /mnt/flash in chunkservers belong to tiers 10 and 11, respectively. Then, a hot @@ -90,7 +94,9 @@ file and specify the min and max tier values during creation of the new copy. Invoking the *qfs* command-line tool like below displays which chunkservers contain the data between \th and \th bytes in a file: - qfs -fs -dloc +```sh +qfs -fs -dloc +``` ### What is the best way to handle non-uniformity in storage nodes caused by differing disk counts and disk capacities? @@ -122,115 +128,131 @@ large. ## QFS Client Properties -Throughout the related text, we refer to two values; write-stride and read-stride. -write-stride is defined as _(number of data stripes + number of recovery stripes) * stripe size_, -whereas read-stride is defined as _number of data stripes * stripe size_. +Throughout the related text, we refer to two values; write-stride and +read-stride. write-stride is defined as *(number of data stripes + number of +recovery stripes) * stripe size*, whereas read-stride is defined as *number of +data stripes * stripe size*. ### File Properties -* *ioBufferSize:* Serves as write-behind threshold and governs when buffered data -gets actually written. During file creation/opening, QFS client sets _ioBufferSize_ -to _defaultIOBufferSize_ (see _defaultIOBufferSize_ below for details). During file -creation/opening, QFS client ensures that _ioBufferSize_ for Reed-Solomon files does -not go below (_number of data stripes_+_number of recovery stripes_)\*_targetDiskIoSize_ -and it is write-stride size aligned. After file creation/opening, users can overwrite -the value set by QFS client by calling `KfsClient::SetIoBufferSize(int fd, size_t size)`. -Note that `KfsClient::SetIoBufferSize(int fd, size_t size)` does not have an effect on +* *ioBufferSize:* Serves as write-behind threshold and governs when buffered +data gets actually written. During file creation/opening, QFS client sets +*ioBufferSize* to *defaultIOBufferSize* (see *defaultIOBufferSize* below for +details). During file creation/opening, QFS client ensures that *ioBufferSize* +for Reed-Solomon files does not go below (*number of data stripes*+*number of +recovery stripes*)\**targetDiskIoSize* and it is write-stride size aligned. +After file creation/opening, users can overwrite the value set by QFS client by +calling `KfsClient::SetIoBufferSize(int fd, size_t size)`. 
Note that +`KfsClient::SetIoBufferSize(int fd, size_t size)` does not have an effect on previously submitted write requests. -* *readAheadBufferSize:* Defines the minimum number of bytes read from a file regardless -of the actual number of bytes that a read call intents to read.  During file -creation/opening, QFS client automatically sets _readAheadBufferSize_ to _defaultReadAheadBufferSize_ -(see _defaultReadAheadBufferSize_ below for details). During file creation/opening, QFS client -ensures that _readAheadBufferSize_ for Reed-Solomon files does not go below _number of -data stripes_\*_targetDiskIoSize_ and it is read-stride size aligned. After file -creation/opening, users can overwrite the value set by QFS client by -calling `KfsClient::SetReadAheadSize(int fd, size_t size)`. Note that -`KfsClient::SetReadAheadSize(int fd, size_t size)` does not have an effect -on previously submitted read requests. - -* *diskIOReadSize:* Defines the maximum number of bytes read from a file each time -data is received from a chunk server. Consequently, it also controls the size of -the disk IO read operation at the chunk server -- how many bytes that the chunk server -reads from the underlying storage device at each access. Currently, it is set to -_maxReadSize_ (see below _maxReadSize_ for details) for all types of read operations -(regular read, read-ahead, prefetch read). +* *readAheadBufferSize:* Defines the minimum number of bytes read from a file +regardless of the actual number of bytes that a read call intents to read. + During file creation/opening, QFS client automatically sets +*readAheadBufferSize* to *defaultReadAheadBufferSize* (see +*defaultReadAheadBufferSize* below for details). During file creation/opening, +QFS client ensures that *readAheadBufferSize* for Reed-Solomon files does not go +below *number of data stripes*\**targetDiskIoSize* and it is read-stride size +aligned. After file creation/opening, users can overwrite the value set by QFS +client by calling `KfsClient::SetReadAheadSize(int fd, size_t size)`. Note that +`KfsClient::SetReadAheadSize(int fd, size_t size)` does not have an effect on +previously submitted read requests. + +* *diskIOReadSize:* Defines the maximum number of bytes read from a file each +time data is received from a chunk server. Consequently, it also controls the +size of the disk IO read operation at the chunk server -- how many bytes that +the chunk server reads from the underlying storage device at each access. +Currently, it is set to *maxReadSize* (see below *maxReadSize* for details) for +all types of read operations (regular read, read-ahead, prefetch read). * *diskIOWriteSize:* Defines the maximum number of bytes written to a file each time data is sent to a chunk server. Consequently, it also controls the size of the disk IO write operation at the chunk server -- how many bytes that the chunk -server writes to the underlying storage device at each access. For 3x Replication -files, _diskIOWriteSize_ is set to _ioBufferSize_, whereas for Reed-Solomon files -it is set to _ioBufferSize_ / _(number of data stripes+number of recovery stripes)_ -and it is checksum block size aligned. Note that _diskIOWriteSize_ can’t go beyond -4MB or QFS client’s global _maxWriteSize_ (see _maxWriteSize_ below for details). +server writes to the underlying storage device at each access. 
For 3x +Replication files, *diskIOWriteSize* is set to *ioBufferSize*, whereas for +Reed-Solomon files it is set to *ioBufferSize* / *(number of data stripes+number +of recovery stripes)* and it is checksum block size aligned. Note that +*diskIOWriteSize* can’t go beyond 4MB or QFS client’s global *maxWriteSize* (see +*maxWriteSize* below for details). ### Global Client Properties -* *targetDiskIoSize*: Ensures a minimum value for _ioBufferSize_ and _readAheadBufferSize_ -of a Reed-Solomon file during file creation/opening (see _ioBufferSize_ and -_readAheadBufferSize_ above for details), so that size of each disk IO for reads/writes -in a chunk server satisfies the target value. Users can set _targetDiskIoSize_ during -QFS client initialization by setting QFS_CLIENT_CONFIG environment variable to -client.targetDiskIoSize=\. Otherwise, _targetDiskIoSize_ is set to 1MB. - -* *defaultIOBufferSize:* Used to set _ioBufferSize_ of a file during file creation/opening -(see _ioBufferSize_ above for details). When necessary conditions are satisfied, it is also -used to set _defaultReadAheadBufferSize_ (see _defaultReadAheadBufferSize_ below for details). -Users can set _defaultIOBufferSize_ during QFS client initialization by setting QFS_CLIENT_CONFIG -environment variable to client.defaultIoBufferSize=\. Note that if users don’t provide -a value or the provided value is less than checksum block size (64KB), _defaultIOBufferSize_ -is set to _max(1MB, targetDiskIoSize)_. Once QFS client is initialized, users can overwrite -the value of _defaultIOBufferSize_ by calling `KfsClient::SetDefaultIoBufferSize(size_t size)`. -Note that `KfsClient::SetDefaultIoBufferSize(size_t size)` will not have an effect on already -created or opened files. - -* *defaultReadAheadBufferSize:* Used to set _readAheadBufferSize_ of a file during file -creation/opening (see _readAheadBufferSize_ above for details). Users can set -_defaultReadAheadBufferSize_ during QFS client initialization by setting QFS_CLIENT_CONFIG -environment variable to client.defaultReadAheadBufferSize=\. Note that _defaultReadAheadBufferSize_ -is set to _defaultIOBufferSize_, if users don’t provide a value and _defaultIOBufferSize_ -is greater than checksum block size (64KB). Otherwise, it is set to 1MB. Once QFS client is -initialized, users can overwrite the value of _defaultReadAheadBufferSize_ by calling `KfsClient::SetDefaultReadAheadSize(size_t size)`. -Note that `KfsClient::SetDefaultReadAheadSize(size_t size)` -will not have an effect on already created or opened files. - -* *maxReadSize:* Provides a maximum value for _diskIOReadSize_ of a file. Users can set _maxReadSize_ -during QFS client initialization by setting QFS_CLIENT_CONFIG environment variable to -client.maxReadSize=\. If users don’t provide a value or the provided value is less -than the checksum block size (64KB), _maxReadSize_ is set to _max(4MB, targetDiskIoSize)_. - -* *maxWriteSize:* Provides a maximum value for _diskIOWriteSize_ of a file. Users can set -_maxWriteSize_ during QFS client initialization by setting QFS_CLIENT_CONFIG environment -variable to client.maxWriteSize=\_._ If users don’t provide a value, -_maxWriteSize_ is set to _targetDiskIoSize_. - -* *randomWriteThreshold:* Users can set _randomWriteThreshold_ during QFS client -initialization by setting QFS_CLIENT_CONFIG environment variable to client.randomWriteThreshold=\. -If users don’t provide a value, _randomWriteThreshold_ is set to _maxWriteSize_ -(if provided in the environment variable). 
- -* *connectionPool*: A flag that tells whether a chunk server connection pool should -be used by QFS client. This is used to reduce the number of chunk server connections -and presently used only with radix sort with write append. Users can set -_connectionPool_ during QFS client initialization by setting QFS_CLIENT_CONFIG -environment variable to client.connectionPool=\. Default value is false. - -* *fullSparseFileSupport*: A flag that tells whether the filesystem might be hosting -sparse files. When it is set, a short read operation does not produce an error, but -instead is accounted as a read on a sparse file. Users can set _fullSparseFileSupport_ -during QFS client initialization by setting QFS_CLIENT_CONFIG environment variable to -client.fullSparseFileSupport=\. Once QFS client is initialized, users can -change the current value by calling `KfsClient::SetDefaultFullSparseFileSupport(bool flag)`. +* *targetDiskIoSize*: Ensures a minimum value for *ioBufferSize* and +*readAheadBufferSize* of a Reed-Solomon file during file creation/opening (see +*ioBufferSize* and *readAheadBufferSize* above for details), so that size of +each disk IO for reads/writes in a chunk server satisfies the target value. +Users can set *targetDiskIoSize* during QFS client initialization by setting +QFS_CLIENT_CONFIG environment variable to `client.targetDiskIoSize=`. +Otherwise, *targetDiskIoSize* is set to 1MB. + +* *defaultIOBufferSize:* Used to set *ioBufferSize* of a file during file +creation/opening (see *ioBufferSize* above for details). When necessary +conditions are satisfied, it is also used to set *defaultReadAheadBufferSize* +(see *defaultReadAheadBufferSize* below for details). Users can set +*defaultIOBufferSize* during QFS client initialization by setting +QFS_CLIENT_CONFIG environment variable to `client.defaultIoBufferSize=`. +Note that if users don’t provide a value or the provided value is less than +checksum block size (64KB), *defaultIOBufferSize* is set to *max(1MB, +targetDiskIoSize)*. Once QFS client is initialized, users can overwrite the +value of *defaultIOBufferSize* by calling +`KfsClient::SetDefaultIoBufferSize(size_t size)`. Note that +`KfsClient::SetDefaultIoBufferSize(size_t size)` will not have an effect on +already created or opened files. + +* *defaultReadAheadBufferSize:* Used to set *readAheadBufferSize* of a file +during file creation/opening (see *readAheadBufferSize* above for details). +Users can set *defaultReadAheadBufferSize* during QFS client initialization by +setting QFS_CLIENT_CONFIG environment variable to +`client.defaultReadAheadBufferSize=`. +Note that *defaultReadAheadBufferSize* is set to *defaultIOBufferSize*, if users +don’t provide a value and *defaultIOBufferSize* is greater than checksum block +size (64KB). Otherwise, it is set to 1MB. Once QFS client is initialized, users +can overwrite the value of *defaultReadAheadBufferSize* by calling +`KfsClient::SetDefaultReadAheadSize(size_t size)`. +Note that `KfsClient::SetDefaultReadAheadSize(size_t size)` will not have an +effect on already created or opened files. + +* *maxReadSize:* Provides a maximum value for *diskIOReadSize* of a file. Users +can set *maxReadSize* during QFS client initialization by setting +QFS_CLIENT_CONFIG environment variable to `client.maxReadSize=`. If users +don’t provide a value or the provided value is less than the checksum block size +(64KB), *maxReadSize* is set to *max(4MB, targetDiskIoSize)*. 
+ +* *maxWriteSize:* Provides a maximum value for *diskIOWriteSize* of a file. +Users can set *maxWriteSize* during QFS client initialization by setting +QFS_CLIENT_CONFIG environment variable to `client.maxWriteSize=`*.* If +users don’t provide a value, *maxWriteSize* is set to *targetDiskIoSize*. + +* *randomWriteThreshold:* Users can set *randomWriteThreshold* during QFS client +initialization by setting QFS_CLIENT_CONFIG environment variable to +`client.randomWriteThreshold=`. If users don’t provide a value, +*randomWriteThreshold* is set to *maxWriteSize* (if provided in the environment +variable). + +* *connectionPool*: A flag that tells whether a chunk server connection pool +should be used by QFS client. This is used to reduce the number of chunk server +connections and presently used only with radix sort with write append. Users can +set *connectionPool* during QFS client initialization by setting +QFS_CLIENT_CONFIG environment variable to `client.connectionPool=`. Default value is false. +* *fullSparseFileSupport*: A flag that tells whether the filesystem might be +hosting sparse files. When it is set, a short read operation does not produce an +error, but instead is accounted as a read on a sparse file. Users can set +*fullSparseFileSupport* during QFS client initialization by setting +QFS_CLIENT_CONFIG environment variable to +`client.fullSparseFileSupport=`. Once QFS client is initialized, users +can change the current value by calling +`KfsClient::SetDefaultFullSparseFileSupport(bool flag)`. Default value is false. + ## Read and Write Functions ### `KfsClient::Read(int fd, char* buf, size_t numBytes)` -Used for blocking reads with a read-ahead logic managed by QFS client. + +Used for blocking reads with a read-ahead logic managed by QFS client. QFS client performs the following steps in order. -* Checks if the current read call could be served from an +* Checks if the current read call could be served from an ongoing prefetch read. The number of remaining bytes to read is updated accordingly. @@ -239,18 +261,19 @@ the existing content in the read-ahead buffer. The number of remaining bytes to read is updated accordingly. * Next, if the number of remaining bytes is sufficiently less -than _read-ahead buffer size_, a new read-ahead operation is issued +than *read-ahead buffer size*, a new read-ahead operation is issued and the remaining bytes to read for the current read call are served from the new content of the read-ahead buffer. Note that QFS client allocates an additional buffer for read-ahead operations on a file. * If previous step is skipped, QFS client creates a regular blocking -read operation. Note that it does not make a copy of the source buffer. +read operation. Note that it does not make a copy of the source buffer. Once the read is over, it issues a new read-ahead operation for subsequent read calls on the same file. This read-ahead operation is performed in a non-blocking fashion. ### `KfsClient::ReadPrefetch(int fd, char* buf, size_t numBytes)` + Used for non-blocking reads in which user provides a prefetch buffer. QFS client does not make a copy of the prefetch buffer, so users should ensure that the provided prefetch buffer is not used until the @@ -259,45 +282,48 @@ a blocking read call. If data prefetched is less than what is asked in blocking read, the blocking read call will read the remaining data. ### `KfsClient::WriteAsync(int fd, const char* buf, size_t numBytes)` + *Note:* This mode of write is yet to be fully supported. 
Used for non-blocking writes. QFS client doesn’t make a copy of the user provided source buffer, so user should not use the source buffer until the write gets completed. Completion handling is done by invoking -`KfsClient::WriteAsyncCompletionHandler(int fd)`. This function will wait +`KfsClient::WriteAsyncCompletionHandler(int fd)`. This function will wait until all of the non-blocking write requests on that file complete. ### `KfsClient::Write(int fd, const char* buf, size_t numBytes)` + Used for blocking writes. However, how QFS client actually performs the write depends on 1) the number of bytes that we want to write with the -current call (denoted as _numBytes_ below), 2) the number of pending -bytes to be written from previous calls (denoted as _pending_ below) and +current call (denoted as *numBytes* below), 2) the number of pending +bytes to be written from previous calls (denoted as *pending* below) and 3) write-behind threshold. Following two cases are possible. -* _numBytes + pending < write-behind threshold_: QFS client makes a +* *numBytes + pending < write-behind threshold*: QFS client makes a copy of the source buffer. Write operation is delayed until the number of pending bytes (including the bytes from the current write call) exceeds -_write-behind threshold_ by subsequent write calls or until user calls +*write-behind threshold* by subsequent write calls or until user calls `KfsClient::Sync(int fd)`. -* _numBytes + pending >= write-behind threshold_: QFS client makes -a copy of the source buffer only if _write-behind threshold_ is greater than +* *numBytes + pending >= write-behind threshold*: QFS client makes +a copy of the source buffer only if *write-behind threshold* is greater than zero. Write is performed in a blocking fashion. ## Append Operations ### RecordAppend -RecordAppend can be used by a single writer to append to a replicated file. There is -no support for append to RS files or object store files. +RecordAppend can be used by a single writer to append to a replicated file. +There is no support for append to RS files or object store files. The file should be opened using the mode O_APPEND or O_WRONLY | O_APPEND. -When RecordAppend is used to write a record, the entire record is guaranteed to be -written to the same chunk. If the record will not fit in the current chunk, a new chunk -is started. Note that this will generally create a sparse file since the new chunk will -start at the next chunk boundary location. +When RecordAppend is used to write a record, the entire record is guaranteed to +be written to the same chunk. If the record will not fit in the current chunk, a +new chunk is started. Note that this will generally create a sparse file since +the new chunk will start at the next chunk boundary location. For example, using default settings, the sequence + ```cpp char data[] = { '1', '2', '3' }; int fd = client->Open(filename, O_CREAT | O_EXCL); @@ -309,24 +335,28 @@ client->RecordAppend(fd, &data[1], 1); client->RecordAppend(fd, &data[2], 1); client->Close(fd); ``` -will likely result in a file with 2 chunks, each replicated 3 times, and with a size of -134217728 bytes. -A chunk can be in two states - "unstable" (dirty/nonreadable), and "stable" (read only). -The metaserver handles the transition of a chunk from unstable to stable and this process -may take some time. +will likely result in a file with 2 chunks, each replicated 3 times, and with a +size of 134217728 bytes. 
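+A short worked check of that size, using the offsets that the read example
+below prints (the 64 MiB figure is the default chunk size implied by those
+offsets, not a value set anywhere in this example):
+
+```text
+record '1' at offset 0                  (first byte of chunk 0)
+record '2' at offset 67108864 (64 MiB)  (first byte of chunk 1)
+record '3' at offset 67108865
+reported size: 2 * 67108864 = 134217728 bytes, of which only 3 bytes
+are data; the rest of the file is holes
+```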
+ +A chunk can be in two states - "unstable" (dirty/nonreadable), and "stable" +(read only). The metaserver handles the transition of a chunk from unstable to +stable and this process may take some time.  For more information, see src/cc/chunk/AtomicRecordAppender.cc  -When a file is opened for append, any chunk that is written will be unstable for a while even -if the file is closed. If the file is reopened for append, then new data may be appended to -an existing unstable chunk rather than creating a new chunk. +When a file is opened for append, any chunk that is written will be unstable for +a while even if the file is closed. If the file is reopened for append, then new +data may be appended to an existing unstable chunk rather than creating a new +chunk.  -An attempt to read a chunk that is not stable will stall the read until the chunk is stable. +An attempt to read a chunk that is not stable will stall the read until the +chunk is stable.  -Reading the above file will result in a short read error unless sparse file support is -enabled. This can be accomplished by calling `KfsClient::SetDefaultFullSparseFileSupport(bool flag)` -to enable it for all files or by calling `KfsClient::SetFullSparseFileSupport(int fd, bool flag)` -for an open file before doing any reads. +Reading the above file will result in a short read error unless sparse file +support is enabled. This can be accomplished by calling +`KfsClient::SetDefaultFullSparseFileSupport(bool flag)` to enable it for all +files or by calling `KfsClient::SetFullSparseFileSupport(int fd, bool flag)` for +an open file before doing any reads.  Using  @@ -341,28 +371,38 @@ for(int i=0; iClose(fd); ``` +  would output  - Byte at offset 0 is 31 - Byte at offset 67108864 is 32 - Byte at offset 67108865 is 33 - Read 134217728 bytes +```console +Byte at offset 0 is 31 +Byte at offset 67108864 is 32 +Byte at offset 67108865 is 33 +Read 134217728 bytes +```  -`KfsClient::SkipHolesInFile(int fd)` can be used to both indicate spare files support and to -request that the holes in the files be skipped. Calling SkipHolesInFile instead of -SetFullParseFileSupport in the above code would output +`KfsClient::SkipHolesInFile(int fd)` can be used to both indicate sparse file +support and to request that the holes in the files be skipped. Calling +SkipHolesInFile instead of SetFullSparseFileSupport in the above code would +output  - Byte at offset 0 is 31 - Byte at offset 1 is 32 - Byte at offset 2 is 33 - Read 3 bytes +```console +Byte at offset 0 is 31 +Byte at offset 1 is 32 +Byte at offset 2 is 33 +Read 3 bytes +```  ### Writes with O_APPEND -If a file is opened with O_APPEND, then a Write behaves the same as a RecordAppend. + +If a file is opened with O_APPEND, then a Write behaves the same as a +RecordAppend.  ### Emulating a traditional append -To do a traditional append to end of file with a single writer, open the file for write then -seek to the end of file before writing. + +To do a traditional append to end of file with a single writer, open the file +for write then seek to the end of file before writing. + ```cpp char data[] = { '1', '2', '3' }; int fd = client->Open(filename, O_CREAT | O_EXCL); @@ -375,10 +415,13 @@ client->Write(fd, &data[1], 1); client->Write(fd, &data[2], 1); client->Close(fd); ``` + +will result in a file with a single chunk replicated 3 times and a size of 3 +bytes.
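+For contrast with the sparse RecordAppend file earlier on this page, the file
+written above can be read back with no sparse-file settings at all. A minimal
+sketch (it reuses `client` and `filename` from the example above, and assumes
+`O_RDONLY` is accepted by `KfsClient::Open` and that `Read` returns the byte
+count, mirroring the calls shown elsewhere on this page):
+
+```cpp
+int rfd = client->Open(filename, O_RDONLY);
+char buf[3];
+// The file has no holes, so a plain blocking read returns all 3 bytes
+// without SetFullSparseFileSupport or SkipHolesInFile.
+ssize_t nread = client->Read(rfd, buf, sizeof(buf));
+client->Close(rfd);
+```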
### AtomicRecordAppend + AtomicRecordAppend can be used by multiple writers to append to a file. -Please see the comments on src/cc/chunk/AtomicRecordAppender.cc and src/cc/meta/LayoutManager.cc -for more information. +Please see the comments on src/cc/chunk/AtomicRecordAppender.cc and +src/cc/meta/LayoutManager.cc for more information. diff --git a/wiki/QFS-Kerberos-Security-Design.md b/wiki/QFS-Kerberos-Security-Design.md index 60dc11caf..b3d88f207 100644 --- a/wiki/QFS-Kerberos-Security-Design.md +++ b/wiki/QFS-Kerberos-Security-Design.md @@ -1,14 +1,18 @@ +# QFS Kerberos Security Design + ## Introduction + The security scope includes the following concepts with respect to the network communication between hosts in the distributed QFS file system: 1. Authentication -1. Authorization -1. Integrity -1. Confidentiality -1. Authentication delegation +2. Authorization +3. Integrity +4. Confidentiality +5. Authentication delegation ## Authentication + Authentication is attained by using Kerberos 5 system, asymmetric key cryptography with X509 certificates, or TLS-PSK. TLS-PSK is used to attain Integrity and Confidentiality with Kerberos and QFS authentication delegation @@ -17,17 +21,17 @@ methods. 1. Depending on the specifics of the particular run time and environment, and the desired protection level, Authentication, Integrity, and Confidentiality can be selected by the QFS configuration. -1. The minimum security level is authentication only between individual chunk +2. The minimum security level is authentication only between individual chunk servers, and between chunk servers and the clients. This level assumes that the network communication between QFS client and QFS nodes cannot be tampered with (no man in the middle attack is possible), or observed by a 3rd party. This could be a reasonable assumption for a "private" cluster network. -1. Integrity and Confidentiality might be turned off for communications between +3. Integrity and Confidentiality might be turned off for communications between client and chunk servers, as well as between chunk servers, in order to reduce CPU utilization. With authentication enabled, turning off integrity - and confidentiality between between the client and meta server is not + and confidentiality between the client and meta server is not supported. -1. It should be noted that Kerberos 5 system relies on the host file system +4. It should be noted that Kerberos 5 system relies on the host file system access control and therefore any unauthorized unrestricted (root) access to one of the hosts in the Kerberos 5 system would constitute a major security breach, and can void authentication, integrity, and confidentiality. In this @@ -35,90 +39,91 @@ methods. ("kerberized") service designs. For this reason and to enforce local (host) file system access control the map / reduce tasks need to run under [delegated] user account started the job. -1. Typically meta and chunk servers each will have their own unique (host +5. Typically meta and chunk servers each will have their own unique (host specific) Kerberos 5 service keys used for mutual authentication between QFS client, meta and chunk servers, and, in turn, if configured, to ensure integrity and confidentiality. Customarily such service keys are stored in the Kerberos 5 KDC database, and in the corresponding host specific "key tab" files. The distinct chunk server Kerberos principals are required for “revocation” / "blacklisting" to work. -1. 
With X509 authentication method typically meta server and chunk servers each +6. With X509 authentication method typically meta server and chunk servers each have their own X509 certificate with distinct “common name”, issued by typically the same trusted “authority”, and the corresponding private key. The distinct common names are required for “revocation” / “blacklisting” to work. -1. QFS uses standard Kerberos 5 mechanisms to obtain the necessary Kerberos 5 +7. QFS uses standard Kerberos 5 mechanisms to obtain the necessary Kerberos 5 tickets. For example "kinit" with user password, or kinit with "key tab" if the client is running on behalf of some other "service". -1. Depending on the configuration, the chunk servers and the meta server perform +8. Depending on the configuration, the chunk servers and the meta server perform Kerberos 5 mutual authentication, or X509 SSL/TLS mutual authentication. TLS/PSK with a single fixed “secret” key is supported, but is not recommended, as with this method unique server identity (other than the shared “secret” key) is not supported and “revocation” / “blacklisting” cannot work. -1. Each chunk server creates and periodically updates the QFS keys, and +9. Each chunk server creates and periodically updates the QFS keys, and communicates the newly created key to the meta server. The meta server uses the most recently communicated key to create QFS chunk server authentication and chunk access tokens. As a consequence the tokens can be used only with one chunk server. -1. The original ("job launch") client running under user account, and meta - server can use Kerberos 5 or X509 mutual authentication. The map / reduce job - launch client obtains the QFS authentication delegation token and forwards - this token to the job tracker. -1. QFS meta server creates QFS authentication delegation tokens that can be - forwarded to QFS clients and / or other parties to act on behalf - (impersonate) a user with QFS. -1. By default only clients authenticated with the Kerberos 5 or X509 mechanisms - are allowed to request QFS authentication delegation tokens. -1. QFS meta server creates QFS chunk server access tokens. -1. The meta server performs mutual authentication with QFS authentication - delegation tokens. -1. The chunk servers only perform mutual client authentication with QFS chunk - server access tokens. -1. Chunk server to chunk server authentication uses QFS keys created by the - destination chunk server and communicated to the meta server. -1. Map / reduce tasks use QFS authentication delegation tokens to authenticate - with the meta server. -1. Job tracker performs Kerberos 5 or X509 mutual authentication with the meta - server. -1. Task tracker performs Kerberos 5 mutual authentication with the job tracker. -1. Alternatively job tracker and task tracker can use Kerberos 5 mutual - authentication with the QFS meta server and use QFS to establish a secure - communication channel between them. For example, the job tracker, sort - controller, and sort master can use QFS to establish TLS-PSK shared keys. - Generic support to establish shared secrets between the services could be - added to the meta server in the form of drop boxes stored entirely in - metadata. However, this is currently not implemented. It is not clear - how/whether this would work with respect to metadata backups. -1. 
All file system components including meta server, chunk servers, and QFS - clients enforce the level of protection required by the configuration to - combat against protection level downgrade attacks. For example if the - initiator (client) and / or the responder (server) require mutual - authentication, confidentiality, and integrity to be enabled, initiator and / - or responder will declare a communication failure, unless all of these - requirements are fulfilled. +10. The original ("job launch") client running under user account, and meta + server can use Kerberos 5 or X509 mutual authentication. The map / reduce job + launch client obtains the QFS authentication delegation token and forwards + this token to the job tracker. +11. QFS meta server creates QFS authentication delegation tokens that can be + forwarded to QFS clients and / or other parties to act on behalf + (impersonate) a user with QFS. +12. By default only clients authenticated with the Kerberos 5 or X509 mechanisms + are allowed to request QFS authentication delegation tokens. +13. QFS meta server creates QFS chunk server access tokens. +14. The meta server performs mutual authentication with QFS authentication + delegation tokens. +15. The chunk servers only perform mutual client authentication with QFS chunk + server access tokens. +16. Chunk server to chunk server authentication uses QFS keys created by the + destination chunk server and communicated to the meta server. +17. Map / reduce tasks use QFS authentication delegation tokens to authenticate + with the meta server. +18. Job tracker performs Kerberos 5 or X509 mutual authentication with the meta + server. +19. Task tracker performs Kerberos 5 mutual authentication with the job tracker. +20. Alternatively job tracker and task tracker can use Kerberos 5 mutual + authentication with the QFS meta server and use QFS to establish a secure + communication channel between them. For example, the job tracker, sort + controller, and sort master can use QFS to establish TLS-PSK shared keys. + Generic support to establish shared secrets between the services could be + added to the meta server in the form of drop boxes stored entirely in + metadata. However, this is currently not implemented. It is not clear + how/whether this would work with respect to metadata backups. +21. All file system components including meta server, chunk servers, and QFS + clients enforce the level of protection required by the configuration to + combat against protection level downgrade attacks. For example if the + initiator (client) and / or the responder (server) require mutual + authentication, confidentiality, and integrity to be enabled, initiator and / + or responder will declare a communication failure, unless all of these + requirements are fulfilled. ## Authorization + The existing UNIX like QFS file access permissions are used for Authorization purposes. 1. QFS meta server performs authorization for the user name obtained as a result of the QFS client and the meta server mutual authentication, or QFS user ID obtained from the QFS delegation token. -1. Effective user name is derived from Kerberos 5 principal, client’s X509 +2. Effective user name is derived from Kerberos 5 principal, client’s X509 certificate common name, or from QFS user ID stored in QFS delegation token. The meta server is assigned its own specific Kerberos ”service" key, and/or X509 certificate, and X509 certificates of the trusted authorities. 
The client obtains the corresponding Kerberos 5 service ticket, or uses conventional X509 certificate verification in order to complete the mutual authentication with the meta server. -1. Effective group. Kerberos 5, and X509 certificate has no concept of groups, +3. Effective group. Kerberos 5, and X509 certificate has no concept of groups, therefore a different mechanism must be used to establish QFS group membership. The meta server obtains group membership from the standard UNIX group database. Specifically, the "local" database (/etc/groups) on the meta server host. If required, in the future, a different group membership mechanism can be implemented. -1. The meta server uses the meta server host’s UNIX user and group databases to +4. The meta server uses the meta server host’s UNIX user and group databases to obtain the mapping of user and group names to user and group ID. The mapping is required due to absence of user and group ID concepts in Kerberos 5 client principal, and client’s X509 certificate. The primary reason to use such a @@ -127,7 +132,7 @@ purposes. backward compatibility with the previous meta server checkpoint and transaction log versions by providing initial user and group IDs to names mapping. -1. With authentication enabled the QFS client library uses the same user and +5. With authentication enabled the QFS client library uses the same user and group names to numeric IDs mapping as the meta server by maintaining a cached view of the meta server’s respective tables. To maintain backward compatibility the client host’s local user and group database is used to map @@ -135,7 +140,9 @@ purposes. the same user and group IDs as the meta server. ## Delegation + QFS authentication delegation and access tokens. + 1. QFS authentication delegation token is a tuple: { User ID, Random Sequence, QFS Key ID, Flags, Issued Time, Expiration Time Delta, Signature }. The signature is HMAC SHA-1 of all fields, except the signature fields itself, @@ -144,8 +151,7 @@ QFS authentication delegation and access tokens. field. The implicit subject field is not part of the token. Presently implicit subject field is used with chunk access tokens, where the subject is QFS chunk ID, or QFS write ID. - -1. The meta and chunk servers create and store in memory QFS Key ID and the +2. The meta and chunk servers create and store in memory QFS Key ID and the corresponding QFS Keys as a dictionary of key value pairs: Key = {QFS Key ID} Value = { QFS Key }. QFS Key consists of randomly generated 384 bits (48 bytes), and QFS Key ID consists of randomly generated 8 bytes (64 bit). @@ -156,53 +162,45 @@ QFS authentication delegation and access tokens. discarded. The QFS Key lifetime implicitly defines the QFS Delegation token renewal interval, and QFS Access tokens maximum expiration time. The key lifetime cannot be lower than QFS lease expiration time — 5 minutes. - -1. The QFS delegation tokens must be renewed before the corresponding QFS Key +3. The QFS delegation tokens must be renewed before the corresponding QFS Key expires. The meta server renews the token by replacing the Key ID and re-computing the Signature field with the current key; all other token fields remain the same. Only clients authenticated with Kerberos 5 or X509 authentication methods are allowed to renew the delegation tokens. - -1. Delegation token cancellation. The meta server maintains a set of all +4. Delegation token cancellation. The meta server maintains a set of all currently valid cancelled tokens. 
The token Key-ID and Signature fields are not stored in this set entry. Doing so allows a single entry to match all possible renewed token variants created with different QFS Key and QFS Key IDs. To cancel a delegation token the default configuration requires the client to be authenticated with Kerberos 5 or X509 authentication methods and present a valid token to be cancelled along with the token’s session key. - -1. The meta server can be configured with the list of user or group names that +5. The meta server can be configured with the list of user or group names that are allowed to renew and cancel delegation tokens that were issued to users other than the currently authenticated user. For example, the configuration might allow job tracker [user] to renew and cancel delegation tokens that were issued to users different than job tracker user. - -1. The delegation and access tokens, along with the corresponding QFS Key are +6. The delegation and access tokens, along with the corresponding QFS Key are used to create the QFS authenticated session key. The session key is created by computing SHA-384 over the QFS Token and QFS Key concatenation. - -1. The meta server computes and communicates to the client the QFS session key +7. The meta server computes and communicates to the client the QFS session key along with the QFS delegation token. - -1. The Delegation Token and Session Key are used with TLS PSK authentication. +8. The Delegation Token and Session Key are used with TLS PSK authentication. The QFS client uses session key as a shared secret, and passes the token to the meta server in the TLS client hello message. - -1. Chunk server access tokens only differ from the delegation token in the +9. Chunk server access tokens only differ from the delegation token in the Flags field value. The chunk server access tokens and the corresponding session keys are used by the client or other chunk servers acting as client to authenticate with the chunk server and, unless explicitly disabled by configuration, attain communication secrecy and integrity. - -1. Chunk access token differs from the delegation token in the Flags field - value. The token’s signature “includes” implicit chunk or write ID. The chunk - access tokens are used to authorize chunk access with the chunk server, with - the minimum communication security and integrity levels configured on the - client and meta server. - -1. A number of Flags bits are currently reserved for future extensions. +10. Chunk access token differs from the delegation token in the Flags field + value. The token’s signature “includes” implicit chunk or write ID. The + chunk access tokens are used to authorize chunk access with the chunk + server, with the minimum communication security and integrity levels + configured on the client and meta server. +11. A number of Flags bits are currently reserved for future extensions. ## Chunk Access + Authentication and authorization with chunk server and chunk access tokens. 1. The meta server creates and returns to the QFS client chunk server and chunk @@ -214,24 +212,21 @@ Authentication and authorization with chunk server and chunk access tokens. access token. The token’s chunk access (read or write) permissions, and communication security and integrity must match requested chunk access mode, and communication (client connection) security and integrity level. - -1. When chunk is created or opened for write (“allocated” in QFS terms) the +2. 
## Implementation Considerations + 1. Using the higher level SASL API was considered. SASL presents a more generic API, and has a generic enough "plugin" system that allows adding possible new "security" mechanisms. The considered Cyrus SASL implementation adds @@ -255,8 +251,7 @@ Authentication and authorization with chunk server and chunk access tokens. a future SASL pluggable authentication mechanism is unlikely to be useful or easily adoptable / switchable for the task at hand. For these reasons it is hard to recommend using SASL on top of GSSAPI. - -1. TLS Kerberos 5 ciphers were also considered. One problem with the TLS +2. TLS Kerberos 5 ciphers were also considered. One problem with the TLS Kerberos 5 ciphers is that the existing standard / RFC and openssl implementation do not include AES ciphers. With the AES-NI instruction set AES encryption looks very attractive: 128 bit AES outperforms the RC4 cipher with 770 @@ -270,42 +265,36 @@ Authentication and authorization with chunk server and chunk access tokens. openssl to support AES encrypted tickets, relying on such a patch for all supported platforms would increase code maintenance overhead, and decrease code "portability" and adoption. - -1. Initial prototyping and studying the example code suggested that the +3. Initial prototyping and studying the example code suggested that the increase in complexity by using the Kerberos 5 API directly instead of using GSSAPI is fairly small. The practical implications of the differences between the two popular Kerberos implementations (MIT and Heimdal) for the purpose of QFS implementation are fairly small, and it is relatively straightforward to support both. On the other hand, the GSSAPI layer does not seem optimal and provides limited access to the underlying Kerberos library. - -1. In order to reduce chunk server and client cpu utilization, secrecy and +4. 
In order to reduce chunk server and client cpu utilization, secrecy and integrity of client and chunk server communications can be turned off by changing the meta server configuration. In this case the TLS layer is shut down and bypassed after the connection is established and authenticated. In such a case, after the authentication finishes, the existing generic QFS network IO code path is used with no modifications. - -1. With Kerberos 5 authentication the client, and/or chunk server +5. With Kerberos 5 authentication the client, and/or chunk server authenticating with the meta server, issues standard AP_REQ messages to the meta server. The meta server validates AP_REQ, obtains a Kerberos principal, and then validates the user name derived from the Kerberos principal. The session key in the Kerberos ticket is used as the shared key with the TLS-PSK exchange. The Kerberos session key is retrieved from the Kerberos 5 ticket by the service (meta server) and the client. - -1. Backward compatibility. Only QFS client library and the corresponding +6. Backward compatibility. Only the QFS client library and the corresponding "client" tools / utilities will maintain backward compatibility with the previous QFS versions with no authentication support. For example, one of the existing meta server RPCs is used by the client to detect authentication support. The status code of the LOOKUP RPC performed on the root directory is used for this purpose. - -1. To reduce meta server cpu utilization the number of client connections to +7. To reduce meta server cpu utilization the number of client connections to the meta server from a single client is reduced from 2 to 1, at the cost of a minor complexity increase. - -1. Kerberos 5 infrastructure / KDC load and performance consideration. +8. Kerberos 5 infrastructure / KDC load and performance consideration. Services', for example, chunk servers', service tickets should have reasonably long lifetimes in order to keep KDC load at a reasonable level. Moreover, the service tickets on a large enough cluster might need to be @@ -314,11 +303,13 @@ Authentication and authorization with chunk server and chunk access tokens. for chunk server authentication to the meta server instead. ## Examples + Write Path with Replication -![qfs_security_example](https://cloud.githubusercontent.com/assets/412533/7740543/ddcdeff6-ffa5-11e4-8ac6-a3f7b6675756.png) +![qfs_security_example](images/QFS-Kerberos-Security-Design/Write-Path-with-Replication.png) + +### Details -### Details: - **Step 3.** The client sends the AP_REQ Kerberos message as a payload of the QFS authenticate RPC. - **Step 4.** Meta server returns an authentication RPC reply that says "use diff --git a/wiki/QFS-on-S3.md b/wiki/QFS-on-S3.md index a16fae483..4e29784b0 100644 --- a/wiki/QFS-on-S3.md +++ b/wiki/QFS-on-S3.md @@ -1,16 +1,19 @@ # QFS on S3 Guide + Starting with version 1.2.0, QFS provides the ability to create a filesystem instance that is backed by Amazon S3. This guide documents the instructions for how to enable Amazon S3 support on QFS. For a general discussion of existing features, design and a performance comparison with EMRFS, we encourage you to read [the blog post](https://www.quantcast.com/blog/quantcast-file-system-on-amazon-s3/). -## For the Impatient! +## For the Impatient + [sample_setup.py](https://github.com/quantcast/qfs/blob/master/examples/sampleservers/sample_setup.py) has been updated so that you can quickly spin up a local QFS on S3 instance and try it out.
Before invoking the script, first uncomment and fill in the following fields in the [sample_setup.cfg](https://github.com/quantcast/qfs/blob/master/examples/sampleservers/sample_setup.cfg) file: -``` + +```properties # S3 properties: for S3 support, uncomment and # set the correct values depending on your AWS S3 bucket # and IAM settings. @@ -18,10 +21,13 @@ bucketName = accessKeyId = secretAccessKey = ``` + Once this is done, run the following command: + +```sh +"$QFS_SOURCE_DIR"/examples/sampleservers/sample_setup.py --object-store -a install ``` -$QFS_SOURCE_DIR/examples/sampleservers/sample_setup.py --object-store -a install -``` + If the remaining fields in sample_setup.cfg are left unchanged, this creates a QFS instance that consists of one metaserver and three chunkservers (serving as S3 access proxies) all running on the localhost. If you want to @@ -30,50 +36,64 @@ please see the following files: ~/qfsbase/meta/conf/MetaServer.prp and ~/qfsbase Next, use the existing QFS tool and run the following command to copy a text file under the root directory of the QFS instance that we just created: + +```sh +"$QFS_BUILD_DIR"/bin/tools/cptoqfs -s 127.0.0.1 -p 20000 -d TestFile.txt -k /TestFile.txt ``` -$QFS_BUILD_DIR/bin/tools/cptoqfs -s 127.0.0.1 -p 20000 -d TestFile.txt -k /TestFile.txt -``` + Alternatively, you can run the binary of [qfssample_main.cc](https://github.com/quantcast/qfs/blob/master/examples/cc/qfssample_main.cc), which performs various operations against a given filesystem instance including file creation, read, write, rename and deletion. + +```sh +"$QFS_BUILD_DIR"/bin/examples/qfssample -s localhost -p 20000 ``` -$QFS_BUILD_DIR/bin/examples/qfssample -s localhost -p 20000 -``` + Once done testing, you can bring down the QFS instance with the following command: -``` -$QFS_SOURCE_DIR/examples/sampleservers/sample_setup.py --object-store -a uninstall + +```sh +"$QFS_SOURCE_DIR"/examples/sampleservers/sample_setup.py --object-store -a uninstall ``` ## Enabling S3 Support on QFS + Next, we want to show how to enable Amazon S3 support on a QFS instance by modifying chunkserver and metaserver configuration files manually. The changes for each are presented below. -### chunkserver configuration changes: -1. *Add and set [chunkServer.objectDir](https://github.com/quantcast/qfs/blob/master/conf/ChunkServer.prp#L310) parameter.* - chunkServer.objectDir is similar to chunkServer.chunkDir parameter. The difference is that -instead of pointing to an existing directory in chunkserver, it points to an hypothetical -directory in which chunkservers can place S3 files. For example, the corresponding line -for this parameter can read: - ``` +### chunkserver configuration changes +1. *Add and set [chunkServer.objectDir](https://github.com/quantcast/qfs/blob/master/conf/ChunkServer.prp#L310) +parameter.* chunkServer.objectDir is similar to the chunkServer.chunkDir parameter. +The difference is that instead of pointing to an existing directory in the +chunkserver, it points to a hypothetical directory in which chunkservers can +place S3 files. For example, the corresponding line for this parameter can read: + + ```properties chunkServer.objectDir = s3://myfirstbucket. ``` - In the example, the part after "s3://" in the specified object directory, i.e. "myfirstbucket." -(including the . sign at the end), is treated as the configuration suffix. Metaserver -associates the configuration suffix with an S3 bucket name and AWS credentials to access -the given bucket. Corresponding metaserver configuration changes are described in the next section. - This is the only required parameter that needs to be set up in chunkserver configuration file for S3 support. For the optional S3 parameters in chunkserver configuration file, please visit [here](https://github.com/quantcast/qfs/blob/master/conf/ChunkServer.prp#L290). + + In the example, the part after "s3://" in the specified object directory, +i.e. "myfirstbucket." (including the . sign at the end), is treated as the +configuration suffix (a sketch of how the suffix maps to metaserver parameter +names follows the note below). The metaserver associates the configuration suffix with an S3 +bucket name and AWS credentials to access the given bucket. Corresponding +metaserver configuration changes are described in the next section. This is the +only required parameter that needs to be set up in the chunkserver configuration +file for S3 support. For the optional S3 parameters in the chunkserver configuration +file, see the [annotated chunkserver configuration file](https://github.com/quantcast/qfs/blob/master/conf/ChunkServer.prp#L290). \* **One can specify multiple object directories with chunkServer.objectDir parameter. -Please refer to [Specifying Multiple Object Store Directories](#MultipleObjectStoreSection) section in this guide for more details.** +Please refer to [Specifying Multiple Object Store Directories](#specifying-multiple-object-store-directories) section in this guide for more details.**
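The following sketch illustrates the suffix convention described above. It is a hypothetical helper written for exposition only, not the actual QFS parsing code; the real implementation is in the QFS sources.

```java
// Hypothetical helper illustrating the configuration suffix convention.
final class ObjectDirSuffix {
    // "s3://myfirstbucket." -> "myfirstbucket." (trailing dot included)
    static String configSuffix(String objectDir) {
        final String scheme = "s3://";
        if (!objectDir.startsWith(scheme)) {
            throw new IllegalArgumentException(
                "not an object store directory: " + objectDir);
        }
        return objectDir.substring(scheme.length());
    }

    // Because the suffix keeps its trailing dot, metaserver parameter names
    // are plain concatenations, e.g.
    // "chunkServer.diskQueue." + "myfirstbucket." + "bucketName".
    static String metaParam(String suffix, String name) {
        return "chunkServer.diskQueue." + suffix + name;
    }
}
```

With the object directory "s3://myfirstbucket.", `metaParam(configSuffix(dir), "bucketName")` yields `chunkServer.diskQueue.myfirstbucket.bucketName`, matching the parameters shown in the next section.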
+ +### metaserver configuration changes -### metaserver configuration changes: 1. *Add and set [metaServer.objectStoreEnabled](https://github.com/quantcast/qfs/blob/master/conf/MetaServer.prp#L1200) parameter.* This parameter enables object store in metaserver. For example: - ``` + + ```properties metaServer.objectStoreEnabled = 1 ``` + 2. *For each configuration suffix specified in the chunkserver configuration file, specify the [chunkServer.diskQueue.<object-store-directory-prefix>bucketName](https://github.com/quantcast/qfs/blob/master/conf/MetaServer.prp#L1245), [chunkServer.diskQueue.<object-store-directory-prefix>accessKeyId](https://github.com/quantcast/qfs/blob/master/conf/MetaServer.prp#L1248), @@ -81,7 +101,8 @@ and [chunkServer.diskQueue.<object-store-directory-prefix>secretAccessKey] parameters.* For example, using the same object directory specified in the chunkserver configuration example, one can have: - ``` + + ```properties chunkServer.diskQueue.myfirstbucket.bucketName = chunkServer.diskQueue.myfirstbucket.accessKeyId = chunkServer.diskQueue.myfirstbucket.secretAccessKey = @@ -96,7 +117,8 @@ and [metaServer.maxReplicasPerRSFile](https://github.com/quantcast/qfs/blob/mast These two parameters tell the metaserver to store all files in S3 object store, regardless of the file create parameters sent by QFS clients. For example: - ``` + + ```properties metaServer.maxReplicasPerFile = 0 metaServer.maxReplicasPerRSFile = 0 ``` @@ -108,8 +130,9 @@ In their absence, QFS clients need to specifically indicate each time that a file is to be stored in S3 object store.
For instance, to copy a file in S3 object store, one would need to append the “-r 0” argument to the cptoqfs call: - ``` - $QFS_BUILD_DIR/bin/tools/cptoqfs -s 127.0.0.1 -p 20000 -d TestFile.txt -k /TestFile.txt -r 0 + + ```sh + "$QFS_BUILD_DIR"/bin/tools/cptoqfs -s 127.0.0.1 -p 20000 -d TestFile.txt -k /TestFile.txt -r 0 ``` “-r 0” tells the metaserver to use 0 replicas for TestFile.txt and that @@ -123,12 +146,12 @@ may not succeed depending on chunkserver configurations. web UI reveals all metaserver configuration parameters including AWS credentials used to access the specified bucket. One can prevent this by setting this parameter like the following: - ``` + + ```properties metaServer.pingDoNotShow = chunkServer.diskQueue.myfirstbucket.accessKeyId chunkServer.diskQueue.myfirstbucket.secretAccessKey ``` - For the other optional S3 parameters in the metaserver configuration file, please visit [here](https://github.com/quantcast/qfs/blob/master/conf/MetaServer.prp#L1196). - + For the other optional S3 parameters in the metaserver configuration file, see the [annotated metaserver configuration file](https://github.com/quantcast/qfs/blob/master/conf/MetaServer.prp#L1196). That is it! Once these parameters are set, one can start up the QFS metaserver and chunkservers the same way as for a regular QFS instance. @@ -138,7 +161,7 @@ chunkservers the same way he/she would do for a regular QFS instance. The following figure shows how the chunkservers (access proxies), which are configured with no chunk directories but with object directories, of a local QFS on S3 instance are displayed in the QFS web UI: -Web UI Default +![Web UI Default](images/S3-Guide/webui_default.png) By default, the QFS web UI displays the chunkservers that do not have any associated chunk directories on local storage in light red color. Although this helps with identifying @@ -151,28 +174,30 @@ To override the default behaviour, one can set the in the web UI configuration file. If enabled, the web UI makes the assumption that the user intends to use the chunkservers as S3 access proxies only, and displays them normally: -Web UI when object mode is on +![Web UI when object mode is on](images/S3-Guide/webui_object_store_mode.png) -## Specifying Multiple Object Store Directories +## Specifying Multiple Object Store Directories QFS allows users to specify more than one object store directory in the chunkserver configuration file and then associate each object store directory with a different S3 bucket along with the IAM configurations to access the bucket. One crucial point here is that different object store directories must be assigned to different storage tiers. Otherwise, QFS returns an error for object store reads -and writes until the configuration is fixed. For more details, please see -[here](https://github.com/quantcast/qfs/blob/master/conf/ChunkServer.prp#L296). +and writes until the configuration is fixed. For more details, +[please see here](https://github.com/quantcast/qfs/blob/master/conf/ChunkServer.prp#L296). The following example illustrates how chunkserver and metaserver configuration files can be modified to make use of multiple object store directories. *chunkserver configuration file:* - ``` -chunkServer.objectDir = s3://myfirstbucket. s3://mysecondbucket. + +```properties +chunkServer.objectDir = s3://myfirstbucket. s3://mysecondbucket. ``` *metaserver configuration file:* - ``` + +```properties chunkServer.objecStorageTierPrefixes = s3://mysecondbucket.
14 # by default, an object store directory is assigned to tier 15, so we skip specifying the tier for "myfirstbucket". chunkServer.diskQueue.myfirstbucket.bucketName = chunkServer.diskQueue.myfirstbucket.accessKeyId = @@ -182,8 +207,8 @@ chunkServer.diskQueue.mysecondbucket.accessKeyId = chunkServer.diskQueue.mysecondbucket.secretAccessKey = ``` -## Using Amazon S3 and Chunkserver Local Storage Together - +## Using Amazon S3 and Chunkserver Local Storage Together + QFS offers the flexibility to either create a filesystem instance that can solely read/write from/to Amazon S3, or to use a mixed mode in which both Amazon S3 and local storage resources are used for file storage. Which mode is currently @@ -191,11 +216,11 @@ being used by a QFS instance depends on the co-existence of chunkServer.chunkDir and chunkServer.objectDir parameters in the chunkserver configuration file. Specification of both parameters allows the metaserver to use both resources (S3 and local) for file storage/retrieval. On the other hand, if chunkServer.chunkDir is omitted, -the filesystem instance will be backed by Amazon S3 only. +the filesystem instance will be backed by Amazon S3 only. + +## Further Help and Suggestions -## Further Help and Suggestions - If you cannot find the information you are looking for in this document, -or have suggestions for how the guide can be improved, please reach out to us. +or have suggestions for how the guide can be improved, please reach out to us. ![Quantcast](//pixel.quantserve.com/pixel/p-9fYuixa7g_Hm2.gif?labels=opensource.qfs.wiki) diff --git a/wiki/README b/wiki/README index a5769de09..8c133d51e 100644 --- a/wiki/README +++ b/wiki/README @@ -1,3 +1,5 @@ +# README + This repository is a subtree merge of the quantcast/qfs.git/wiki directory. The community is encouraged to update the QFS wiki as they wish. However, public editing has been turned off on the github wiki itself. Instead, the wiki @@ -11,4 +13,3 @@ for a few things: In order to update the wiki on Github with the latest content within the quantcast/qfs.git/wiki directory, simply run `make` in this directory.
- diff --git a/wiki/Release-Notes-2.2.8.md b/wiki/Release-Notes-2.2.8.md new file mode 100644 index 000000000..54fe0fabf --- /dev/null +++ b/wiki/Release-Notes-2.2.8.md @@ -0,0 +1,182 @@ +# QFS Version 2.2.8 Release Notes + +## Major Features and Improvements + +### Java Compatibility Enhancements + +- **Java 9+ Support**: Implemented new-style Java cleanup using `ref.Cleaner` + while maintaining backward compatibility with pre-Java 9 versions +- **QFS Access Refactoring**: Split `KfsAccess` into `KfsAccess` and + `KfsAccessBase` classes to support both reference cleaner interface (Java 9+) + and finalize() methods (pre-Java 9) +- **Dual Module Support**: Created `qfs-access-pre-9` module for compatibility + with older Java versions +- **Exception Handling**: Changed release and close methods to raise + `IOException` for consistency + +### Benchmark Improvements + +- **MStress Hadoop 3.x Support**: Reworked MStress benchmark to use Hadoop 3.x + APIs with `FileSystem` class instead of `DFSClient` +- **Self-contained JAR**: Build system now creates self-contained JAR with + dependencies +- **Java 6 Compatibility**: Maintained compatibility by removing diamond + operator usage + +### Platform Support Updates + +- **Amazon Linux 2023**: Added support for Amazon Linux 2023 builds +- **ARM64 Architecture**: Added ARM64 build support +- **Rocky Linux 9**: Improved Rocky Linux 9 build stability with cache + management fixes +- **Ubuntu Updates**: Migrated CI builds from Ubuntu 18.04 to Ubuntu 20.04 + +## Component-Specific Changes + +### QFS Client Library + +- Fixed error handling with object store based chunk write lease maintenance + logic +- Reset retry count after successful write ID allocation +- Removed unused includes in `Writer.cc` +- Fixed typos in method names + +### Meta Server + +- Fixed allocate RPC short RPC flag setting in re-allocate path with object + store based chunks +- Added missing flag setting for proper operation + +### Build System and CMake + +- **Boost Compatibility**: Improved Boost version detection and compatibility + - Fixed boost version conditionals for `BOOST_SP_USE_QUICK_ALLOCATOR` + - Added `Boost_MINOR_VERSION` check in addition to `Boost_MAJOR_VERSION` + - Used `Boost_MAJOR_VERSION` instead of `Boost_VERSION_MAJOR` for older + FindBoost compatibility + - Disabled deprecated allocator options for Boost 1.87+ +- **CMake Improvements**: + - Avoided `LESS_EQUAL` operator for older CMake versions + - Reformatted and improved consistency of CMakeLists.txt files + - Fixed comments and folded long lines +- **Maven Updates**: Updated Maven URLs to latest versions +- **YUM Package Management**: Improved package management for CentOS/Rocky Linux + with cache clearing and `--nobest` option + +### Common Library + +- Added `constexpr` function qualifier definition for C++11+ compatibility +- Fixed template syntax errors for newer Clang versions +- Fixed typo in `StBuffer::Swap()` method +- Used `constexpr` for Base64 and CryptoKeys size calculations + +### IO Library + +- Enhanced Base64 size and padding calculation methods with `constexpr` +- Improved CryptoKeys size calculation methods + +### GitHub Actions and CI + +- Removed support for end-of-life distributions (Debian 10, Ubuntu 18.04) +- Implemented build optimizations to avoid rate limiting +- Limited parallel builds and connections to prevent repository fetch issues +- Re-enabled and stabilized Rocky Linux builds +- Added Amazon Linux and ARM64 build workflows + +### Java Build System + +- Removed extraneous 
project arguments from Maven commands +- Made variable names consistent in `javabuild.sh` +- Eliminated Java compiler warnings across multiple modules +- Fixed code formatting and style issues + +### Testing and Scripts + +- Reformatted endurance test scripts for consistency +- Fixed `ulimit -h` return value handling in test scripts +- Improved Docker invocation in build scripts +- Fixed syntax errors in various shell scripts + +## Bug Fixes + +### Notable Fixes + +- **Object Store Chunks**: Fixed RPC flag setting for object store based chunk + operations +- **Write Lease Maintenance**: Improved error handling and retry logic for chunk + write operations +- **Template Compatibility**: Fixed template syntax issues with newer compiler + versions + +### Build and Compatibility Fixes + +- Fixed CMake compatibility issues with older versions +- Resolved Boost library detection and usage issues +- Fixed Java finalizer handling for different Java versions +- Corrected Maven command arguments and build script issues + +### Code Quality Improvements + +- Eliminated compiler warnings across C++ and Java codebases +- Fixed code formatting and style issues +- Removed trailing whitespace and folded long lines +- Updated copyright years across affected files + +## Platform and Dependency Updates + +### Supported Platforms + +- **Added**: Amazon Linux 2023, ARM64 architecture +- **Removed**: Debian 10 (end-of-life), Ubuntu 18.04 (CI only) +- **Improved**: Rocky Linux 9, CentOS 9 build stability + +### Dependencies + +- Updated Maven URLs to latest versions +- Improved Boost library compatibility (1.87+ support) +- Enhanced Java version support (Java 6 through Java 9+) + +## Documentation and Maintenance + +### Documentation Updates + +- Updated Binary Distributions wiki page +- Added Amazon Linux 2023 to build status badges +- Updated README files with new platform support information + +### Code Maintenance + +- Updated copyright years across multiple files +- Improved code formatting and consistency +- Enhanced error messages and logging +- Cleaned up unused includes and dependencies + +## Possible Breaking Changes + +### Java API Changes + +- Release and close methods in Java access classes now throw `IOException` +- Some internal class names have been changed (added "Base" suffix to native + classes) +- New module structure with `qfs-access-pre-9` for older Java versions + +### Build System Changes + +- MStress benchmark now requires Hadoop 3.4.1 for full functionality +- Some CMake minimum version requirements may have changed due to syntax updates + +## Migration Notes + +### For Java Users + +- Applications using Java 9+ will automatically use the new reference cleaner + interface +- Pre-Java 9 applications will continue to use the traditional finalize() + approach +- Exception handling may need updates due to new `IOException` throwing behavior + +### For Build Systems + +- Rocky Linux 9 and CentOS 9 builds now use `--nobest` option for yum updates +- Maven URL updates may require build script modifications +- Boost 1.87+ users will see deprecated allocator warnings resolved
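The Java 9+ changes summarized in the release notes above replace finalize() with java.lang.ref.Cleaner. The sketch below shows the general Cleaner pattern for releasing native state; the class and member names are hypothetical and do not reproduce the actual qfs-access implementation.

```java
import java.lang.ref.Cleaner;

// Illustrative Cleaner pattern; names are hypothetical.
public class NativeHandle implements AutoCloseable {
    private static final Cleaner CLEANER = Cleaner.create();

    // The cleaning action must not reference the enclosing object, or the
    // object never becomes phantom reachable and the action never runs.
    private static final class State implements Runnable {
        private final long ptr; // native resource handle
        State(long ptr) { this.ptr = ptr; }
        @Override public void run() {
            // Release the native resource identified by ptr here.
        }
    }

    private final Cleaner.Cleanable cleanable;

    public NativeHandle(long ptr) {
        // Runs State.run() on close(), or as a safety net when this
        // object becomes phantom reachable.
        this.cleanable = CLEANER.register(this, new State(ptr));
    }

    @Override public void close() {
        cleanable.clean(); // idempotent; runs the action at most once
    }
}
```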
diff --git a/wiki/Release-Notes.md b/wiki/Release-Notes.md index deaaf098e..e0e357b19 100644 --- a/wiki/Release-Notes.md +++ b/wiki/Release-Notes.md @@ -1,161 +1,168 @@ # QFS release notes +## [QFS version 2.2.8](Release-Notes-2.2.8.md) + +### Release Highlights (2.2.8) + +1. **Java 9+ Compatibility:** Refactoring of Java access classes with dual + support for modern and legacy Java versions +2. **MStress Hadoop 3.x Support:** Updated benchmark tool to work with Hadoop + 3.x APIs +3. **Enhanced Platform Support:** Added Amazon Linux 2023 and ARM64 architecture + support +4. **Build System Improvements:** Better Boost compatibility, CMake fixes, and + CI/CD enhancements + +### Key Components Updated (2.2.8) + +1. **QFS Client Library**: Bug fixes for object store operations and error + handling +2. **Meta Server**: RPC flag setting fixes for object store chunks +3. **Build System**: Extensive CMake and Maven improvements +4. **Java Shim**: Complete modernization for Java 9+ compatibility +5. **Common/IO Libraries**: Compiler compatibility fixes and constexpr + enhancements + +### Platform Changes (2.2.8) + +1. **Added**: Amazon Linux 2023, ARM64 builds +2. **Removed**: Debian 10, Ubuntu 18.04 (EOL) +3. **Improved**: Rocky Linux 9 build stability + ## QFS version 2.2.7 -## New features +### New features (2.2.7) 1. Support for python3. Native / platform independent python code is now -compatible with python2 and python3, including QFS meta server web UI and meta -server backup script. QFS python module / bindings now is only compatible with -python3, and python2 is no longer supported. - + compatible with python2 and python3, including QFS meta server web UI and + meta server backup script. QFS python module / bindings are now only + compatible with python3, and python2 is no longer supported. 2. All required QFS shared libraries are now installed along with QFS python -module, making QFS module installation self contained. On macOS and linux -runtime relative linker paths are now used in order to make installed QFS shared -libraries relocatable. With this explicitly specifying QFS libraries runtime -linkers paths with python module is no longer required. Python QFS module wheel -is now built and included into QFS tarball. + module, making QFS module installation self contained. On macOS and linux + runtime relative linker paths are now used in order to make installed QFS + shared libraries relocatable. With this, explicitly specifying QFS library + runtime linker paths for the python module is no longer required. Python QFS + module wheel is now built and included into the QFS tarball. -## Minor improvements +### Minor improvements (2.2.7) 1. Benchmarks mstress build now uses maven instead of ant, and is now included -in QFS build and tarball by default. - + in QFS build and tarball by default. 2. QFS python module is now built, tested, and included into QFS tarball if -python 3.6 or higher is available. - + python 3.6 or higher is available. 3. ARM Neon support is now enabled for QFS Reed-Solomon and GF complete -libraries on Mac OS with Apple silicon. + libraries on Mac OS with Apple silicon. ## QFS version 2.2.6 -## New features +### New features (2.2.6) 1. Added go language bindings. -## Bug fixes +### Bug fixes (2.2.6) 1. Increase default watchdog poll interval by 0.15 seconds, to 1.15 seconds, in order to -avoid spurious timeouts during idle time due to 1 second default net manager -sleep interval. Change poll interval dependent default parameters calculation -accordingly. + avoid spurious timeouts during idle time due to the 1 second default net manager + sleep interval. Change poll interval dependent default parameters calculation + accordingly. -## Minor improvements +### Minor improvements (2.2.6) 1. Additional object store chunk server / AP assignment modes. One is to choose AP -with matching rack only (i.e. 
with no fall back to choosing from all available -APs), and the other is to choose AP with with matching rack only if client rack -is set / known, otherwise falling back to choosing any available AP. For details -please consult metaServer.readUseProxyOnDifferentHostMode and -metaServer.writeUseProxyOnDifferentHostMode parameters descriptions in annotated -meta server configuration file. + with matching rack only (i.e. with no fall back to choosing from all + available APs), and the other is to choose AP with matching rack only if + client rack is set / known, otherwise falling back to choosing any available + AP. For details please consult the metaServer.readUseProxyOnDifferentHostMode and + metaServer.writeUseProxyOnDifferentHostMode parameters descriptions in the + annotated meta server configuration file. 2. Use TLS 1.2 with openssl versions prior to 1.1. 3. Implement script to calculate object store space utilization and count number -of files as function of file access time by scanning meta server checkpoint. + of files as a function of file access time by scanning the meta server checkpoint. 4. Compile with openssl 3.0 5. Build on Ubuntu 22.04 ## QFS version 2.2.5 -## Bug fixes +### Bug fixes (2.2.5) 1. Meta server: keep up to 8K of the most recently received from synchronous -replication channel(s) "future" log blocks while inactive node is in the process -of actively fetching / syncing log and retry merging these log blocks if / when -log fetch "catches up" to the received log blocks sequence numbers. This -mechanism is intended to handle the case when log sync finishes / exits prior to -reaching the most recently received log block due to high RPC rate / meta server -load. - + replication channel(s) "future" log blocks while the inactive node is in the + process of actively fetching / syncing the log, and retry merging these log blocks + if / when log fetch "catches up" to the received log blocks sequence numbers. + This mechanism is intended to handle the case when log sync finishes / exits + prior to reaching the most recently received log block due to high RPC rate / + meta server load. 2. Meta server: re-schedule log sync on inactive node when log block sequence -exceeds last log sequence in order to handle the case when log sync stops before -the synchronous replication catches up in order to make inactive node state -synchronization more robust under high meta server load. - + exceeds last log sequence in order to handle the case when log sync stops + before the synchronous replication catches up in order to make inactive node + state synchronization more robust under high meta server load. 3. Meta server: fix object store delete queue cleanup with VR enabled on backups -by removing delayed queue processing logic that could prevent queue emptying on -the primary therefore never issuing queue reset RPC, instead use dumpster -cleanup timer to delay blocks removal. Parse object store tiers parameter and -create a bitmap with tiers in use bits set, then use the bitmap to validate file -create RPC, failing RPCs with tiers not no use. Discard object store block -deletes if tier is not in use instead of re-queueing block delete in order to -prevent invalid / stale blocks from staying in the delete queue indefinitely -therefore preventing emptying the delete queue on the backups potentially -resulting in unbounded queue growth.
- + by removing delayed queue processing logic that could prevent queue emptying + on the primary, therefore never issuing the queue reset RPC; instead use the dumpster + cleanup timer to delay blocks removal. Parse object store tiers parameter and + create a bitmap with tiers in use bits set, then use the bitmap to validate + file create RPC, failing RPCs with tiers not in use. Discard object store + block deletes if tier is not in use instead of re-queueing block delete in + order to prevent invalid / stale blocks from staying in the delete queue + indefinitely, therefore preventing emptying the delete queue on the backups, + potentially resulting in unbounded queue growth. 4. Meta server: fix extremely rare primary and backup state diversion with chunk -log in flight RPCs in the case where such RPCs are created while processing -chunk server "bye" (teardown) RPC for a different chunk server, for example, and -with chunk server bye RPC for such server pending in replay or transaction log -queue. - + log in flight RPCs in the case where such RPCs are created while processing + chunk server "bye" (teardown) RPC for a different chunk server, for example, + and with chunk server bye RPC for such server pending in replay or + transaction log queue. 5. Meta server: do not create / log extraneous chunk op in flight and chunk op -completion with object store blocks allocation. - + completion with object store blocks allocation. 6. Meta server: fix view stamped replication reconfiguration swap nodes -sub-command. The bug manifests it self as panic (fail stop) on all active nodes -at the time of the corresponding VR reconfiguration RPC commit, effectively -rendering file system non operational. - + sub-command. The bug manifests itself as panic (fail stop) on all active + nodes at the time of the corresponding VR reconfiguration RPC commit, + effectively rendering the file system non-operational. 7. Meta server: fix VR view change failure that results in panic in the case -when the node that started view change and was about to become primary preempted -(due to timing out or connectivity failure) by another node that has already -transitioned into primary state. - + when the node that started view change and was about to become primary is + preempted (due to timing out or connectivity failure) by another node that + has already transitioned into primary state. 8. Meta server: do not attempt to fetch data from other meta server nodes when -VR ID is not configured, and remove fetch state file, if exists, in such a case. -This change is intended to simplify initial VR configuration setup attempts by -handling operator errors more intuitively / gracefully. - + VR ID is not configured, and remove fetch state file, if exists, in such a + case. This change is intended to simplify initial VR configuration setup + attempts by handling operator errors more intuitively / gracefully. 9. Meta server: fix log writer instrumentation by not attempting to save the -instrumentation data into a file if file name set to an empty string. - + instrumentation data into a file if the file name is set to an empty string. 10. Meta server: fix VR status propagation from logger to main thread by -updating it on every change in order to get gid or spurious status ring debug -instrumentation trace messages. - + updating it on every change in order to get rid of spurious status ring debug + instrumentation trace messages. 11. Meta server: do not enter replication check if no chunks exist, i.e. if only -object store used. - + object store is used. 12. 
Chunk server: fix rare failure (panic) due to incorrect handling of IO -buffer manager suspend / granted logic in the client connection state machine -run by the client connections servicing thread. - + buffer manager suspend / granted logic in the client connection state machine + run by the client connections servicing thread. 13. Client library: retry replicated file get size if / when chunk server or -replica disappears. - + replica disappears. 14. IO library: fix SSL / TLS "filter" error handling with end of file / stream -close. The problem manifests itself with authentication enabled by excessive CPU -utilization due to continuous retries and socket / connection "leak" as, in -theory, connection might never get out of this state. - + close. The problem manifests itself with authentication enabled by excessive + CPU utilization due to continuous retries and socket / connection "leak" as, + in theory, the connection might never get out of this state. 15. Added support for org.apache.hadoop.fs.FileSystem.getScheme(). This method -is used by Spark NLP, and possibly other packages. + is used by Spark NLP, and possibly other packages (see the usage sketch + below).
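A minimal usage sketch for item 14's `getScheme()`; the `qfs://` URI, host, and port are illustrative assumptions about a deployment, and the sketch assumes the QFS Hadoop shim is on the classpath and registered for the "qfs" scheme.

```java
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;

// Resolves the file system for a QFS URI and prints its scheme.
public class SchemeCheck {
    public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.get(
            URI.create("qfs://metaserver.example.com:20000/"),
            new Configuration());
        System.out.println(fs.getScheme()); // expected: "qfs"
    }
}
```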
-## Minor improvements +### Minor improvements (2.2.5) 1. Meta server: simplify log chunk in flight RPC handling given that now the RPC -can no longer be en-queued after the first teardown attempt when meta chunk -server bye RPC for a given chunk server is en-queued. Implement backward -compatibility handling in replay code path only by adding a boolean to -distinguish chunk in flight RPC created with the new logic. - + can no longer be en-queued after the first teardown attempt when meta chunk + server bye RPC for a given chunk server is en-queued. Implement backward + compatibility handling in replay code path only by adding a boolean to + distinguish chunk in flight RPC created with the new logic. 2. Common library: change watchdog default poll interval to 1 second when max. -timeouts set to negative value in order to sample and report watchdog and poll -entries time overruns. - + timeouts is set to a negative value in order to sample and report watchdog and + poll entries time overruns. 3. Tools: implement stricter `qfsadmin` command line parameters validation. - 4. Implement meta server transaction log truncation / roll back script intended -to be used for debugging and recovery. - + to be used for debugging and recovery. 5. Add recovery and hitless upgrade sections to QFS Administrator's Guide. - 6. Build Hadoop shim for all stable Hadoop releases, including the 3.x code line. ## QFS version 2.2.4 -## Bug fixes +### Bug fixes (2.2.4) 1. Meta server: fix condition reversal in rename RPC WORM mode specific handling logic resulting in treating files with .tmp suffix as files with no such suffix @@ -165,17 +172,15 @@ to be used for debugging and recovery. ## QFS version 2.2.3 -## New features +### New features (2.2.3) -1. Chunk server node ID support. Node ID can be configured on [chunk - server](https://github.com/quantcast/qfs/blob/7644e583e40ae69851067f53637e1f1381892690/conf/ChunkServer.prp#L84) - and [QFS - client](https://github.com/quantcast/qfs/blob/7644e583e40ae69851067f53637e1f1381892690/conf/QfsClient.prp#L169). +1. Chunk server node ID support. Node ID can be configured on [chunk server](https://github.com/quantcast/qfs/blob/7644e583e40ae69851067f53637e1f1381892690/conf/ChunkServer.prp#L84) + and [QFS client](https://github.com/quantcast/qfs/blob/7644e583e40ae69851067f53637e1f1381892690/conf/QfsClient.prp#L169). Node ID is intended to be used instead of IP/port for determining if the chunk server and QFS client are co-located on the same network node, and whether to use the chunk server to serve client requests. -## Bug fixes +### Bug fixes (2.2.3) 1. Meta server: Change view stamped replication state machine to ignore start view change if node's current state is primary, but no log replication @@ -193,7 +198,7 @@ to be used for debugging and recovery. ## QFS version 2.2.2 -## Bug fixes +### Bug fixes (2.2.2) 1. Meta server: fix rare intermittent incorrect extra replicas removal that could be triggered by re-replication and re-balancing. @@ -206,7 +211,7 @@ to be used for debugging and recovery. ## QFS version 2.2.1 -## Bug fixes +### Bug fixes (2.2.1) 1. Fix open socket double accounting in TCP socket accept. The bug manifests itself as chunk server IO failures because the counter is used to limit @@ -214,13 +219,13 @@ to be used for debugging and recovery. ## QFS version 2.2.0 -## New features +### New features (2.2.0) 1. Symbolic links support. In this release QFS client library and Hadoop shim do not support cross file systems symbolic links. -## Bug fixes +### Bug fixes (2.2.0) 1. QFS-350 bug fix. Ensure that Hadoop configuration parameter fs.qfs.createParams value if @@ -230,7 +235,7 @@ to be used for debugging and recovery. ## QFS version 2.1.3 -## Bug fixes +### Bug fixes (2.1.3) 1. Fix DNS resolver's number of open socket accounting bug. This bug might manifest itself in at least one non obvious way: chunk server might @@ -239,106 +244,88 @@ to be used for debugging and recovery. ## QFS version 2.1.2 -## New features +### New features (2.1.2) Watchdog thread polls meta and / or chunk server threads and aborts the process, when configured to do so, in case one or more threads appear not to be making progress due to likely server and / or OS malfunction. -## Bug fixes +### Bug fixes (2.1.2) 1. Fix hex integer parser return code in the case when input length is 0. - 2. Chunk server: fix theoretically possible null pointer de-reference, and access after free in the record appender and meta server state machines error handling code paths. - 3. Turn off TLS 1.3 with openssl 1.1.1 by default for PSK only SSL contexts as PSK does not appear to work with it, even though openssl documentation suggests that TLS 1.2 callbacks are intended to work with 1.3. - 4. Meta server: validate chunk server hello rack id, emit an error message in the case if the rack id is outside of the supported range, and mark the rack id as undefined in order to treat it as such consistently everywhere including meta server web UI. Annotated configuration files: add valid rack ID range definition, and describe handling of rack ids outside of valid range. - 5. Meta server: implement debug instrumentation that stores pre-configured number of committed RPC status codes, and writes this information into trace and optionally separate file log when transitioning out of primary VR state. - 6. Meta server: change user and group DB load / update to allow assigning multiple names to the same numeric group ID. - 7. Meta server: fix chunk server RPC transmit after re-authentication. - 8. 
Java build: automatically determine if the lowest release supported by the java compiler is higher than 1.6 and use this release to build the QFS java shim. - 9. Tools: do not follow symbolic links on the local file system in the case of recursive traversal and / or fetching directory entries attributes in order to make qfs tool behavior more similar to the relevant Unix commands. - 10. Client library: fix condition reversal in the chunk lease renew RPC. - 11. Update QFS build system to work with newer versions of external / system libraries and tools. ## QFS version 2.1.1 -## Bug fixes +### Bug fixes (2.1.1) 1. Fix backward compatibility with chunk server 1.x releases by correctly handling the case where a 1.x chunk server is a replication data source for a 2.x chunk server. The problem can only occur with a mix of 2.x and 1.x chunk server versions. - 2. Omit linux release patch version from the tar file name in order to make links to Travis builds consistent. - 3. Update wiki in qfs.git from qfs.wiki.git. ## QFS version 2.1.0 -## New features +### New features (2.1.0) 1. Non blocking DNS resolver. Resolver implementation at - is used by default. It is possible to configure - QFS to use OS DNS resolver. DNS related configuration options are described - in the annotated [configuration - files](https://github.com/quantcast/qfs/tree/master/conf) - Non blocking DNS resolver allows higher IO concurrency with S3 \[compatible\] - object store. - + [https://github.com/wahern/dns](https://github.com/wahern/dns) is used by + default. It is possible to configure QFS to use the OS DNS resolver. DNS related + configuration options are described in the annotated [configuration + files](https://github.com/quantcast/qfs/tree/master/conf). Non blocking DNS + resolver allows higher IO concurrency with S3 \[compatible\] object store. 2. Basic DNS query result cache. The cache is on by default only for S3 object - store. The default cache timeout is 1 second. The cache is intended to improve - S3 object store IO performance and reduce DNS servers load. + store. The default cache timeout is 1 second. The cache is intended to + improve S3 object store IO performance and reduce DNS servers load. -## Bug fixes +### Bug fixes (2.1.0) 1. Fixed homebrew osxfuse build. - 2. Fixed client authentication in the case when the meta server is configured with no "client" threads. - 3. Fixed file system URL parsing in QFS tool. -## Upgrade from 2.0. release +### Upgrade from 2.0 release The 2.0.1 release is backward and forward compatible with the 2.0 release. ## QFS version 2.0 -## New features +### New features (2.0) 1. Meta server replication (VR) is the major new feature in this release. Meta server replication provides automatic meta server fail over. With meta server replication configured QFS does not have a single point of failure. - 2. Create exclusive, make directory, remove, remove directory, rename operations are guaranteed to be applied only once in the presence of communication errors. (Idempotent RPCs). - 3. Partial chunk server inventory synchronization. The goal is to reduce chunk server re-connect time to the meta server with a large chunk inventory. @@ -348,82 +335,63 @@ The 2.0.1 release is backward and forward compatible with 2.0 release. For example, for a file system with 2 billion chunks, full inventory synchronization requires around 30 minutes, while partial inventory synchronization typically can complete in a few seconds. - 4. 
"Off line" fsck can optionally emit the same report "on line" fsck, as chunk inventory stored in file system meta data (checkpoint, and transaction logs). -## Notable changes, and bug fixes +### Notable changes, and bug fixes 1. New version of QFS protocol with more compact RPC representation to minimize network overheads, and CPU utilization. - 2. QFS client read and write support larger than 2GB buffers. - 3. QFS client write pipelining fixed. Now write path with adequately large write behind is no longer latency bound. - 4. Fixed RS (striped) sparse files recovery. - 5. Updated GF complete library version, now includes run time CPU vector features detection, and ARM NEON vector instructions support. - 6. Fixed sporadic file descriptor close in meta server checkpoint write in case when lock file was configured / used. - 7. Fixed bug in S3 block delete state machine that appear in the case when more one upload ID returned by S3. - 8. Fixed re-authentication bug in chunk, meta servers, and client library, that resulted connection stalls / timeouts. - 9. Changed file delete, by always moving files to dumpster first, then deleting file after one lease interval after the most recent read of write lease relinquish or expiration. - 10. File or directory creation in the dumpster directory is not permitted. Only permit super user to move file out of dumpster in order to prevent its deletion. - 11. File delete scheduled at lower / background priority, avoid delete "bursts" in order to maintain low RPCs service latency. - 12. Chunk delete queue used with file truncate to reduce chunk delete bursts with large files, by scheduling chunk deletes at lower than client requests priority. - 13. More compact checkpoint and transaction log format, in order to reduce disk, network bandwidth, and CPU utilization. - 14. Added S3 option that allows to turn off upload ID querying prior to S3 block delete, in order to allow to use potentially more cost effective external to QFS process of finding and removing possible stale multi part uploads. - 15. Meta server crypto keys are now always stored in checkpoint and transaction log. Configuration parameter metaServer.cryptoKeys.keysFileName is deprecated. - 16. WROM mode configuration parameter `metaServer.wormMode` is deprecated, and has no effect. WORM mode is now stored in checkpoint and transaction logs. `logcompactor` has an option to set worm mode when converting checkpoint and transaction log into new format. `qfsadmin` or `qfstoggleworm` can be used to change WORM mode. - 17. Retry QFS client directory listing in the case of parse errors in order to handle possible network errors. - 18. Fix integer overflow in IO buffer pool with pool size equal or greater 4GB. The problem affects both chunk and meta servers. However, typically, only chunk server, if configured with IO buffer pool larger than 4GB, and S3, might use enough buffers for the problem to occur. - 19. Implemented files and directories access time update. By default access time update turned off. For details please see the following parameters description in meta server annotated configuration file: metaServer.ATimeUpdateResolution and metaServer.dirATimeUpdateResolution. -## Upgrade from prior releases +### Upgrade from prior releases Meta server checkpoint and transaction log segments must be converted to new format. `logcompactor` can be used to convert file system meta data. Please -consult [[Administrator's-Guide]] for details. +consult [[Administrator's Guide]] for details. 
-## Upgrade from prior releases +### Upgrade from prior releases Meta server checkpoint and transaction log segments must be converted to the new format. `logcompactor` can be used to convert file system meta data. Please -consult [[Administrator's-Guide]] for details. +consult [[Administrator's Guide]] for details. diff --git a/wiki/images/QFS-Kerberos-Security-Design/Write-Path-with-Replication.png b/wiki/images/QFS-Kerberos-Security-Design/Write-Path-with-Replication.png new file mode 100644 index 000000000..db027d0b1 Binary files /dev/null and b/wiki/images/QFS-Kerberos-Security-Design/Write-Path-with-Replication.png differ