Make SentencePiece support optional behind MLC_ENABLE_SENTENCEPIECE_TOKENIZER.

The option now gates the SentencePiece source file, include path, compile
definition, subdirectory and link dependency of tokenizers_cpp. The example
script forwards its first argument to the option, and the example program
guards its SentencePiece code with the matching preprocessor definition.

diff --git a/.gitignore b/.gitignore
index 4fc1545..20a7101 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,6 +27,7 @@
 *.lib
 
 build
+release
 
 # Executables
 *.exe
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ab3d6ca..e3dc5e7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -109,6 +109,7 @@ option(MSGPACK_USE_BOOST "Use Boost libraried" OFF)
 add_subdirectory(msgpack)
 
 option(MLC_ENABLE_SENTENCEPIECE_TOKENIZER "Enable SentencePiece tokenizer" ON)
+message(DEBUG "MLC_ENABLE_SENTENCEPIECE_TOKENIZER= ${MLC_ENABLE_SENTENCEPIECE_TOKENIZER}")
 
 if(MSVC)
   set(TOKENIZERS_RUST_LIB "${TOKENIZERS_CPP_CARGO_BINARY_DIR}/tokenizers_c.lib")
@@ -153,16 +154,25 @@ add_custom_command(
 
 set(
   TOKENIZER_CPP_SRCS
-  src/sentencepiece_tokenizer.cc
   src/huggingface_tokenizer.cc
   src/rwkv_world_tokenizer.cc
 )
+# SentencePiece support is optional: its wrapper source is only compiled when
+# MLC_ENABLE_SENTENCEPIECE_TOKENIZER is enabled.
+if (MLC_ENABLE_SENTENCEPIECE_TOKENIZER)
+  list(APPEND TOKENIZER_CPP_SRCS src/sentencepiece_tokenizer.cc)
+endif()
+
 add_library(tokenizers_cpp STATIC ${TOKENIZER_CPP_SRCS})
-target_include_directories(tokenizers_cpp PRIVATE sentencepiece/src)
+
+if (MLC_ENABLE_SENTENCEPIECE_TOKENIZER)
+  target_include_directories(tokenizers_cpp PRIVATE sentencepiece/src)
+endif()
 target_include_directories(tokenizers_cpp PRIVATE msgpack/include)
 target_include_directories(tokenizers_cpp PUBLIC ${TOKENIZERS_CPP_INCLUDE})
-if (MLC_ENABLE_SENTENCEPIECE_TOKENIZER STREQUAL "ON")
+
+if (MLC_ENABLE_SENTENCEPIECE_TOKENIZER)
   target_compile_definitions(tokenizers_cpp PUBLIC MLC_ENABLE_SENTENCEPIECE_TOKENIZER)
 endif ()
 target_link_libraries(tokenizers_cpp PRIVATE msgpack-cxx)
 
@@ -178,10 +188,18 @@ if (CMAKE_SYSTEM_NAME STREQUAL "iOS")
       XCODE_ATTRIBUTE_${XCODE_PROPERTY} ${XCODE_VALUE})
   endmacro (set_xcode_property)
 endif()
-add_subdirectory(sentencepiece sentencepiece EXCLUDE_FROM_ALL)
+if (MLC_ENABLE_SENTENCEPIECE_TOKENIZER)
+  add_subdirectory(sentencepiece sentencepiece EXCLUDE_FROM_ALL)
+endif()
 
 add_library(tokenizers_c INTERFACE ${TOKENIZERS_RUST_LIB})
 target_link_libraries(tokenizers_c INTERFACE ${TOKENIZERS_RUST_LIB} ${TOKENIZERS_C_LINK_LIBS})
 
-target_link_libraries(tokenizers_cpp PRIVATE tokenizers_c sentencepiece-static ${TOKENIZERS_CPP_LINK_LIBS})
+# Start from an empty value so that an SPLIB inherited from a parent scope or
+# the cache is never linked when SentencePiece is disabled.
+set(SPLIB "")
+if (MLC_ENABLE_SENTENCEPIECE_TOKENIZER)
+  set(SPLIB "sentencepiece-static")
+endif()
+target_link_libraries(tokenizers_cpp PRIVATE tokenizers_c ${SPLIB} ${TOKENIZERS_CPP_LINK_LIBS})
 target_include_directories(tokenizers_cpp PUBLIC ${TOKENIZERS_CPP_INCLUDE})
diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt
index faf2d75..91d73de 100644
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -24,4 +24,7 @@ target_include_directories(example PRIVATE ${TOKENZIER_CPP_PATH}/include)
 
 # You can link tokenizers_cpp, it will automatically link tokenizers_c
 # and sentencepiece libary
-target_link_libraries(example PRIVATE tokenizers_cpp)
+# Threads::Threads is linked explicitly, so look it up on every toolchain
+# (MSVC included); the imported target only exists after find_package(Threads).
+find_package(Threads REQUIRED)
+target_link_libraries(example PRIVATE tokenizers_cpp Threads::Threads)
diff --git a/example/build_and_run.sh b/example/build_and_run.sh
index 19e426d..494e805 100755
--- a/example/build_and_run.sh
+++ b/example/build_and_run.sh
@@ -1,10 +1,20 @@
-#/bin/bash
-
+#!/bin/bash
+echo "Usage: $0 (ENABLE_SENTENCEPIECE_TOKENIZER default value 1=ON)"
 # build
 mkdir -p build
 cd build
-cmake ..
-make -j8
+echo
+echo "cmake ..."
+echo "CXX=$CXX"
+g++ --version
+
+# Optional first argument is forwarded to MLC_ENABLE_SENTENCEPIECE_TOKENIZER (default ON).
+ENABLESP=${1:-ON}
+
+cmake .. "-DMLC_ENABLE_SENTENCEPIECE_TOKENIZER=${ENABLESP}" || exit 1
+echo
+echo "make..."
+make -j8 || exit 1
 cd ..
 
 # get example files
@@ -26,8 +36,9 @@ fi
 if [ ! -f "merges.txt" ]; then
     wget https://huggingface.co/Qwen/Qwen2.5-3B-Instruct/resolve/main/merges.txt
 fi
+
 cd ..
 
 # run
 echo "---Running example----"
-./build/example
+./build/example || exit 1
diff --git a/example/example.cc b/example/example.cc
index 274b49d..0172abf 100644
--- a/example/example.cc
+++ b/example/example.cc
@@ -9,6 +9,7 @@
 using tokenizers::Tokenizer;
 
 std::string LoadBytesFromFile(const std::string& path) {
+  std::cout << "Loading " << path << std::endl;
   std::ifstream fs(path, std::ios::in | std::ios::binary);
   if (fs.fail()) {
     std::cerr << "Cannot open " << path << std::endl;
@@ -60,6 +61,7 @@ void TestTokenizer(std::unique_ptr tok, bool print_vocab = false,
   std::cout << std::endl;
 }
 
+#ifdef MLC_ENABLE_SENTENCEPIECE_TOKENIZER
 // Sentencepiece tokenizer
 // - dist/tokenizer.model
 void SentencePieceTokenizerExample() {
@@ -80,6 +82,7 @@ void SentencePieceTokenizerExample() {
 
   TestTokenizer(std::move(tok), false, true);
 }
+#endif
 
 // HF tokenizer
 // - dist/tokenizer.json
@@ -141,7 +144,9 @@ void RWKVWorldTokenizerExample() {
 }
 
 int main(int argc, char* argv[]) {
+#ifdef MLC_ENABLE_SENTENCEPIECE_TOKENIZER
   SentencePieceTokenizerExample();
+#endif  // MLC_ENABLE_SENTENCEPIECE_TOKENIZER
   HuggingFaceTokenizerExample();
   HuggingFaceBPETokenizerExample();
   RWKVWorldTokenizerExample();