From d66a8c04f765011448fa79d3e65fe038421315d3 Mon Sep 17 00:00:00 2001 From: serdyuk Date: Fri, 25 Sep 2015 18:05:27 -0400 Subject: [PATCH 1/4] Use distutills --- Makefile | 2 +- .../__init__.py} | 11 ++++---- kaldi-python/{ => kaldi_io}/Makefile | 0 .../{kaldi_io.py => kaldi_io/__init__.py} | 12 +++++---- kaldi-python/{ => kaldi_io}/bp_converters.h | 0 .../{ => kaldi_io}/kaldi_io_internal.cpp | 0 kaldi-python/{ => kaldi_io}/python_wrappers.h | 0 setup.py | 26 +++++++++++++++++++ 8 files changed, 39 insertions(+), 12 deletions(-) rename kaldi-python/{kaldi_argparse.py => kaldi_argparse/__init__.py} (96%) rename kaldi-python/{ => kaldi_io}/Makefile (100%) rename kaldi-python/{kaldi_io.py => kaldi_io/__init__.py} (99%) rename kaldi-python/{ => kaldi_io}/bp_converters.h (100%) rename kaldi-python/{ => kaldi_io}/kaldi_io_internal.cpp (100%) rename kaldi-python/{ => kaldi_io}/python_wrappers.h (100%) create mode 100755 setup.py diff --git a/Makefile b/Makefile index 20c531d..a5703fb 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -SRCDIR = kaldi-python +SRCDIR = kaldi-python/kaldi_io ifndef KALDI_ROOT $(error please set KALDI_ROOT to point ot the base of the kaldi installation) diff --git a/kaldi-python/kaldi_argparse.py b/kaldi-python/kaldi_argparse/__init__.py similarity index 96% rename from kaldi-python/kaldi_argparse.py rename to kaldi-python/kaldi_argparse/__init__.py index a771734..b7f7ad4 100644 --- a/kaldi-python/kaldi_argparse.py +++ b/kaldi-python/kaldi_argparse/__init__.py @@ -1,14 +1,13 @@ -''' +""" Created on Aug 14, 2014 @author: chorows -''' +""" import os import sys import argparse -#import __main__ class AddConfig(argparse.Action): def __init__(self, *args, **kwargs): @@ -17,7 +16,8 @@ def __init__(self, *args, **kwargs): def __call__(self, parser, namespace, values, option_string=None): with open(values,'r') as f: opts = [l.split('#')[0].strip() for l in f] - parser.parse_args(args=opts, namespace=namespace) + parser.parse_args(args=opts, 
namespace=namespace) + class KaldiArgumentParser(argparse.ArgumentParser): def __init__(self, *args, **kwargs): @@ -44,8 +44,7 @@ def add_standard_arguments(self): grp.add_argument('--print-args', type=bool, default=True, help='Print the command line arguments (to stderr)') #grp.add_argument('--config', action=AddConfig, help='Configuration file with options') grp.add_argument('--config', default=argparse.SUPPRESS, help='Configuration file with options') - - + def parse_known_args(self, args=None, namespace=None): if args is None: args = sys.argv[1:] diff --git a/kaldi-python/Makefile b/kaldi-python/kaldi_io/Makefile similarity index 100% rename from kaldi-python/Makefile rename to kaldi-python/kaldi_io/Makefile diff --git a/kaldi-python/kaldi_io.py b/kaldi-python/kaldi_io/__init__.py similarity index 99% rename from kaldi-python/kaldi_io.py rename to kaldi-python/kaldi_io/__init__.py index 4ce731b..744145c 100644 --- a/kaldi-python/kaldi_io.py +++ b/kaldi-python/kaldi_io/__init__.py @@ -1,4 +1,4 @@ -'''Python Wrappers for Kaldi table IO (:kaldi:`io.html`) +"""Python Wrappers for Kaldi table IO (:kaldi:`io.html`) In Kaldi the archive does not carry information about its contents and the user is required to use the proper Reader or Writer. This module follows this approach and provides wrappers for @@ -232,13 +232,11 @@ | | | | | +--------------------+---------------------+-----------------------+-----------------------+ -''' -''' Created on Jul 31, 2014 @author: chorows -''' +""" import numpy as np @@ -266,6 +264,7 @@ SequentialBaseFloatVectorReader = SequentialFloat32VectorReader BaseFloatVectorWriter = Float32VectorWriter + def get_io_for_dtype(access, dtype, element=''): ''' Get a writer or reader for the given dtype. 
eg: @@ -280,7 +279,8 @@ def get_io_for_dtype(access, dtype, element=''): 'float32':'Float32', 'float64':'Float64'} dtype = dtypemap[dtype] - return globals()[access + dtype + element] + return globals()[access + dtype + element] + class _Transformed(object): def __init__(self, reader, transform_function, **kwargs): @@ -290,6 +290,7 @@ def __init__(self, reader, transform_function, **kwargs): def __getattr__(self, attr): return getattr(self.reader,attr) + class TransRA(_Transformed): def __init__(self, *args, **kwargs): @@ -300,6 +301,7 @@ def value(self, key): def __getitem__(self, key): return self.value(key) + class TransSeq(_Transformed): def __init__(self, *args, **kwargs): diff --git a/kaldi-python/bp_converters.h b/kaldi-python/kaldi_io/bp_converters.h similarity index 100% rename from kaldi-python/bp_converters.h rename to kaldi-python/kaldi_io/bp_converters.h diff --git a/kaldi-python/kaldi_io_internal.cpp b/kaldi-python/kaldi_io/kaldi_io_internal.cpp similarity index 100% rename from kaldi-python/kaldi_io_internal.cpp rename to kaldi-python/kaldi_io/kaldi_io_internal.cpp diff --git a/kaldi-python/python_wrappers.h b/kaldi-python/kaldi_io/python_wrappers.h similarity index 100% rename from kaldi-python/python_wrappers.h rename to kaldi-python/kaldi_io/python_wrappers.h diff --git a/setup.py b/setup.py new file mode 100755 index 0000000..16272a5 --- /dev/null +++ b/setup.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python + +import os +from distutils.core import setup +from distutils.command.build_py import build_py + +class Make(build_py): + def run(self): + os.system("make") + build_py.run(self) + +setup(name='kaldi-python', + version='1.0', + description='Python interface for kaldi iterators', + author='Jan Chorowski', + url='https://github.com/janchorowski/kaldi-python', + cmdclass={'build_py': Make}, + packages=['kaldi_io', 'kaldi_argparse'], + package_dir={'kaldi_io': 'kaldi-python/kaldi_io', + 'kaldi_argparse': 'kaldi-python/kaldi_argparse'}, + 
package_data={'kaldi_io': ['kaldi_io_internal.so']}, + scripts=['scripts/apply-global-cmvn.py', + 'scripts/compute-global-cmvn-stats.py', + 'scripts/copy-feats-padded.py', + 'scripts/show-wav-ali.py'], + requires=['numpy']) \ No newline at end of file From ebb6c7186762445de6cfe163998738f72b847e9c Mon Sep 17 00:00:00 2001 From: serdyuk Date: Fri, 25 Sep 2015 18:12:49 -0400 Subject: [PATCH 2/4] Add installation section to readme --- README => README.md | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) rename README => README.md (85%) diff --git a/README b/README.md similarity index 85% rename from README rename to README.md index faa4e01..16c1976 100644 --- a/README +++ b/README.md @@ -1,7 +1,16 @@ -This is a set of Python wrappers for Kaldi input-output classes. +# This is a set of Python wrappers for Kaldi input-output classes. +## Installation + +Simply run +``` +./setup.py install +``` + +## Usage It allows you to do e.g.: +``` In [1]: import kaldi_io In [2]: feat_reader = kaldi_io.SequentialBaseFloatMatrixReader('scp:./mfcc/raw_mfcc_test.1.scp') In [3]: next(feat_reader) @@ -20,4 +29,4 @@ It allows you to do e.g.: 2.52763462] [ 38.64388275 -29.08744812 -9.59657097 ..., -1.66973591 -0.54327661 9.77887821]]) - +``` From c34b15692ff35ff657af3ea73e5328c8e16649cf Mon Sep 17 00:00:00 2001 From: serdyuk Date: Sat, 26 Sep 2015 17:52:33 -0400 Subject: [PATCH 3/4] Move modules to root dir --- Makefile | 2 +- kaldi-python/kaldi_argparse/__init__.py | 77 ---- kaldi-python/kaldi_io/Makefile | 41 -- kaldi-python/kaldi_io/__init__.py | 315 ------------- kaldi-python/kaldi_io/bp_converters.h | 175 -------- kaldi-python/kaldi_io/kaldi_io_internal.cpp | 474 -------------------- kaldi-python/kaldi_io/python_wrappers.h | 126 ------ setup.py | 4 +- 8 files changed, 2 insertions(+), 1212 deletions(-) delete mode 100644 kaldi-python/kaldi_argparse/__init__.py delete mode 100644 kaldi-python/kaldi_io/Makefile delete mode 100644 kaldi-python/kaldi_io/__init__.py delete 
mode 100644 kaldi-python/kaldi_io/bp_converters.h delete mode 100644 kaldi-python/kaldi_io/kaldi_io_internal.cpp delete mode 100644 kaldi-python/kaldi_io/python_wrappers.h diff --git a/Makefile b/Makefile index a5703fb..a80ec98 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -SRCDIR = kaldi-python/kaldi_io +SRCDIR = kaldi_io ifndef KALDI_ROOT $(error please set KALDI_ROOT to point ot the base of the kaldi installation) diff --git a/kaldi-python/kaldi_argparse/__init__.py b/kaldi-python/kaldi_argparse/__init__.py deleted file mode 100644 index b7f7ad4..0000000 --- a/kaldi-python/kaldi_argparse/__init__.py +++ /dev/null @@ -1,77 +0,0 @@ -""" -Created on Aug 14, 2014 - -@author: chorows -""" - -import os -import sys -import argparse - - -class AddConfig(argparse.Action): - def __init__(self, *args, **kwargs): - argparse.Action.__init__(self, *args, **kwargs) - - def __call__(self, parser, namespace, values, option_string=None): - with open(values,'r') as f: - opts = [l.split('#')[0].strip() for l in f] - parser.parse_args(args=opts, namespace=namespace) - - -class KaldiArgumentParser(argparse.ArgumentParser): - def __init__(self, *args, **kwargs): - kwargs['add_help']=False - #kwargs['fromfile_prefix_chars']='--config=' - version = kwargs.pop('version', None) - super(KaldiArgumentParser, self).__init__(*args, formatter_class=argparse.ArgumentDefaultsHelpFormatter, **kwargs) - self.version = version - - def add_standard_arguments(self): - grp = self.add_argument_group('Standard options') - - default_prefix = '-' - grp.add_argument( - default_prefix+'h', default_prefix*2+'help', - action='help', default=argparse.SUPPRESS, - help=argparse._('show this help message and exit')) - if self.version: - grp.add_argument( - default_prefix+'v', default_prefix*2+'version', - action='version', default=argparse.SUPPRESS, - version=self.version, - help=argparse._("show program's version number and exit")) - grp.add_argument('--print-args', type=bool, default=True, help='Print 
the command line arguments (to stderr)') - #grp.add_argument('--config', action=AddConfig, help='Configuration file with options') - grp.add_argument('--config', default=argparse.SUPPRESS, help='Configuration file with options') - - def parse_known_args(self, args=None, namespace=None): - if args is None: - args = sys.argv[1:] - expanded_args = [] - - next_arg_is_conf = False - conf_file = None - - for arg in args: - if arg.startswith('--config') or next_arg_is_conf: - if next_arg_is_conf: - conf_file = arg - elif arg.startswith('--config='): - conf_file = arg[9:].strip() #eat --config= - else: - next_arg_is_conf = True - if conf_file: - with open(conf_file,'r') as f: - expanded_args.extend(l.split('#')[0].strip() for l in f) - next_arg_is_conf = False - conf_file = None - else: - expanded_args.append(arg) - return argparse.ArgumentParser.parse_known_args(self, args=expanded_args, namespace=namespace) - - def parse_args(self, args=None, namespace=None): - args = argparse.ArgumentParser.parse_args(self, args=args, namespace=namespace) - if args.print_args: - print >>sys.stderr, os.path.basename(sys.argv[0]), " ".join(sys.argv[1:]) - return args diff --git a/kaldi-python/kaldi_io/Makefile b/kaldi-python/kaldi_io/Makefile deleted file mode 100644 index 3cb127b..0000000 --- a/kaldi-python/kaldi_io/Makefile +++ /dev/null @@ -1,41 +0,0 @@ - -all: -EXTRA_CXXFLAGS = -Wno-sign-compare - -KALDI_SRC = $(KALDI_ROOT)/src - -include $(KALDI_SRC)/kaldi.mk - -BINFILES = - - -OBJFILES = - - -ADDLIBS = $(KALDI_SRC)/lm/kaldi-lm.a $(KALDI_SRC)/decoder/kaldi-decoder.a $(KALDI_SRC)/lat/kaldi-lat.a \ - $(KALDI_SRC)/hmm/kaldi-hmm.a $(KALDI_SRC)/transform/kaldi-transform.a $(KALDI_SRC)/gmm/kaldi-gmm.a \ - $(KALDI_SRC)/tree/kaldi-tree.a $(KALDI_SRC)/matrix/kaldi-matrix.a $(KALDI_SRC)/util/kaldi-util.a \ - $(KALDI_SRC)/base/kaldi-base.a $(KALDI_SRC)/thread/kaldi-thread.a - -TESTFILES = - -PYLIB = $(shell python-config --libs) -PYINC = $(shell python-config --includes) -NPINC = -I$(shell 
python -c 'import numpy; print numpy.get_include()') - -PYLIBS = kaldi_io_internal.so - -#include $(KALDI_SRC)/makefiles/default_rules.mk - -%.so: %.cpp - g++ -shared -o $@ -Wall -fPIC -I$(KALDI_SRC) $(PYINC) $(NPINC) $(CXXFLAGS) $< $(ADDLIBS) $(LDFLAGS) -L$(PYLIB) $(LOADLIBES) $(LDLIBS) -lpython2.7 -lboost_python -lboost_system - -clean: - -rm -f *.o *.a *.so $(TESTFILES) $(BINFILES) $(TESTOUTPUTS) tmp* *.tmp - -depend: - -$(CXX) -I$(KALDI_SRC) $(PYINC) $(NPINC) -M $(CXXFLAGS) *.cpp > .depend.mk - -test: - -all: $(PYLIBS) diff --git a/kaldi-python/kaldi_io/__init__.py b/kaldi-python/kaldi_io/__init__.py deleted file mode 100644 index 744145c..0000000 --- a/kaldi-python/kaldi_io/__init__.py +++ /dev/null @@ -1,315 +0,0 @@ -"""Python Wrappers for Kaldi table IO (:kaldi:`io.html`) - -In Kaldi the archive does not carry information about its contents and the user is required to -use the proper Reader or Writer. This module follows this approach and provides wrappers for -RandomAccess and Sequential readers, and for the Writers. The classes are instantiated for -each Kaldi type. - -Internally, the wrappers define holders (:kaldi:`io.html#io_sec_holders`) for python types -and instantiates the regular Kaldi templates. In this way, the wrappers are 100% compatible with -Kaldi and support using pipes and subprograms for inputs and outputs. - -The Python readers and writers implement the context api, and are fully usable with the Python -`with` construct. - -Examples: - A matrix to text converter: - - .. code-block:: python - - with kaldi_io.SequentialBaseFloatMatrixReader('ark:mat.ark') as reader: - for name,mat in reader: - print name, mat - - A simple vector generator: - - .. 
code-block:: python - - with kaldi_io.Int32VectorWriter('ark:| gzip -c vec.ark.gz') as w: - for len in xrange(10): - vec = [len] * len - w['vec_%d' %(len,)] = vec - -Kaldi Reader classes -==================== - -Kaldi provides two types of reader: the Sequential reader which is akin to an iterator and the -Random Access reader which is akin to a dict. Both work with piped data, thus the random access -readers may be required to read and store objects in memory until the proper one is found. More -information is in :kaldi:`io.html#io_sec_bloat`. - -Kaldi programs typically open one Sequential reader (e.g. for the features) and several RandomAccess -readers. For each feature, the random access readers would be used to fetch auxiliary information, while -ensuring that they pertain to the same utterance. This resemples a merge-sort merge phase and works well -if all the files are properly sorted. Citing :kaldi:`data_prep.html#data_prep_data_yourself`: - -.. note:: - - All of these files should be sorted. If they are not sorted, you will get errors when you run the scripts. In The Table concept we explain why this is needed. It has to do with the I/O framework; the ultimate reason for the sorting is to enable something equivalent to random-access lookup on a stream that doesn't support fseek(), such as a piped command. Many Kaldi programs are reading multiple pipes from other Kaldi commands, reading different types of object, and are doing something roughly comparable to merge-sort on the different inputs; merge-sort, of course, requires that the inputs be sorted. Be careful when you sort that you have the shell variable LC_ALL defined as "C", for example (in bash), - - export LC_ALL=C - - If you don't do this, the files will be sorted in an order that's different from how C++ sorts strings, and Kaldi will crash. You have been warned! - -.. py:class:: DataTypeSequentialReader(rx_specifier) - - The SequentialReader mostly ressembles a Python iterator. 
Therefore it implements the - Iterator protocol: - - .. py:method:: __iter__() - - Returns self - - .. py:method:: next() - - :return: a tuple of: - - * key (string) - * value (type is determined by the reader class) - - Moreover it provides a method to check whether the iterator is empty: - - .. py:method:: done() - - Returns `True` if the iterator is empty - - Kaldi uses a slightly different iteration protocol, which can be accessed using the functions: - - .. py:method:: _kaldi_next() - - Advance the iterator by one value - - .. py:method:: _kaldi_key() - - Returns the key of the cirrent value - - .. py:method:: _kaldi_value() - - Returns the current value (i.e. the value that will be returned on the next call - to :func:`next`) - - For resource management the classes implement: - - .. py:method:: close() - - Closes the reader. - - .. py:method:: is_open() - - Returns `True` is the reader is opened and can be read from - - .. py:method:: __enter__() - .. py:method:: __exit__() - - Implement the `with` context protocol - - -.. py:class:: DataTypeRandomAccessReader(rx_specifier) - - The random access ressembles a Python dict - values are retrieved for a given key value. - Therefore the rader acts in a dict-like manner: - - .. py:method:: __contains__(key) - .. py:method:: has_key(key) - - Returns `True` if key is present in reader. Enabvles the use of the `in` operator. - - .. py:method:: __getitem__(key) - .. py:method:: value(key) - - Returns the value associeted with key - - For resource management the classes implement: - - .. py:method:: close() - - Closes the reader. - - .. py:method:: is_open() - - Returns `True` is the reader is opened and can be read from - - .. py:method:: __enter__() - .. py:method:: __exit__() - - Implement the `with` context protocol - -.. py:class: DataTypeRandomAccessReaderMapped(data_rx_specifier, maping_rx_specifier) - This class implement a random access reader whose keys have been mapped using the mapper. 
- See :kaldi:`io.html#io_sec_mapped` for more explanation - -Kaldi Writer class -================== - -Th writer stores key-value pairs and thus ressembles a dict. However, unlike a dict -no checks for key duplication are made. The writer will happily store all values using -the same key, which may render them unusable. For best cooperation with KAldi, the keys -should be written sorted in the `C order`. - - -.. py:class:: DataTypeWriter(wx_specifier) - .. py:method:: write(key, value) - .. py:method:: __setitem__(key,value) - - Append to the file the value under key - - .. py:method:: flush() - - Flush the output stream. - - For resource management the classes implement: - - .. py:method:: close() - - Closes the writer. - - .. py:method:: is_open() - - Returns `True` is the writer is opened and can be written to - - .. py:method:: __enter__() - .. py:method:: __exit__() - - Implement the `with` context protocol - -Transformed Readers -=================== - -Very often the value read into Python would need to be further converted. The classes -`TransRA` and `TransSeq` take an appropriate reader and a function that will be used to -transform all objects returned - - -Mapping between Kaldi and Python Objects -======================================== - -The readers and writers are named after the Kaldi type they access. - -+--------------------+---------------------+-----------------------+-----------------------+ -| Kaldi Type | Read Python Type | Writable Python Types | Notes | -| | | | | -+====================+=====================+=======================+=======================+ -|Matrix |NDArray of |Any Python object |BaseFloat is mapped to | -| |appropriate |convertible to an |either float32 (c's | -| |DTYPE. Float32 and |NDarray |float) or float64 (c's | -| |Float64 are used for | |double) based on Kaldi | -| |float and double, | |compile options | -| |respectively. 
| | | -+--------------------+---------------------+-----------------------+-----------------------+ -|Vector |1-dimensional NDarray|Any Python object |Same as for Matrix | -| |of appropriate type. |convertible to 1d | | -| | |NDarray of appropriate | | -| | |type | | -| | | | | -| | | | | -+--------------------+---------------------+-----------------------+-----------------------+ -|std vector |1-dimensional NDarray|any python iterable | | -| |of int32 | | | -| | | | | -| | | | | -| | | | | -| | | | | -+--------------------+---------------------+-----------------------+-----------------------+ -|std::vector> | |convertible to 1d | | -| | |NDarrays | | -| | | | | -| | | | | -| | | | | -+--------------------+---------------------+-----------------------+-----------------------+ -|std:: |tuple of ints |tuple of ints | | -|pair | | | | -| | | | | -| | | | | -| | | | | -| | | | | -+--------------------+---------------------+-----------------------+-----------------------+ -| | Any Python object | Any Python object |Uses repr/eval in text | -| | | |mode and cPickle in | -| | | |binary mode | -| | | | | -| | | | | -| | | | | -+--------------------+---------------------+-----------------------+-----------------------+ - - -Created on Jul 31, 2014 - -@author: chorows -""" - - -import numpy as np -from kaldi_io_internal import * - -if KALDI_BASE_FLOAT()==np.float64: - RandomAccessBaseFloatMatrixReader = RandomAccessFloat64MatrixReader - RandomAccessBaseFloatMatrixMapped = RandomAccessFloat64MatrixMapped - SequentialBaseFloatMatrixReader = SequentialFloat64MatrixReader - BaseFloatMatrixWriter = Float64MatrixWriter - - RandomAccessBaseFloatVectorReader = RandomAccessFloat64VectorReader - RandomAccessBaseFloatVectorReaderMapped = RandomAccessFloat64VectorReaderMapped - SequentialBaseFloatVectorReader = SequentialFloat64VectorReader - BaseFloatVectorWriter = Float64VectorWriter - -if KALDI_BASE_FLOAT()==np.float32: - RandomAccessBaseFloatMatrixReader = 
RandomAccessFloat32MatrixReader - RandomAccessBaseFloatMatrixMapped = RandomAccessFloat32MatrixMapped - SequentialBaseFloatMatrixReader = SequentialFloat32MatrixReader - BaseFloatMatrixWriter = Float32MatrixWriter - - RandomAccessBaseFloatVectorReader = RandomAccessFloat32VectorReader - RandomAccessBaseFloatVectorReaderMapped = RandomAccessFloat32VectorReaderMapped - SequentialBaseFloatVectorReader = SequentialFloat32VectorReader - BaseFloatVectorWriter = Float32VectorWriter - - -def get_io_for_dtype(access, dtype, element=''): - ''' - Get a writer or reader for the given dtype. eg: - get_io_for_dtype('Sequential',np.float32,'MatrixReader') - get_io_for_dtype('float32,'MatrixWriter') - ''' - if element=='': #assume we want a writer - access, dtype,element = '',access,dtype - dtypemap = {np.int32:'Int32', - np.float32:'Float32', - np.float64:'Float64', - 'float32':'Float32', - 'float64':'Float64'} - dtype = dtypemap[dtype] - return globals()[access + dtype + element] - - -class _Transformed(object): - def __init__(self, reader, transform_function, **kwargs): - super(_Transformed, self).__init__(**kwargs) - self.reader=reader - self.transform_function = transform_function - - def __getattr__(self, attr): - return getattr(self.reader,attr) - - -class TransRA(_Transformed): - def __init__(self, *args, **kwargs): - super(TransRA, self).__init__(*args, **kwargs) - - def value(self, key): - return self.transform_function(self.reader.value(key)) - - def __getitem__(self, key): - return self.value(key) - - -class TransSeq(_Transformed): - def __init__(self, *args, **kwargs): - super(TransSeq, self).__init__(*args, **kwargs) - - def next(self): - return self.transform_function(self.reader.next()) - - def _kaldi_value(self): - return self.transform_function(self.reader._kaldi_value()) - diff --git a/kaldi-python/kaldi_io/bp_converters.h b/kaldi-python/kaldi_io/bp_converters.h deleted file mode 100644 index 1a38b0b..0000000 --- a/kaldi-python/kaldi_io/bp_converters.h +++ 
/dev/null @@ -1,175 +0,0 @@ -/* - * bp_converters.h - * - * Created on: Aug 28, 2014 - * Author: chorows - */ - -#ifndef BP_CONVERTERS_H_ -#define BP_CONVERTERS_H_ - -#include - -#include -#include - -#include -#include -#include - - - -namespace kaldi { -// -// Code transformend from http://code.activestate.com/lists/python-cplusplus-sig/16463/ and -// http://misspent.wordpress.com/2009/09/27/how-to-write-boost-python-converters/ -// -template -struct VectorToListBPConverter { - - static PyObject* convert(std::vector const& vec) { - boost::python::list l; - - for (size_t i = 0; i < vec.size(); i++) - l.append(vec[i]); - return boost::python::incref(l.ptr()); - } -}; - -template -struct VectorFromListBPConverter { - VectorFromListBPConverter() { - using namespace boost::python; - using namespace boost::python::converter; - boost::python::converter::registry::push_back( - &VectorFromListBPConverter::convertible, - &VectorFromListBPConverter::construct, type_id >()); - } - - // Determine if obj_ptr can be converted in a std::vector - static void* convertible(PyObject* obj_ptr) { -// if (!PyIter_Check(obj_ptr)) { -// return 0; -// } - return obj_ptr; - } - - // Convert obj_ptr into a std::vector - static void construct( - PyObject* obj_ptr, - boost::python::converter::rvalue_from_python_stage1_data* data) { - - boost::python::object o = boost::python::object(boost::python::handle<>(boost::python::borrowed(obj_ptr))); - boost::python::stl_input_iterator begin(o); - boost::python::stl_input_iterator end; - - // Grab pointer to memory into which to construct the new std::vector - void* storage = ((boost::python::converter::rvalue_from_python_storage< - std::vector >*) data)->storage.bytes; - - // in-place construct the new std::vector using the character data - // extraced from the python object - std::vector& v = *(new (storage) std::vector()); - - v.insert(v.end(), begin, end); - - // Stash the memory chunk pointer for later use by boost.python - data->convertible = 
storage; - } -}; - -template -struct MapFromDictBPConverter { - MapFromDictBPConverter() { - boost::python::converter::registry::push_back( - &MapFromDictBPConverter::convertible, - &MapFromDictBPConverter::construct, boost::python::type_id()); - } - - // Determine if obj_ptr can be converted in a std::vector - static void* convertible(PyObject* obj_ptr) { - if (!PyDict_Check(obj_ptr)) { - return 0; - } - return obj_ptr; - } - - // Convert obj_ptr into a std::vector - static void construct( - PyObject* obj_ptr, - boost::python::converter::rvalue_from_python_stage1_data* data) { - - boost::python::dict obj(boost::python::handle<>(boost::python::borrowed(obj_ptr))); - boost::python::list keys = obj.keys(); - - // Grab pointer to memory into which to construct the new std::vector - void* storage = ((boost::python::converter::rvalue_from_python_storage< M >*) data)->storage.bytes; - - M& map = *(new (storage) M()); - - boost::python::stl_input_iterator begin(keys); - boost::python::stl_input_iterator end; - - for (;begin!=end; ++begin) { - const typename M::key_type& k = *begin; - const typename M::mapped_type& v = boost::python::extract(obj[k]); - map[k] = v; - } - - // Stash the memory chunk pointer for later use by boost.python - data->convertible = storage; - } -}; - - -template -struct PairToTupleBPConverter { - - static PyObject* convert(std::pair const& p) { - return boost::python::incref(boost::python::make_tuple(p.first, p.second).ptr()); - } -}; - -template -struct PairFromTupleBPConverter { - PairFromTupleBPConverter() { - boost::python::converter::registry::push_back( - &PairFromTupleBPConverter::convertible, - &PairFromTupleBPConverter::construct, boost::python::type_id >()); - } - - // Determine if obj_ptr can be converted in a std::vector - static void* convertible(PyObject* obj_ptr) { - if (!PyTuple_Check(obj_ptr) || PySequence_Length(obj_ptr)!=2) { - return 0; - } - return obj_ptr; - } - - // Convert obj_ptr into a std::vector - static void construct( 
- PyObject* obj_ptr, - boost::python::converter::rvalue_from_python_stage1_data* data) { - - boost::python::tuple t = boost::python::tuple(boost::python::handle<>(boost::python::borrowed(obj_ptr))); - - // Grab pointer to memory into which to construct the new std::vector - void* storage = ((boost::python::converter::rvalue_from_python_storage< - std::pair >*) data)->storage.bytes; - - // in-place construct the new std::vector using the character data - // extraced from the python object - std::pair& v = *(new (storage) std::pair()); - - v.first=boost::python::extract(t[0]); - v.second=boost::python::extract(t[1]); - - // Stash the memory chunk pointer for later use by boost.python - data->convertible = storage; - } -}; - - -} - -#endif /* BP_CONVERTERS_H_ */ diff --git a/kaldi-python/kaldi_io/kaldi_io_internal.cpp b/kaldi-python/kaldi_io/kaldi_io_internal.cpp deleted file mode 100644 index 96730a5..0000000 --- a/kaldi-python/kaldi_io/kaldi_io_internal.cpp +++ /dev/null @@ -1,474 +0,0 @@ -/* - * kaldi-io.cpp - * - * Created on: Jul 29, 2014 - * Author: chorows - */ - -extern "C" { -#include "Python.h" -#include "numpy/arrayobject.h" -} - -#include -#include - -#include -#include -#include - -#include -#include -#include -#include -#include - -#include "python_wrappers.h" -#include "bp_converters.h" - - -using namespace std; - -namespace bp = boost::python; - -//keep a copy of the cPickle module in cache -struct PickleWrapper { - PickleWrapper() { - bp::object pickle = bp::import("cPickle"); - loads = pickle.attr("loads"); - dumps = pickle.attr("dumps"); - } - - bp::object loads, dumps; -}; - -// -// Holder for Python objects. -// -// In binary model uses Pickle to dump, the object is written as dump_length, pickled_string -// In text mode uses repr/eval (only single line), which works OK for simple types - lists, tuples, ints, but may fail for large arrays (as repr skips elemets for ndarray). 
-// -class PyObjectHolder { - public: - typedef bp::object T; - - PyObjectHolder() { - } - - static bool Write(std::ostream &os, bool binary, const T &t) { - kaldi::InitKaldiOutputStream(os, binary); // Puts binary header if binary mode. - try { - if (binary) { //pickle the object - bp::object py_string = PW()->dumps(t,-1); - int len = bp::extract(py_string.attr("__len__")()); - const char* string = bp::extract(py_string); - kaldi::WriteBasicType(os, true, len); - os.write(string, len); - } else { //use repr - PyObject* repr = PyObject_Repr(t.ptr()); - os << PyString_AsString(repr) << '\n'; - Py_DECREF(repr); - } - return os.good(); - - } catch (const std::exception &e) { - KALDI_WARN<< "Exception caught writing Table object: " << e.what(); - if (!kaldi::IsKaldiError(e.what())) {std::cerr << e.what();} - return false; // Write failure. - } - } - - bool Read(std::istream &is) { - bool is_binary; - if (!kaldi::InitKaldiInputStream(is, &is_binary)) { - KALDI_WARN << "Reading Table object [integer type], failed reading binary header\n"; - return false; - } - try { - if (is_binary) { - int len; - kaldi::ReadBasicType(is, true, &len); - std::auto_ptr buf(new char[len]); - is.read(buf.get(), len); - bp::str py_string(buf.get(), len); - t_ = PW()->loads(py_string); - } else { - std::string line; - std::getline(is, line); - bp::str repr(line); - t_ = bp::eval(repr); - } - return true; - } catch (std::exception &e) { - KALDI_WARN << "Exception caught reading Table object"; - if (!kaldi::IsKaldiError(e.what())) {std::cerr << e.what();} - return false; - } - } - - static bool IsReadInBinary() {return true;} - - const T &Value() const {return t_;} // if t is a pointer, would return *t_; - - void Clear() {} - - ~PyObjectHolder() {} - -private: - KALDI_DISALLOW_COPY_AND_ASSIGN(PyObjectHolder); - T t_; // t_ may alternatively be of type T*. 
- static PickleWrapper *PW_; - static PickleWrapper * PW() { - if (!PW_) { - PW_ = new PickleWrapper(); - } - return PW_; - } -}; - -PickleWrapper * PyObjectHolder::PW_ = 0; - - -template -struct MatrixToNdArrayConverter { - typedef kaldi::KaldiObjectHolder > HR; - typedef kaldi::KaldiObjectHolder > HW; - - static inline bp::object kaldi_to_python(const kaldi::Matrix& mat) { - npy_intp dims[2]; - dims[0] = mat.NumRows(); - dims[1] = mat.NumCols(); - int nd = 2; - int arr_type = kaldi::get_np_type(); - PyObject* ao = PyArray_SimpleNew(nd, dims, arr_type); - bp::object arr=bp::object(bp::handle<>( - ao - )); - kaldi::NpWrapperMatrix arr_wrap((PyArrayObject*)arr.ptr()); - arr_wrap.CopyFromMat(mat); - return arr; - } - - static inline kaldi::NpWrapperMatrix* python_to_kaldi(bp::object o) { - PyObject* raw_arr = PyArray_FromAny(o.ptr(),PyArray_DescrFromType(kaldi::get_np_type()), 2, 2, NPY_C_CONTIGUOUS | NPY_FORCECAST, NULL); - //why does this fail: bp::object arr(bp::handle<>(raw_arr)); - bp::object arr=bp::object(bp::handle<>(raw_arr)); - return new kaldi::NpWrapperMatrix((PyArrayObject*)arr.ptr()); - } -}; - -template -struct VectorToNdArrayConverter { - typedef kaldi::KaldiObjectHolder > HR; - typedef kaldi::KaldiObjectHolder > HW; - - static inline bp::object kaldi_to_python(const kaldi::Vector& vec) { - npy_intp dims[1]; - dims[0] = vec.Dim(); - int nd = 1; - - int arr_type = kaldi::get_np_type(); - PyObject* ao = PyArray_SimpleNew(nd, dims, arr_type); - bp::object arr=bp::object(bp::handle<>( - ao - )); - kaldi::NpWrapperVector vec_wrap((PyArrayObject*)arr.ptr()); - vec_wrap.CopyFromVec(vec); - return arr; - } - - static inline kaldi::NpWrapperVector* python_to_kaldi(bp::object o) { - PyObject* raw_arr = PyArray_FromAny(o.ptr(),PyArray_DescrFromType(kaldi::get_np_type()), 1, 1, NPY_C_CONTIGUOUS | NPY_FORCECAST, NULL); - //why does this fail: bp::object arr(bp::handle<>(raw_arr)); - bp::object arr=bp::object(bp::handle<>(raw_arr)); - return new 
kaldi::NpWrapperVector((PyArrayObject*)arr.ptr()); - } -}; - - - -template -struct VectorToNDArrayBPConverter { - static PyObject* convert(std::vector const& vec) { - npy_intp dims[1]; - dims[0] = vec.size(); - int nd = 1; - int arr_type = kaldi::get_np_type(); - PyObject* ao = PyArray_SimpleNew(nd, dims, arr_type); - bp::object arr=bp::object(bp::handle<>( - ao - )); - std::copy(vec.begin(), vec.end(), (T*)PyArray_DATA(ao)); - return bp::incref(arr.ptr()); - } -}; - - - -template -struct BoostPythonconverter { - typedef HW_ HW; - typedef HR_ HR; - - static inline bp::object kaldi_to_python(const Obj& o) { - return bp::object(o); - } - - static inline Obj * python_to_kaldi(bp::object o) { - return new Obj(bp::extract(o)); - } -}; - -template -class PythonToKaldiHolder { - public: - typedef bp::object T; - typedef typename Converter::HR HR; - typedef typename Converter::HW HW; - - PythonToKaldiHolder() : h_() { - } - - static bool Write(std::ostream &os, bool binary, const T &t) { - try { - auto_ptr obj(Converter::python_to_kaldi(t)); - return HW::Write(os, binary, (*obj)); - } catch (std::exception &e) { - KALDI_WARN << "Exception caught reading Table object"; - if (!kaldi::IsKaldiError(e.what())) {std::cerr << e.what();} - return false; - } - } - - bool Read(std::istream &is) { - if (!h_.Read(is)) - return false; - t_ = Converter::kaldi_to_python(h_.Value()); - return true; - } - - static bool IsReadInBinary() {return true;} - - const T &Value() const {return t_;} // if t is a pointer, would return *t_; - - void Clear() {} - - ~PythonToKaldiHolder() {} - -private: - KALDI_DISALLOW_COPY_AND_ASSIGN(PythonToKaldiHolder); - HR h_; - T t_; // t_ may alternatively be of type T*. 
-}; - -template -struct VectorHolder { - typedef PythonToKaldiHolder, - kaldi::BasicVectorHolder, kaldi::BasicVectorHolder > > type; - - static void register_converters() { - bp::to_python_converter, kaldi::VectorToListBPConverter >(); - kaldi::VectorFromListBPConverter(); - } -}; - -template -struct VectorNDArrayHolder { - typedef PythonToKaldiHolder, - kaldi::BasicVectorHolder, kaldi::BasicVectorHolder > > type; - - static void register_converters() { - bp::to_python_converter, VectorToNDArrayBPConverter >(); - kaldi::VectorFromListBPConverter(); - } -}; - -template -struct VectorVectorHolder { - typedef PythonToKaldiHolder > , - kaldi::BasicVectorVectorHolder, kaldi::BasicVectorVectorHolder > > type; - - static void register_converters() { - bp::to_python_converter >, kaldi::VectorToListBPConverter > >(); - kaldi::VectorFromListBPConverter >(); - } -}; - -template -struct PairVectorHolder { - typedef PythonToKaldiHolder > , - kaldi::BasicPairVectorHolder, kaldi::BasicPairVectorHolder > > type; - - static void register_converters() { - //register the pair first - bp::to_python_converter, - kaldi::PairToTupleBPConverter >(); - kaldi::PairFromTupleBPConverter(); - - //next register the pair vector - bp::to_python_converter >, - kaldi::VectorToListBPConverter > >(); - kaldi::VectorFromListBPConverter >(); - } -}; - -template -const T& get_self_ref(const T& t) { - return t; -} - -template -void exit(T& t, const bp::object& type, - const bp::object& value, const bp::object& traceback) { - t.Close(); -} - -template -bp::object sequential_reader_next(T& reader) { - if (!reader.IsOpen() || reader.Done()) { - PyErr_SetString(PyExc_StopIteration, "No more data."); - bp::throw_error_already_set(); - } - //if not done, extract the contents - bp::str key(reader.Key()); - bp::object val(reader.Value()); - //move the reading head, the contents will be read with the next call to next! 
- reader.Next(); - return bp::make_tuple(key,val); -} - -template -class RandomAccessWrapper: public bp::class_ { -public: - template - inline RandomAccessWrapper(char const* name, bp::init_base const& i) - : bp::class_(name, i) { - (*this) - .def("close", &Reader::Close) - .def("is_open", &Reader::IsOpen) - .def("__contains__", &Reader::HasKey) - .def("has_key", &Reader::HasKey) - .def("__getitem__", &Reader::Value, - bp::return_value_policy()) - .def("value", &Reader::Value, - bp::return_value_policy()) - .def("__enter__", &get_self_ref, - bp::return_internal_reference<1>()) - .def("__exit__", &exit) - ; - } -}; - -template -class SequentialReaderWrapper: public bp::class_ { -public: - template - inline SequentialReaderWrapper(char const* name, bp::init_base const& i) - : bp::class_(name, i) { - (*this) - .def("close", &Reader::Close) - .def("is_open", &Reader::IsOpen) - .def("__enter__", &get_self_ref, - bp::return_internal_reference<1>()) - .def("__iter__", &get_self_ref, - bp::return_internal_reference<1>()) - .def("next", sequential_reader_next) - .def("__exit__", &exit) - .def("done", &Reader::Done) - .def("_kaldi_value", &Reader::Value, - bp::return_value_policy()) - .def("_kaldi_next", &Reader::Next) - .def("_kaldi_key", &Reader::Key) - ; - } -}; - -template -class WriterWrapper: public bp::class_ { -public: - template - inline WriterWrapper(char const* name, bp::init_base const& i) - : bp::class_(name, i) { - (*this) - .def("close", &Writer::Close) - .def("is_open", &Writer::IsOpen) - .def("flush", &Writer::Flush) - .def("write", &Writer::Write) - .def("__setitem__", &Writer::Write) - .def("__enter__", &get_self_ref, - bp::return_internal_reference<1>()) - .def("__exit__",&exit) - ; - } -}; - - -PyObject* KALDI_BASE_FLOAT() { - return (PyObject*)PyArray_DescrFromType(kaldi::get_np_type()); -} - -BOOST_PYTHON_MODULE(kaldi_io_internal) -{ - import_array(); - - bp::def("KALDI_BASE_FLOAT", &KALDI_BASE_FLOAT); - - //Python objects - RandomAccessWrapper 
>("RandomAccessPythonReader", bp::init()); - RandomAccessWrapper >("RandomAccessPythonReaderMapped", bp::init()); - SequentialReaderWrapper >("SequentialPythonReader",bp::init()); - WriterWrapper >("PythonWriter", bp::init()); - - //Matrices as NdArrays - RandomAccessWrapper > > >("RandomAccessFloat64MatrixReader", bp::init()); - RandomAccessWrapper > > >("RandomAccessFloat64MatrixMapped",bp::init()); - SequentialReaderWrapper > > >("SequentialFloat64MatrixReader",bp::init()); - WriterWrapper > > >("Float64MatrixWriter", bp::init()); - - RandomAccessWrapper > > >("RandomAccessFloat32MatrixReader", bp::init()); - RandomAccessWrapper > > >("RandomAccessFloat32MatrixMapped",bp::init()); - SequentialReaderWrapper > > >("SequentialFloat32MatrixReader",bp::init()); - WriterWrapper > > >("Float32MatrixWriter", bp::init()); - - //Vectors as NdArrays - RandomAccessWrapper > > >("RandomAccessFloat64VectorReader", bp::init()); - RandomAccessWrapper > > >("RandomAccessFloat64VectorReaderMapped",bp::init()); - SequentialReaderWrapper > > >("SequentialFloat64VectorReader",bp::init()); - WriterWrapper > > >("Float64VectorWriter", bp::init()); - - RandomAccessWrapper > > >("RandomAccessFloat32VectorReader", bp::init()); - RandomAccessWrapper > > >("RandomAccessFloat32VectorReaderMapped",bp::init()); - SequentialReaderWrapper > > >("SequentialFloat32VectorReader",bp::init()); - WriterWrapper > > >("Float32VectorWriter", bp::init()); - - //Integers - RandomAccessWrapper("RandomAccessInt32Reader", bp::init()); - SequentialReaderWrapper("SequentialInt32Reader",bp::init()); - WriterWrapper("Int32Writer", bp::init()); - - // std::vector as ndarray - VectorNDArrayHolder::register_converters(); - RandomAccessWrapper::type > >("RandomAccessInt32VectorReader", bp::init()); - SequentialReaderWrapper::type > >("SequentialInt32VectorReader",bp::init()); - WriterWrapper::type > >("Int32VectorWriter", bp::init()); - -// Vector of simple types as lists -// VectorHolder::register_converters(); -// 
RandomAccessWrapper::type > >("RandomAccessInt32VectorReader", bp::init()); -// SequentialReaderWrapper::type > >("SequentialInt32VectorReader",bp::init()); -// WriterWrapper::type > >("Int32VectorWriter", bp::init()); - - - // std::vector > - VectorVectorHolder::register_converters(); - RandomAccessWrapper::type > >("RandomAccessInt32VectorVectorReader", bp::init()); - SequentialReaderWrapper::type > >("SequentialInt32VectorVectorReader",bp::init()); - WriterWrapper::type > >("Int32VectorVectorWriter", bp::init()); - - // std::vector > - PairVectorHolder::register_converters(); - RandomAccessWrapper::type > >("RandomAccessInt32PairVectorReader", bp::init()); - SequentialReaderWrapper::type > >("SequentialInt32PairVectorReader",bp::init()); - WriterWrapper::type > >("Int32PairVectorWriter", bp::init()); - -} diff --git a/kaldi-python/kaldi_io/python_wrappers.h b/kaldi-python/kaldi_io/python_wrappers.h deleted file mode 100644 index a1d817e..0000000 --- a/kaldi-python/kaldi_io/python_wrappers.h +++ /dev/null @@ -1,126 +0,0 @@ -/* - * python_wrappers.h - * - * Created on: Aug 28, 2014 - * Author: chorows - */ - -#ifndef PYTHON_WRAPPERS_H_ -#define PYTHON_WRAPPERS_H_ - -extern "C" { -#include "Python.h" -#include "numpy/arrayobject.h" -} - -#include -#include - -#include -#include -#include - -#include -#include -#include -#include -#include - -namespace kaldi { -//Helper to get proper np type -template -int get_np_type() { - //BOOST_STATIC_ASSERT_MSG(false, "Call one of the explicitly instantiated templates for float or double."); - KALDI_ERR << "Call one of the explicitly instantiated templates for float or double."; - return -1; -} - -template <> -int get_np_type() { - return NPY_DOUBLE; -} - -template <> -int get_np_type() { - return NPY_FLOAT; -} - -template <> -int get_np_type() { - return NPY_INT32; -} - -template -class NpWrapperMatrix : public kaldi::MatrixBase { - public: - NpWrapperMatrix(PyArrayObject* arr) - : kaldi::MatrixBase(), - arr_(arr) { - if 
(PyArray_NDIM(arr_)!=2) { - PyErr_SetString(PyExc_TypeError, "Can wrap only matrices (2D arrays)"); - boost::python::throw_error_already_set(); - } - if (PyArray_TYPE(arr)!=get_np_type()) { - PyErr_SetString(PyExc_TypeError, "Wrong array dtype"); - boost::python::throw_error_already_set(); - } - npy_intp* dims = PyArray_DIMS(arr_); - npy_intp* strides = PyArray_STRIDES(arr_); - if (strides[1]!=sizeof(Real)) { - PyErr_SetString(PyExc_TypeError, "Wrong array column stride"); - boost::python::throw_error_already_set(); - } - Py_INCREF(arr_); - //why do we have to use this-> in here?? - this->data_ = (Real*)PyArray_DATA(arr); - this->num_rows_ = dims[0]; - this->num_cols_ = dims[1]; - this->stride_ = strides[0]/sizeof(Real); - } - - ~NpWrapperMatrix() { - Py_DECREF(arr_); - } - - protected: - PyArrayObject* arr_; -}; - -template -class NpWrapperVector : public kaldi::VectorBase { - public: - NpWrapperVector(PyArrayObject* arr) - : kaldi::VectorBase(), - arr_(arr) { - if (PyArray_NDIM(arr_)!=1) { - PyErr_SetString(PyExc_TypeError, "Can wrap only vectors (1D arrays)"); - boost::python::throw_error_already_set(); - } - if (PyArray_TYPE(arr)!=get_np_type()) { - PyErr_SetString(PyExc_TypeError, "Wrong array dtype"); - boost::python::throw_error_already_set(); - } - npy_intp* dims = PyArray_DIMS(arr_); - npy_intp* strides = PyArray_STRIDES(arr_); - if (strides[0]!=sizeof(Real)) { - PyErr_SetString(PyExc_TypeError, "Wrong array column stride"); - boost::python::throw_error_already_set(); - } - Py_INCREF(arr_); - //why do we have to use this-> in here?? 
- this->data_ = (Real*)PyArray_DATA(arr); - this->dim_ = dims[0]; - } - - ~NpWrapperVector() { - Py_DECREF(arr_); - } - - protected: - PyArrayObject* arr_; -}; - -} //namespace kaldi - - -#endif /* PYTHON_WRAPPERS_H_ */ diff --git a/setup.py b/setup.py index 16272a5..870d979 100755 --- a/setup.py +++ b/setup.py @@ -16,11 +16,9 @@ def run(self): url='https://github.com/janchorowski/kaldi-python', cmdclass={'build_py': Make}, packages=['kaldi_io', 'kaldi_argparse'], - package_dir={'kaldi_io': 'kaldi-python/kaldi_io', - 'kaldi_argparse': 'kaldi-python/kaldi_argparse'}, package_data={'kaldi_io': ['kaldi_io_internal.so']}, scripts=['scripts/apply-global-cmvn.py', 'scripts/compute-global-cmvn-stats.py', 'scripts/copy-feats-padded.py', 'scripts/show-wav-ali.py'], - requires=['numpy']) \ No newline at end of file + requires=['numpy']) From 6b894680d669e6f446d9bce4e5acf0f08863552f Mon Sep 17 00:00:00 2001 From: serdyuk Date: Mon, 5 Oct 2015 13:31:52 -0400 Subject: [PATCH 4/4] Return all files --- kaldi_argparse/__init__.py | 77 ++++++ kaldi_io/Makefile | 41 +++ kaldi_io/__init__.py | 315 ++++++++++++++++++++++ kaldi_io/bp_converters.h | 175 ++++++++++++ kaldi_io/kaldi_io_internal.cpp | 474 +++++++++++++++++++++++++++++++++ kaldi_io/python_wrappers.h | 126 +++++++++ 6 files changed, 1208 insertions(+) create mode 100644 kaldi_argparse/__init__.py create mode 100644 kaldi_io/Makefile create mode 100644 kaldi_io/__init__.py create mode 100644 kaldi_io/bp_converters.h create mode 100644 kaldi_io/kaldi_io_internal.cpp create mode 100644 kaldi_io/python_wrappers.h diff --git a/kaldi_argparse/__init__.py b/kaldi_argparse/__init__.py new file mode 100644 index 0000000..b7f7ad4 --- /dev/null +++ b/kaldi_argparse/__init__.py @@ -0,0 +1,77 @@ +""" +Created on Aug 14, 2014 + +@author: chorows +""" + +import os +import sys +import argparse + + +class AddConfig(argparse.Action): + def __init__(self, *args, **kwargs): + argparse.Action.__init__(self, *args, **kwargs) + + def 
__call__(self, parser, namespace, values, option_string=None): + with open(values,'r') as f: + opts = [l.split('#')[0].strip() for l in f] + parser.parse_args(args=opts, namespace=namespace) + + +class KaldiArgumentParser(argparse.ArgumentParser): + def __init__(self, *args, **kwargs): + kwargs['add_help']=False + #kwargs['fromfile_prefix_chars']='--config=' + version = kwargs.pop('version', None) + super(KaldiArgumentParser, self).__init__(*args, formatter_class=argparse.ArgumentDefaultsHelpFormatter, **kwargs) + self.version = version + + def add_standard_arguments(self): + grp = self.add_argument_group('Standard options') + + default_prefix = '-' + grp.add_argument( + default_prefix+'h', default_prefix*2+'help', + action='help', default=argparse.SUPPRESS, + help=argparse._('show this help message and exit')) + if self.version: + grp.add_argument( + default_prefix+'v', default_prefix*2+'version', + action='version', default=argparse.SUPPRESS, + version=self.version, + help=argparse._("show program's version number and exit")) + grp.add_argument('--print-args', type=bool, default=True, help='Print the command line arguments (to stderr)') + #grp.add_argument('--config', action=AddConfig, help='Configuration file with options') + grp.add_argument('--config', default=argparse.SUPPRESS, help='Configuration file with options') + + def parse_known_args(self, args=None, namespace=None): + if args is None: + args = sys.argv[1:] + expanded_args = [] + + next_arg_is_conf = False + conf_file = None + + for arg in args: + if arg.startswith('--config') or next_arg_is_conf: + if next_arg_is_conf: + conf_file = arg + elif arg.startswith('--config='): + conf_file = arg[9:].strip() #eat --config= + else: + next_arg_is_conf = True + if conf_file: + with open(conf_file,'r') as f: + expanded_args.extend(l.split('#')[0].strip() for l in f) + next_arg_is_conf = False + conf_file = None + else: + expanded_args.append(arg) + return argparse.ArgumentParser.parse_known_args(self, 
args=expanded_args, namespace=namespace) + + def parse_args(self, args=None, namespace=None): + args = argparse.ArgumentParser.parse_args(self, args=args, namespace=namespace) + if args.print_args: + print >>sys.stderr, os.path.basename(sys.argv[0]), " ".join(sys.argv[1:]) + return args diff --git a/kaldi_io/Makefile b/kaldi_io/Makefile new file mode 100644 index 0000000..3cb127b --- /dev/null +++ b/kaldi_io/Makefile @@ -0,0 +1,41 @@ + +all: +EXTRA_CXXFLAGS = -Wno-sign-compare + +KALDI_SRC = $(KALDI_ROOT)/src + +include $(KALDI_SRC)/kaldi.mk + +BINFILES = + + +OBJFILES = + + +ADDLIBS = $(KALDI_SRC)/lm/kaldi-lm.a $(KALDI_SRC)/decoder/kaldi-decoder.a $(KALDI_SRC)/lat/kaldi-lat.a \ + $(KALDI_SRC)/hmm/kaldi-hmm.a $(KALDI_SRC)/transform/kaldi-transform.a $(KALDI_SRC)/gmm/kaldi-gmm.a \ + $(KALDI_SRC)/tree/kaldi-tree.a $(KALDI_SRC)/matrix/kaldi-matrix.a $(KALDI_SRC)/util/kaldi-util.a \ + $(KALDI_SRC)/base/kaldi-base.a $(KALDI_SRC)/thread/kaldi-thread.a + +TESTFILES = + +PYLIB = $(shell python-config --libs) +PYINC = $(shell python-config --includes) +NPINC = -I$(shell python -c 'import numpy; print numpy.get_include()') + +PYLIBS = kaldi_io_internal.so + +#include $(KALDI_SRC)/makefiles/default_rules.mk + +%.so: %.cpp + g++ -shared -o $@ -Wall -fPIC -I$(KALDI_SRC) $(PYINC) $(NPINC) $(CXXFLAGS) $< $(ADDLIBS) $(LDFLAGS) -L$(PYLIB) $(LOADLIBES) $(LDLIBS) -lpython2.7 -lboost_python -lboost_system + +clean: + -rm -f *.o *.a *.so $(TESTFILES) $(BINFILES) $(TESTOUTPUTS) tmp* *.tmp + +depend: + -$(CXX) -I$(KALDI_SRC) $(PYINC) $(NPINC) -M $(CXXFLAGS) *.cpp > .depend.mk + +test: + +all: $(PYLIBS) diff --git a/kaldi_io/__init__.py b/kaldi_io/__init__.py new file mode 100644 index 0000000..744145c --- /dev/null +++ b/kaldi_io/__init__.py @@ -0,0 +1,315 @@ +"""Python Wrappers for Kaldi table IO (:kaldi:`io.html`) + +In Kaldi the archive does not carry information about its contents and the user is required to +use the proper Reader or Writer. 
This module follows this approach and provides wrappers for +RandomAccess and Sequential readers, and for the Writers. The classes are instantiated for +each Kaldi type. + +Internally, the wrappers define holders (:kaldi:`io.html#io_sec_holders`) for python types +and instantiate the regular Kaldi templates. In this way, the wrappers are 100% compatible with +Kaldi and support using pipes and subprograms for inputs and outputs. + +The Python readers and writers implement the context manager API, and are fully usable with the Python +`with` construct. + +Examples: + A matrix to text converter: + + .. code-block:: python + + with kaldi_io.SequentialBaseFloatMatrixReader('ark:mat.ark') as reader: + for name,mat in reader: + print name, mat + + A simple vector generator: + + .. code-block:: python + + with kaldi_io.Int32VectorWriter('ark:| gzip -c > vec.ark.gz') as w: + for len in xrange(10): + vec = [len] * len + w['vec_%d' %(len,)] = vec + +Kaldi Reader classes +==================== + +Kaldi provides two types of reader: the Sequential reader which is akin to an iterator and the +Random Access reader which is akin to a dict. Both work with piped data, thus the random access +readers may be required to read and store objects in memory until the proper one is found. More +information is in :kaldi:`io.html#io_sec_bloat`. + +Kaldi programs typically open one Sequential reader (e.g. for the features) and several RandomAccess +readers. For each feature, the random access readers would be used to fetch auxiliary information, while +ensuring that they pertain to the same utterance. This resembles a merge-sort merge phase and works well +if all the files are properly sorted. Citing :kaldi:`data_prep.html#data_prep_data_yourself`: + +.. note:: + + All of these files should be sorted. If they are not sorted, you will get errors when you run the scripts. In The Table concept we explain why this is needed.
It has to do with the I/O framework; the ultimate reason for the sorting is to enable something equivalent to random-access lookup on a stream that doesn't support fseek(), such as a piped command. Many Kaldi programs are reading multiple pipes from other Kaldi commands, reading different types of object, and are doing something roughly comparable to merge-sort on the different inputs; merge-sort, of course, requires that the inputs be sorted. Be careful when you sort that you have the shell variable LC_ALL defined as "C", for example (in bash), + + export LC_ALL=C + + If you don't do this, the files will be sorted in an order that's different from how C++ sorts strings, and Kaldi will crash. You have been warned! + +.. py:class:: DataTypeSequentialReader(rx_specifier) + + The SequentialReader mostly resembles a Python iterator. Therefore it implements the + Iterator protocol: + + .. py:method:: __iter__() + + Returns self + + .. py:method:: next() + + :return: a tuple of: + + * key (string) + * value (type is determined by the reader class) + + Moreover it provides a method to check whether the iterator is empty: + + .. py:method:: done() + + Returns `True` if the iterator is empty + + Kaldi uses a slightly different iteration protocol, which can be accessed using the functions: + + .. py:method:: _kaldi_next() + + Advance the iterator by one value + + .. py:method:: _kaldi_key() + + Returns the key of the current value + + .. py:method:: _kaldi_value() + + Returns the current value (i.e. the value that will be returned on the next call + to :func:`next`) + + For resource management the classes implement: + + .. py:method:: close() + + Closes the reader. + + .. py:method:: is_open() + + Returns `True` if the reader is open and can be read from + + .. py:method:: __enter__() + .. py:method:: __exit__() + + Implement the `with` context protocol + + +.. 
py:class:: DataTypeRandomAccessReader(rx_specifier) + + The random access reader resembles a Python dict - values are retrieved for a given key value. + Therefore the reader acts in a dict-like manner: + + .. py:method:: __contains__(key) + .. py:method:: has_key(key) + + Returns `True` if key is present in reader. Enables the use of the `in` operator. + + .. py:method:: __getitem__(key) + .. py:method:: value(key) + + Returns the value associated with key + + For resource management the classes implement: + + .. py:method:: close() + + Closes the reader. + + .. py:method:: is_open() + + Returns `True` if the reader is open and can be read from + + .. py:method:: __enter__() + .. py:method:: __exit__() + + Implement the `with` context protocol + +.. py:class:: DataTypeRandomAccessReaderMapped(data_rx_specifier, mapping_rx_specifier) + This class implements a random access reader whose keys have been mapped using the mapper. + See :kaldi:`io.html#io_sec_mapped` for more explanation + +Kaldi Writer class +================== + +The writer stores key-value pairs and thus resembles a dict. However, unlike a dict +no checks for key duplication are made. The writer will happily store all values using +the same key, which may render them unusable. For best cooperation with Kaldi, the keys +should be written sorted in the `C order`. + + +.. py:class:: DataTypeWriter(wx_specifier) + .. py:method:: write(key, value) + .. py:method:: __setitem__(key,value) + + Append to the file the value under key + + .. py:method:: flush() + + Flush the output stream. + + For resource management the classes implement: + + .. py:method:: close() + + Closes the writer. + + .. py:method:: is_open() + + Returns `True` if the writer is open and can be written to + + .. py:method:: __enter__() + .. py:method:: __exit__() + + Implement the `with` context protocol + +Transformed Readers +=================== + +Very often the value read into Python would need to be further converted.
The classes +`TransRA` and `TransSeq` take an appropriate reader and a function that will be used to +transform all objects returned + + +Mapping between Kaldi and Python Objects +======================================== + +The readers and writers are named after the Kaldi type they access. + ++--------------------+---------------------+-----------------------+-----------------------+ +| Kaldi Type | Read Python Type | Writable Python Types | Notes | +| | | | | ++====================+=====================+=======================+=======================+ +|Matrix |NDArray of |Any Python object |BaseFloat is mapped to | +| |appropriate |convertible to an |either float32 (c's | +| |DTYPE. Float32 and |NDarray |float) or float64 (c's | +| |Float64 are used for | |double) based on Kaldi | +| |float and double, | |compile options | +| |respectively. | | | ++--------------------+---------------------+-----------------------+-----------------------+ +|Vector |1-dimensional NDarray|Any Python object |Same as for Matrix | +| |of appropriate type. 
|convertible to 1d | | +| | |NDarray of appropriate | | +| | |type | | +| | | | | +| | | | | ++--------------------+---------------------+-----------------------+-----------------------+ +|std vector |1-dimensional NDarray|any python iterable | | +| |of int32 | | | +| | | | | +| | | | | +| | | | | +| | | | | ++--------------------+---------------------+-----------------------+-----------------------+ +|std::vector> | |convertible to 1d | | +| | |NDarrays | | +| | | | | +| | | | | +| | | | | ++--------------------+---------------------+-----------------------+-----------------------+ +|std:: |tuple of ints |tuple of ints | | +|pair | | | | +| | | | | +| | | | | +| | | | | +| | | | | ++--------------------+---------------------+-----------------------+-----------------------+ +| | Any Python object | Any Python object |Uses repr/eval in text | +| | | |mode and cPickle in | +| | | |binary mode | +| | | | | +| | | | | +| | | | | ++--------------------+---------------------+-----------------------+-----------------------+ + + +Created on Jul 31, 2014 + +@author: chorows +""" + + +import numpy as np +from kaldi_io_internal import * + +if KALDI_BASE_FLOAT()==np.float64: + RandomAccessBaseFloatMatrixReader = RandomAccessFloat64MatrixReader + RandomAccessBaseFloatMatrixMapped = RandomAccessFloat64MatrixMapped + SequentialBaseFloatMatrixReader = SequentialFloat64MatrixReader + BaseFloatMatrixWriter = Float64MatrixWriter + + RandomAccessBaseFloatVectorReader = RandomAccessFloat64VectorReader + RandomAccessBaseFloatVectorReaderMapped = RandomAccessFloat64VectorReaderMapped + SequentialBaseFloatVectorReader = SequentialFloat64VectorReader + BaseFloatVectorWriter = Float64VectorWriter + +if KALDI_BASE_FLOAT()==np.float32: + RandomAccessBaseFloatMatrixReader = RandomAccessFloat32MatrixReader + RandomAccessBaseFloatMatrixMapped = RandomAccessFloat32MatrixMapped + SequentialBaseFloatMatrixReader = SequentialFloat32MatrixReader + BaseFloatMatrixWriter = Float32MatrixWriter + + 
RandomAccessBaseFloatVectorReader = RandomAccessFloat32VectorReader + RandomAccessBaseFloatVectorReaderMapped = RandomAccessFloat32VectorReaderMapped + SequentialBaseFloatVectorReader = SequentialFloat32VectorReader + BaseFloatVectorWriter = Float32VectorWriter + + +def get_io_for_dtype(access, dtype, element=''): + ''' + Get a writer or reader for the given dtype. eg: + get_io_for_dtype('Sequential',np.float32,'MatrixReader') + get_io_for_dtype('float32,'MatrixWriter') + ''' + if element=='': #assume we want a writer + access, dtype,element = '',access,dtype + dtypemap = {np.int32:'Int32', + np.float32:'Float32', + np.float64:'Float64', + 'float32':'Float32', + 'float64':'Float64'} + dtype = dtypemap[dtype] + return globals()[access + dtype + element] + + +class _Transformed(object): + def __init__(self, reader, transform_function, **kwargs): + super(_Transformed, self).__init__(**kwargs) + self.reader=reader + self.transform_function = transform_function + + def __getattr__(self, attr): + return getattr(self.reader,attr) + + +class TransRA(_Transformed): + def __init__(self, *args, **kwargs): + super(TransRA, self).__init__(*args, **kwargs) + + def value(self, key): + return self.transform_function(self.reader.value(key)) + + def __getitem__(self, key): + return self.value(key) + + +class TransSeq(_Transformed): + def __init__(self, *args, **kwargs): + super(TransSeq, self).__init__(*args, **kwargs) + + def next(self): + return self.transform_function(self.reader.next()) + + def _kaldi_value(self): + return self.transform_function(self.reader._kaldi_value()) + diff --git a/kaldi_io/bp_converters.h b/kaldi_io/bp_converters.h new file mode 100644 index 0000000..1a38b0b --- /dev/null +++ b/kaldi_io/bp_converters.h @@ -0,0 +1,175 @@ +/* + * bp_converters.h + * + * Created on: Aug 28, 2014 + * Author: chorows + */ + +#ifndef BP_CONVERTERS_H_ +#define BP_CONVERTERS_H_ + +#include + +#include +#include + +#include +#include +#include + + + +namespace kaldi { +// +// 
Code transformend from http://code.activestate.com/lists/python-cplusplus-sig/16463/ and +// http://misspent.wordpress.com/2009/09/27/how-to-write-boost-python-converters/ +// +template +struct VectorToListBPConverter { + + static PyObject* convert(std::vector const& vec) { + boost::python::list l; + + for (size_t i = 0; i < vec.size(); i++) + l.append(vec[i]); + return boost::python::incref(l.ptr()); + } +}; + +template +struct VectorFromListBPConverter { + VectorFromListBPConverter() { + using namespace boost::python; + using namespace boost::python::converter; + boost::python::converter::registry::push_back( + &VectorFromListBPConverter::convertible, + &VectorFromListBPConverter::construct, type_id >()); + } + + // Determine if obj_ptr can be converted in a std::vector + static void* convertible(PyObject* obj_ptr) { +// if (!PyIter_Check(obj_ptr)) { +// return 0; +// } + return obj_ptr; + } + + // Convert obj_ptr into a std::vector + static void construct( + PyObject* obj_ptr, + boost::python::converter::rvalue_from_python_stage1_data* data) { + + boost::python::object o = boost::python::object(boost::python::handle<>(boost::python::borrowed(obj_ptr))); + boost::python::stl_input_iterator begin(o); + boost::python::stl_input_iterator end; + + // Grab pointer to memory into which to construct the new std::vector + void* storage = ((boost::python::converter::rvalue_from_python_storage< + std::vector >*) data)->storage.bytes; + + // in-place construct the new std::vector using the character data + // extraced from the python object + std::vector& v = *(new (storage) std::vector()); + + v.insert(v.end(), begin, end); + + // Stash the memory chunk pointer for later use by boost.python + data->convertible = storage; + } +}; + +template +struct MapFromDictBPConverter { + MapFromDictBPConverter() { + boost::python::converter::registry::push_back( + &MapFromDictBPConverter::convertible, + &MapFromDictBPConverter::construct, boost::python::type_id()); + } + + // Determine 
if obj_ptr can be converted in a std::vector + static void* convertible(PyObject* obj_ptr) { + if (!PyDict_Check(obj_ptr)) { + return 0; + } + return obj_ptr; + } + + // Convert obj_ptr into a std::vector + static void construct( + PyObject* obj_ptr, + boost::python::converter::rvalue_from_python_stage1_data* data) { + + boost::python::dict obj(boost::python::handle<>(boost::python::borrowed(obj_ptr))); + boost::python::list keys = obj.keys(); + + // Grab pointer to memory into which to construct the new std::vector + void* storage = ((boost::python::converter::rvalue_from_python_storage< M >*) data)->storage.bytes; + + M& map = *(new (storage) M()); + + boost::python::stl_input_iterator begin(keys); + boost::python::stl_input_iterator end; + + for (;begin!=end; ++begin) { + const typename M::key_type& k = *begin; + const typename M::mapped_type& v = boost::python::extract(obj[k]); + map[k] = v; + } + + // Stash the memory chunk pointer for later use by boost.python + data->convertible = storage; + } +}; + + +template +struct PairToTupleBPConverter { + + static PyObject* convert(std::pair const& p) { + return boost::python::incref(boost::python::make_tuple(p.first, p.second).ptr()); + } +}; + +template +struct PairFromTupleBPConverter { + PairFromTupleBPConverter() { + boost::python::converter::registry::push_back( + &PairFromTupleBPConverter::convertible, + &PairFromTupleBPConverter::construct, boost::python::type_id >()); + } + + // Determine if obj_ptr can be converted in a std::vector + static void* convertible(PyObject* obj_ptr) { + if (!PyTuple_Check(obj_ptr) || PySequence_Length(obj_ptr)!=2) { + return 0; + } + return obj_ptr; + } + + // Convert obj_ptr into a std::vector + static void construct( + PyObject* obj_ptr, + boost::python::converter::rvalue_from_python_stage1_data* data) { + + boost::python::tuple t = boost::python::tuple(boost::python::handle<>(boost::python::borrowed(obj_ptr))); + + // Grab pointer to memory into which to construct the new 
std::vector + void* storage = ((boost::python::converter::rvalue_from_python_storage< + std::pair >*) data)->storage.bytes; + + // in-place construct the new std::vector using the character data + // extraced from the python object + std::pair& v = *(new (storage) std::pair()); + + v.first=boost::python::extract(t[0]); + v.second=boost::python::extract(t[1]); + + // Stash the memory chunk pointer for later use by boost.python + data->convertible = storage; + } +}; + + +} + +#endif /* BP_CONVERTERS_H_ */ diff --git a/kaldi_io/kaldi_io_internal.cpp b/kaldi_io/kaldi_io_internal.cpp new file mode 100644 index 0000000..96730a5 --- /dev/null +++ b/kaldi_io/kaldi_io_internal.cpp @@ -0,0 +1,474 @@ +/* + * kaldi-io.cpp + * + * Created on: Jul 29, 2014 + * Author: chorows + */ + +extern "C" { +#include "Python.h" +#include "numpy/arrayobject.h" +} + +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "python_wrappers.h" +#include "bp_converters.h" + + +using namespace std; + +namespace bp = boost::python; + +//keep a copy of the cPickle module in cache +struct PickleWrapper { + PickleWrapper() { + bp::object pickle = bp::import("cPickle"); + loads = pickle.attr("loads"); + dumps = pickle.attr("dumps"); + } + + bp::object loads, dumps; +}; + +// +// Holder for Python objects. +// +// In binary model uses Pickle to dump, the object is written as dump_length, pickled_string +// In text mode uses repr/eval (only single line), which works OK for simple types - lists, tuples, ints, but may fail for large arrays (as repr skips elemets for ndarray). +// +class PyObjectHolder { + public: + typedef bp::object T; + + PyObjectHolder() { + } + + static bool Write(std::ostream &os, bool binary, const T &t) { + kaldi::InitKaldiOutputStream(os, binary); // Puts binary header if binary mode. 
+ try { + if (binary) { //pickle the object + bp::object py_string = PW()->dumps(t,-1); + int len = bp::extract(py_string.attr("__len__")()); + const char* string = bp::extract(py_string); + kaldi::WriteBasicType(os, true, len); + os.write(string, len); + } else { //use repr + PyObject* repr = PyObject_Repr(t.ptr()); + os << PyString_AsString(repr) << '\n'; + Py_DECREF(repr); + } + return os.good(); + + } catch (const std::exception &e) { + KALDI_WARN<< "Exception caught writing Table object: " << e.what(); + if (!kaldi::IsKaldiError(e.what())) {std::cerr << e.what();} + return false; // Write failure. + } + } + + bool Read(std::istream &is) { + bool is_binary; + if (!kaldi::InitKaldiInputStream(is, &is_binary)) { + KALDI_WARN << "Reading Table object [integer type], failed reading binary header\n"; + return false; + } + try { + if (is_binary) { + int len; + kaldi::ReadBasicType(is, true, &len); + std::auto_ptr buf(new char[len]); + is.read(buf.get(), len); + bp::str py_string(buf.get(), len); + t_ = PW()->loads(py_string); + } else { + std::string line; + std::getline(is, line); + bp::str repr(line); + t_ = bp::eval(repr); + } + return true; + } catch (std::exception &e) { + KALDI_WARN << "Exception caught reading Table object"; + if (!kaldi::IsKaldiError(e.what())) {std::cerr << e.what();} + return false; + } + } + + static bool IsReadInBinary() {return true;} + + const T &Value() const {return t_;} // if t is a pointer, would return *t_; + + void Clear() {} + + ~PyObjectHolder() {} + +private: + KALDI_DISALLOW_COPY_AND_ASSIGN(PyObjectHolder); + T t_; // t_ may alternatively be of type T*. 
+ static PickleWrapper *PW_; + static PickleWrapper * PW() { + if (!PW_) { + PW_ = new PickleWrapper(); + } + return PW_; + } +}; + +PickleWrapper * PyObjectHolder::PW_ = 0; + + +template +struct MatrixToNdArrayConverter { + typedef kaldi::KaldiObjectHolder > HR; + typedef kaldi::KaldiObjectHolder > HW; + + static inline bp::object kaldi_to_python(const kaldi::Matrix& mat) { + npy_intp dims[2]; + dims[0] = mat.NumRows(); + dims[1] = mat.NumCols(); + int nd = 2; + int arr_type = kaldi::get_np_type(); + PyObject* ao = PyArray_SimpleNew(nd, dims, arr_type); + bp::object arr=bp::object(bp::handle<>( + ao + )); + kaldi::NpWrapperMatrix arr_wrap((PyArrayObject*)arr.ptr()); + arr_wrap.CopyFromMat(mat); + return arr; + } + + static inline kaldi::NpWrapperMatrix* python_to_kaldi(bp::object o) { + PyObject* raw_arr = PyArray_FromAny(o.ptr(),PyArray_DescrFromType(kaldi::get_np_type()), 2, 2, NPY_C_CONTIGUOUS | NPY_FORCECAST, NULL); + //why does this fail: bp::object arr(bp::handle<>(raw_arr)); + bp::object arr=bp::object(bp::handle<>(raw_arr)); + return new kaldi::NpWrapperMatrix((PyArrayObject*)arr.ptr()); + } +}; + +template +struct VectorToNdArrayConverter { + typedef kaldi::KaldiObjectHolder > HR; + typedef kaldi::KaldiObjectHolder > HW; + + static inline bp::object kaldi_to_python(const kaldi::Vector& vec) { + npy_intp dims[1]; + dims[0] = vec.Dim(); + int nd = 1; + + int arr_type = kaldi::get_np_type(); + PyObject* ao = PyArray_SimpleNew(nd, dims, arr_type); + bp::object arr=bp::object(bp::handle<>( + ao + )); + kaldi::NpWrapperVector vec_wrap((PyArrayObject*)arr.ptr()); + vec_wrap.CopyFromVec(vec); + return arr; + } + + static inline kaldi::NpWrapperVector* python_to_kaldi(bp::object o) { + PyObject* raw_arr = PyArray_FromAny(o.ptr(),PyArray_DescrFromType(kaldi::get_np_type()), 1, 1, NPY_C_CONTIGUOUS | NPY_FORCECAST, NULL); + //why does this fail: bp::object arr(bp::handle<>(raw_arr)); + bp::object arr=bp::object(bp::handle<>(raw_arr)); + return new 
kaldi::NpWrapperVector((PyArrayObject*)arr.ptr()); + } +}; + + + +template +struct VectorToNDArrayBPConverter { + static PyObject* convert(std::vector const& vec) { + npy_intp dims[1]; + dims[0] = vec.size(); + int nd = 1; + int arr_type = kaldi::get_np_type(); + PyObject* ao = PyArray_SimpleNew(nd, dims, arr_type); + bp::object arr=bp::object(bp::handle<>( + ao + )); + std::copy(vec.begin(), vec.end(), (T*)PyArray_DATA(ao)); + return bp::incref(arr.ptr()); + } +}; + + + +template +struct BoostPythonconverter { + typedef HW_ HW; + typedef HR_ HR; + + static inline bp::object kaldi_to_python(const Obj& o) { + return bp::object(o); + } + + static inline Obj * python_to_kaldi(bp::object o) { + return new Obj(bp::extract(o)); + } +}; + +template +class PythonToKaldiHolder { + public: + typedef bp::object T; + typedef typename Converter::HR HR; + typedef typename Converter::HW HW; + + PythonToKaldiHolder() : h_() { + } + + static bool Write(std::ostream &os, bool binary, const T &t) { + try { + auto_ptr obj(Converter::python_to_kaldi(t)); + return HW::Write(os, binary, (*obj)); + } catch (std::exception &e) { + KALDI_WARN << "Exception caught reading Table object"; + if (!kaldi::IsKaldiError(e.what())) {std::cerr << e.what();} + return false; + } + } + + bool Read(std::istream &is) { + if (!h_.Read(is)) + return false; + t_ = Converter::kaldi_to_python(h_.Value()); + return true; + } + + static bool IsReadInBinary() {return true;} + + const T &Value() const {return t_;} // if t is a pointer, would return *t_; + + void Clear() {} + + ~PythonToKaldiHolder() {} + +private: + KALDI_DISALLOW_COPY_AND_ASSIGN(PythonToKaldiHolder); + HR h_; + T t_; // t_ may alternatively be of type T*. 
+}; + +template +struct VectorHolder { + typedef PythonToKaldiHolder, + kaldi::BasicVectorHolder, kaldi::BasicVectorHolder > > type; + + static void register_converters() { + bp::to_python_converter, kaldi::VectorToListBPConverter >(); + kaldi::VectorFromListBPConverter(); + } +}; + +template +struct VectorNDArrayHolder { + typedef PythonToKaldiHolder, + kaldi::BasicVectorHolder, kaldi::BasicVectorHolder > > type; + + static void register_converters() { + bp::to_python_converter, VectorToNDArrayBPConverter >(); + kaldi::VectorFromListBPConverter(); + } +}; + +template +struct VectorVectorHolder { + typedef PythonToKaldiHolder > , + kaldi::BasicVectorVectorHolder, kaldi::BasicVectorVectorHolder > > type; + + static void register_converters() { + bp::to_python_converter >, kaldi::VectorToListBPConverter > >(); + kaldi::VectorFromListBPConverter >(); + } +}; + +template +struct PairVectorHolder { + typedef PythonToKaldiHolder > , + kaldi::BasicPairVectorHolder, kaldi::BasicPairVectorHolder > > type; + + static void register_converters() { + //register the pair first + bp::to_python_converter, + kaldi::PairToTupleBPConverter >(); + kaldi::PairFromTupleBPConverter(); + + //next register the pair vector + bp::to_python_converter >, + kaldi::VectorToListBPConverter > >(); + kaldi::VectorFromListBPConverter >(); + } +}; + +template +const T& get_self_ref(const T& t) { + return t; +} + +template +void exit(T& t, const bp::object& type, + const bp::object& value, const bp::object& traceback) { + t.Close(); +} + +template +bp::object sequential_reader_next(T& reader) { + if (!reader.IsOpen() || reader.Done()) { + PyErr_SetString(PyExc_StopIteration, "No more data."); + bp::throw_error_already_set(); + } + //if not done, extract the contents + bp::str key(reader.Key()); + bp::object val(reader.Value()); + //move the reading head, the contents will be read with the next call to next! 
+ reader.Next(); + return bp::make_tuple(key,val); +} + +template +class RandomAccessWrapper: public bp::class_ { +public: + template + inline RandomAccessWrapper(char const* name, bp::init_base const& i) + : bp::class_(name, i) { + (*this) + .def("close", &Reader::Close) + .def("is_open", &Reader::IsOpen) + .def("__contains__", &Reader::HasKey) + .def("has_key", &Reader::HasKey) + .def("__getitem__", &Reader::Value, + bp::return_value_policy()) + .def("value", &Reader::Value, + bp::return_value_policy()) + .def("__enter__", &get_self_ref, + bp::return_internal_reference<1>()) + .def("__exit__", &exit) + ; + } +}; + +template +class SequentialReaderWrapper: public bp::class_ { +public: + template + inline SequentialReaderWrapper(char const* name, bp::init_base const& i) + : bp::class_(name, i) { + (*this) + .def("close", &Reader::Close) + .def("is_open", &Reader::IsOpen) + .def("__enter__", &get_self_ref, + bp::return_internal_reference<1>()) + .def("__iter__", &get_self_ref, + bp::return_internal_reference<1>()) + .def("next", sequential_reader_next) + .def("__exit__", &exit) + .def("done", &Reader::Done) + .def("_kaldi_value", &Reader::Value, + bp::return_value_policy()) + .def("_kaldi_next", &Reader::Next) + .def("_kaldi_key", &Reader::Key) + ; + } +}; + +template +class WriterWrapper: public bp::class_ { +public: + template + inline WriterWrapper(char const* name, bp::init_base const& i) + : bp::class_(name, i) { + (*this) + .def("close", &Writer::Close) + .def("is_open", &Writer::IsOpen) + .def("flush", &Writer::Flush) + .def("write", &Writer::Write) + .def("__setitem__", &Writer::Write) + .def("__enter__", &get_self_ref, + bp::return_internal_reference<1>()) + .def("__exit__",&exit) + ; + } +}; + + +PyObject* KALDI_BASE_FLOAT() { + return (PyObject*)PyArray_DescrFromType(kaldi::get_np_type()); +} + +BOOST_PYTHON_MODULE(kaldi_io_internal) +{ + import_array(); + + bp::def("KALDI_BASE_FLOAT", &KALDI_BASE_FLOAT); + + //Python objects + RandomAccessWrapper 
>("RandomAccessPythonReader", bp::init()); + RandomAccessWrapper >("RandomAccessPythonReaderMapped", bp::init()); + SequentialReaderWrapper >("SequentialPythonReader",bp::init()); + WriterWrapper >("PythonWriter", bp::init()); + + //Matrices as NdArrays + RandomAccessWrapper > > >("RandomAccessFloat64MatrixReader", bp::init()); + RandomAccessWrapper > > >("RandomAccessFloat64MatrixMapped",bp::init()); + SequentialReaderWrapper > > >("SequentialFloat64MatrixReader",bp::init()); + WriterWrapper > > >("Float64MatrixWriter", bp::init()); + + RandomAccessWrapper > > >("RandomAccessFloat32MatrixReader", bp::init()); + RandomAccessWrapper > > >("RandomAccessFloat32MatrixMapped",bp::init()); + SequentialReaderWrapper > > >("SequentialFloat32MatrixReader",bp::init()); + WriterWrapper > > >("Float32MatrixWriter", bp::init()); + + //Vectors as NdArrays + RandomAccessWrapper > > >("RandomAccessFloat64VectorReader", bp::init()); + RandomAccessWrapper > > >("RandomAccessFloat64VectorReaderMapped",bp::init()); + SequentialReaderWrapper > > >("SequentialFloat64VectorReader",bp::init()); + WriterWrapper > > >("Float64VectorWriter", bp::init()); + + RandomAccessWrapper > > >("RandomAccessFloat32VectorReader", bp::init()); + RandomAccessWrapper > > >("RandomAccessFloat32VectorReaderMapped",bp::init()); + SequentialReaderWrapper > > >("SequentialFloat32VectorReader",bp::init()); + WriterWrapper > > >("Float32VectorWriter", bp::init()); + + //Integers + RandomAccessWrapper("RandomAccessInt32Reader", bp::init()); + SequentialReaderWrapper("SequentialInt32Reader",bp::init()); + WriterWrapper("Int32Writer", bp::init()); + + // std::vector as ndarray + VectorNDArrayHolder::register_converters(); + RandomAccessWrapper::type > >("RandomAccessInt32VectorReader", bp::init()); + SequentialReaderWrapper::type > >("SequentialInt32VectorReader",bp::init()); + WriterWrapper::type > >("Int32VectorWriter", bp::init()); + +// Vector of simple types as lists +// VectorHolder::register_converters(); +// 
RandomAccessWrapper::type > >("RandomAccessInt32VectorReader", bp::init()); +// SequentialReaderWrapper::type > >("SequentialInt32VectorReader",bp::init()); +// WriterWrapper::type > >("Int32VectorWriter", bp::init()); + + + // std::vector > + VectorVectorHolder::register_converters(); + RandomAccessWrapper::type > >("RandomAccessInt32VectorVectorReader", bp::init()); + SequentialReaderWrapper::type > >("SequentialInt32VectorVectorReader",bp::init()); + WriterWrapper::type > >("Int32VectorVectorWriter", bp::init()); + + // std::vector > + PairVectorHolder::register_converters(); + RandomAccessWrapper::type > >("RandomAccessInt32PairVectorReader", bp::init()); + SequentialReaderWrapper::type > >("SequentialInt32PairVectorReader",bp::init()); + WriterWrapper::type > >("Int32PairVectorWriter", bp::init()); + +} diff --git a/kaldi_io/python_wrappers.h b/kaldi_io/python_wrappers.h new file mode 100644 index 0000000..a1d817e --- /dev/null +++ b/kaldi_io/python_wrappers.h @@ -0,0 +1,126 @@ +/* + * python_wrappers.h + * + * Created on: Aug 28, 2014 + * Author: chorows + */ + +#ifndef PYTHON_WRAPPERS_H_ +#define PYTHON_WRAPPERS_H_ + +extern "C" { +#include "Python.h" +#include "numpy/arrayobject.h" +} + +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace kaldi { +//Helper to get proper np type +template +int get_np_type() { + //BOOST_STATIC_ASSERT_MSG(false, "Call one of the explicitly instantiated templates for float or double."); + KALDI_ERR << "Call one of the explicitly instantiated templates for float or double."; + return -1; +} + +template <> +int get_np_type() { + return NPY_DOUBLE; +} + +template <> +int get_np_type() { + return NPY_FLOAT; +} + +template <> +int get_np_type() { + return NPY_INT32; +} + +template +class NpWrapperMatrix : public kaldi::MatrixBase { + public: + NpWrapperMatrix(PyArrayObject* arr) + : kaldi::MatrixBase(), + arr_(arr) { + if (PyArray_NDIM(arr_)!=2) { + 
PyErr_SetString(PyExc_TypeError, "Can wrap only matrices (2D arrays)"); + boost::python::throw_error_already_set(); + } + if (PyArray_TYPE(arr)!=get_np_type()) { + PyErr_SetString(PyExc_TypeError, "Wrong array dtype"); + boost::python::throw_error_already_set(); + } + npy_intp* dims = PyArray_DIMS(arr_); + npy_intp* strides = PyArray_STRIDES(arr_); + if (strides[1]!=sizeof(Real)) { + PyErr_SetString(PyExc_TypeError, "Wrong array column stride"); + boost::python::throw_error_already_set(); + } + Py_INCREF(arr_); + //why do we have to use this-> in here?? + this->data_ = (Real*)PyArray_DATA(arr); + this->num_rows_ = dims[0]; + this->num_cols_ = dims[1]; + this->stride_ = strides[0]/sizeof(Real); + } + + ~NpWrapperMatrix() { + Py_DECREF(arr_); + } + + protected: + PyArrayObject* arr_; +}; + +template +class NpWrapperVector : public kaldi::VectorBase { + public: + NpWrapperVector(PyArrayObject* arr) + : kaldi::VectorBase(), + arr_(arr) { + if (PyArray_NDIM(arr_)!=1) { + PyErr_SetString(PyExc_TypeError, "Can wrap only vectors (1D arrays)"); + boost::python::throw_error_already_set(); + } + if (PyArray_TYPE(arr)!=get_np_type()) { + PyErr_SetString(PyExc_TypeError, "Wrong array dtype"); + boost::python::throw_error_already_set(); + } + npy_intp* dims = PyArray_DIMS(arr_); + npy_intp* strides = PyArray_STRIDES(arr_); + if (strides[0]!=sizeof(Real)) { + PyErr_SetString(PyExc_TypeError, "Wrong array column stride"); + boost::python::throw_error_already_set(); + } + Py_INCREF(arr_); + //why do we have to use this-> in here?? + this->data_ = (Real*)PyArray_DATA(arr); + this->dim_ = dims[0]; + } + + ~NpWrapperVector() { + Py_DECREF(arr_); + } + + protected: + PyArrayObject* arr_; +}; + +} //namespace kaldi + + +#endif /* PYTHON_WRAPPERS_H_ */