Skip to content

Added support for Python 3 #51

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions train.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,21 +14,21 @@
TESSERACT_TRAINDIR= TESSERACT_DIR + '/training'


country = raw_input("Two-Letter Country Code to Train: ").lower()
country = input("Two-Letter Country Code to Train: ").lower()

LANGUAGE_NAME='l' + country

box_files = glob.glob('./' + country + '/input/*.box')
if not box_files:
print "Cannot find input files"
print( "Cannot find input files")
sys.exit(1)

os.system("rm ./tmp/*")

font_properties_file = open('./tmp/font_properties','w')

for box_file in box_files:
print "Processing: " + box_file
print( "Processing: " + box_file)

file_without_dir = os.path.split(box_file)[1]
file_without_ext = os.path.splitext(file_without_dir)[0]
Expand All @@ -37,7 +37,7 @@
tif_file = input_dir + '/' + file_without_ext + ".tif"

train_cmd = "%s -l eng %s %s nobatch box.train.stderr" % (TESSERACT_BIN, tif_file, file_without_ext)
print "Executing: " + train_cmd
print( "Executing: " + train_cmd )
os.system(train_cmd)
os.system("mv ./" + file_without_ext + ".tr ./tmp/" + file_without_ext + ".tr")
os.system("mv ./" + file_without_ext + ".txt ./tmp/" + file_without_ext + ".txt")
Expand All @@ -52,12 +52,12 @@

# Shape clustering should currently only be used for the "Indic" languages
#train_cmd = TESSERACT_TRAINDIR + '/shapeclustering -F ./' + country + '/input/font_properties -U unicharset ./' + country + '/input/*.tr'
#print "Executing: " + train_cmd
#print( "Executing: " + train_cmd)
#os.system(train_cmd)


train_cmd = TESSERACT_TRAINDIR + '/mftraining -F ./tmp/font_properties -U unicharset -O ./tmp/' + LANGUAGE_NAME + '.unicharset ./tmp/*.tr'
print "Executing: " + train_cmd
print( "Executing: " + train_cmd)
os.system(train_cmd)
os.system("rm ./unicharset")
os.system("mv ./tmp/" + LANGUAGE_NAME + ".unicharset ./")
Expand All @@ -79,7 +79,7 @@
# If a config file is in the country's directory, use that.
config_file = os.path.join('./', country, country + '.config')
if os.path.isfile(config_file):
print "Applying config file: " + config_file
print( "Applying config file: " + config_file)
trainedata_file = LANGUAGE_NAME + '.traineddata'
os.system(TESSERACT_TRAINDIR + '/combine_tessdata -o ' + trainedata_file + ' ' + config_file )

Expand Down