
* scribo/toolchain/nepomuk/text_extraction.hh, * scribo/toolchain/text_in_doc.hh, * src/content_in_doc.cc: Here. --- scribo/ChangeLog | 8 ++++ scribo/scribo/toolchain/nepomuk/text_extraction.hh | 20 +++++++---- scribo/scribo/toolchain/text_in_doc.hh | 3 ++ scribo/src/content_in_doc.cc | 36 ++++++++++++-------- 4 files changed, 46 insertions(+), 21 deletions(-) diff --git a/scribo/ChangeLog b/scribo/ChangeLog index 360f65d..3ec57f0 100644 --- a/scribo/ChangeLog +++ b/scribo/ChangeLog @@ -1,5 +1,13 @@ 2011-01-25 Guillaume Lazzara <z@lrde.epita.fr> + Add an option to choose the recognized language. + + * scribo/toolchain/nepomuk/text_extraction.hh, + * scribo/toolchain/text_in_doc.hh, + * src/content_in_doc.cc: Here. + +2011-01-25 Guillaume Lazzara <z@lrde.epita.fr> + * scribo/text/recognition.hh: Make use of TessBaseAPI::setImage. 2011-01-25 Guillaume Lazzara <z@lrde.epita.fr> diff --git a/scribo/scribo/toolchain/nepomuk/text_extraction.hh b/scribo/scribo/toolchain/nepomuk/text_extraction.hh index effb13f..6def090 100644 --- a/scribo/scribo/toolchain/nepomuk/text_extraction.hh +++ b/scribo/scribo/toolchain/nepomuk/text_extraction.hh @@ -24,8 +24,8 @@ // executable file might be covered by the GNU General Public License. -#ifndef SCRIBO_TOOLCHAIN_TEXT_EXTRACTION_HH -# define SCRIBO_TOOLCHAIN_TEXT_EXTRACTION_HH +#ifndef SCRIBO_TOOLCHAIN_NEPOMUK_TEXT_EXTRACTION_HH +# define SCRIBO_TOOLCHAIN_NEPOMUK_TEXT_EXTRACTION_HH /// \file /// @@ -77,13 +77,13 @@ namespace scribo */ QSet<QString> - text_extraction(const QImage& input); + text_extraction(const QImage& input, const QString& language); # ifndef MLN_INCLUDE_ONLY QSet<QString> - text_extraction(const QImage& input) + text_extraction(const QImage& input, const QString& language = QString("eng")) { trace::entering("scribo::toolchain::nepomuk::text_extraction"); @@ -119,13 +119,19 @@ namespace scribo // Process { // Run document toolchain. - lines_bg = scribo::toolchain::text_in_doc(input_bin, true, false); + lines_bg = scribo::toolchain::text_in_doc(input_bin, + true, + language.toUtf8().data(), + false); // Negate document. logical::not_inplace(input_bin); // Run document toolchain. - lines_fg = scribo::toolchain::text_in_doc(input_bin, true, false); + lines_fg = scribo::toolchain::text_in_doc(input_bin, + true, + language.toUtf8().data(), + false); } @@ -165,4 +171,4 @@ namespace scribo } // end of namespace scribo -#endif // ! SCRIBO_TOOLCHAIN_TEXT_EXTRACTION_HH +#endif // ! SCRIBO_TOOLCHAIN_NEPOMUK_TEXT_EXTRACTION_HH diff --git a/scribo/scribo/toolchain/text_in_doc.hh b/scribo/scribo/toolchain/text_in_doc.hh index 0ad6cf3..e6ba69e 100644 --- a/scribo/scribo/toolchain/text_in_doc.hh +++ b/scribo/scribo/toolchain/text_in_doc.hh @@ -45,6 +45,7 @@ namespace scribo template <typename I> line_set<mln_ch_value(I, def::lbl_type)> text_in_doc(const Image<I>& input, bool denoise, + const std::string& language = std::string("eng"), bool find_line_seps = true, bool find_whitespace_seps = true, bool debug = false); @@ -56,6 +57,7 @@ namespace scribo template <typename I> line_set<mln_ch_value(I, def::lbl_type)> text_in_doc(const Image<I>& input, bool denoise, + const std::string& language = std::string("eng"), bool find_line_seps = true, bool find_whitespace_seps = true, bool debug = false) @@ -65,6 +67,7 @@ namespace scribo f.enable_line_seps = find_line_seps; f.enable_whitespace_seps = find_whitespace_seps; f.enable_debug = debug; + f.ocr_language = language; line_set<mln_ch_value(I, def::lbl_type)> lines = f(input); diff --git a/scribo/src/content_in_doc.cc b/scribo/src/content_in_doc.cc index 2c31d90..f453f08 100644 --- a/scribo/src/content_in_doc.cc +++ b/scribo/src/content_in_doc.cc @@ -1,4 +1,5 @@ -// Copyright (C) 2010 EPITA Research and Development Laboratory (LRDE) +// Copyright (C) 2010, 2011 EPITA Research and Development Laboratory +// (LRDE) // // This file is part of Olena. // @@ -72,6 +73,7 @@ const char *args_desc[][2] = { "pmin_col", "Col index of the top left corner of the Region of interest." }, { "pmax_row", "Row index of the bottom right corner of the Region of interest." }, { "pmax_col", "Col index of the bottom right corner of the Region of interest." }, + { "language", "Language to be used for the text recognition. [eng|fra] (Default: eng)" }, { "find_lines", "Find vertical lines. (Default 1)" }, { "find_whitespaces", "Find whitespaces separators. (Default 1)" }, { "K", "Sauvola's binarization threshold parameter. (Default: 0.34)" }, @@ -86,16 +88,16 @@ int main(int argc, char* argv[]) using namespace scribo; using namespace mln; - if (argc < 3 || (argc > 8 && argc != 12)) + if (argc < 3 || argc > 14) return scribo::debug::usage(argv, "Find text lines and elements in a document", - "input.* out.xml <denoise_enabled> [<pmin_row> <pmin_col> <pmax_row> <pmax_col>] <find_lines> <find_whitespaces> <K> <debug_dir>", + "input.* out.xml <denoise_enabled> [<pmin_row> <pmin_col> <pmax_row> <pmax_col>] [language] [find_lines] [find_whitespaces] [K] [debug_dir]", args_desc); bool debug = false; // Enable debug output. - if (argc == 8 || argc == 12) + if (argc == 9 || argc == 13) { scribo::make::internal::debug_filename_prefix = argv[argc - 1]; debug = true; @@ -113,10 +115,10 @@ int main(int argc, char* argv[]) image2d<bool> input; { double K = 0.34; - if (argc == 7 || argc == 8 || argc == 11) + if (argc == 8 || argc == 12 || argc >= 12) { - if (argc == 7) - K = atof(argv[6]); + if (argc == 8) + K = atof(argv[7]); else K = atof(argv[argc - 2]); std::cout << "Using K = " << K << std::endl; @@ -150,16 +152,21 @@ int main(int argc, char* argv[]) bool denoise = (argc > 3 && atoi(argv[3]) != 0); + std::string language = "eng"; + if (argc >= 5 && argc < 13) + language = argv[4]; + bool find_line_seps = true; - if (argc >= 5 && argc < 12) - find_line_seps = (atoi(argv[4]) != 0); + if (argc >= 6 && argc < 13) + find_line_seps = (atoi(argv[5]) != 0); bool find_whitespace_seps = true; - if (argc >= 6 && argc < 12) - find_line_seps = (atoi(argv[5]) != 0); + if (argc >= 7 && argc < 13) + find_line_seps = (atoi(argv[6]) != 0); std::cout << "Running with the following options :" - << "find_lines_seps = " << find_line_seps + << " ocr_language = " << language + << " | find_lines_seps = " << find_line_seps << " | find_whitespace_seps = " << find_whitespace_seps << " | debug = " << debug << std::endl; @@ -169,8 +176,9 @@ int main(int argc, char* argv[]) // Text std::cout << "Extracting text" << std::endl; line_set<L> - lines = scribo::toolchain::text_in_doc(input, denoise, find_line_seps, - find_whitespace_seps, debug); + lines = scribo::toolchain::text_in_doc(input, denoise, language, + find_line_seps, find_whitespace_seps, + debug); doc.set_text(lines); // Elements -- 1.5.6.5