[Olena-patches] last-svn-commit-744-g6ba320b Add an option to choose the recognized language.

25 Jan 2011

	* scribo/toolchain/nepomuk/text_extraction.hh,
	* scribo/toolchain/text_in_doc.hh,
	* src/content_in_doc.cc: Here.
---
 scribo/ChangeLog                                   |    8 ++++
 scribo/scribo/toolchain/nepomuk/text_extraction.hh |   20 +++++++----
 scribo/scribo/toolchain/text_in_doc.hh             |    3 ++
 scribo/src/content_in_doc.cc                       |   36 ++++++++++++--------
 4 files changed, 46 insertions(+), 21 deletions(-)

diff --git a/scribo/ChangeLog b/scribo/ChangeLog
index 360f65d..3ec57f0 100644
--- a/scribo/ChangeLog
+++ b/scribo/ChangeLog
@@ -1,5 +1,13 @@
 2011-01-25  Guillaume Lazzara  <z@lrde.epita.fr>
 
+	Add an option to choose the recognized language.
+
+	* scribo/toolchain/nepomuk/text_extraction.hh,
+	* scribo/toolchain/text_in_doc.hh,
+	* src/content_in_doc.cc: Here.
+
+2011-01-25  Guillaume Lazzara  <z@lrde.epita.fr>
+
 	* scribo/text/recognition.hh: Make use of TessBaseAPI::setImage.
 
 2011-01-25  Guillaume Lazzara  <z@lrde.epita.fr>
diff --git a/scribo/scribo/toolchain/nepomuk/text_extraction.hh b/scribo/scribo/toolchain/nepomuk/text_extraction.hh
index effb13f..6def090 100644
--- a/scribo/scribo/toolchain/nepomuk/text_extraction.hh
+++ b/scribo/scribo/toolchain/nepomuk/text_extraction.hh
@@ -24,8 +24,8 @@
 // executable file might be covered by the GNU General Public License.
 
 
-#ifndef SCRIBO_TOOLCHAIN_TEXT_EXTRACTION_HH
-# define SCRIBO_TOOLCHAIN_TEXT_EXTRACTION_HH
+#ifndef SCRIBO_TOOLCHAIN_NEPOMUK_TEXT_EXTRACTION_HH
+# define SCRIBO_TOOLCHAIN_NEPOMUK_TEXT_EXTRACTION_HH
 
 /// \file
 ///
@@ -77,13 +77,13 @@ namespace scribo
 
        */
       QSet<QString>
-      text_extraction(const QImage& input);
+      text_extraction(const QImage& input, const QString& language);
 
 
 # ifndef MLN_INCLUDE_ONLY
 
       QSet<QString>
-      text_extraction(const QImage& input)
+      text_extraction(const QImage& input, const QString& language = QString("eng"))
       {
 	trace::entering("scribo::toolchain::nepomuk::text_extraction");
 
@@ -119,13 +119,19 @@ namespace scribo
 	// Process
 	{
 	  // Run document toolchain.
-	  lines_bg = scribo::toolchain::text_in_doc(input_bin, true, false);
+	  lines_bg = scribo::toolchain::text_in_doc(input_bin,
+						    true,
+						    language.toUtf8().data(),
+						    false);
 
 	  // Negate document.
 	  logical::not_inplace(input_bin);
 
 	  // Run document toolchain.
-	  lines_fg = scribo::toolchain::text_in_doc(input_bin, true, false);
+	  lines_fg = scribo::toolchain::text_in_doc(input_bin,
+						    true,
+						    language.toUtf8().data(),
+						    false);
 	}
 
 
@@ -165,4 +171,4 @@ namespace scribo
 } // end of namespace scribo
 
 
-#endif // ! SCRIBO_TOOLCHAIN_TEXT_EXTRACTION_HH
+#endif // ! SCRIBO_TOOLCHAIN_NEPOMUK_TEXT_EXTRACTION_HH
diff --git a/scribo/scribo/toolchain/text_in_doc.hh b/scribo/scribo/toolchain/text_in_doc.hh
index 0ad6cf3..e6ba69e 100644
--- a/scribo/scribo/toolchain/text_in_doc.hh
+++ b/scribo/scribo/toolchain/text_in_doc.hh
@@ -45,6 +45,7 @@ namespace scribo
     template <typename I>
     line_set<mln_ch_value(I, def::lbl_type)>
     text_in_doc(const Image<I>& input, bool denoise,
+		const std::string& language = std::string("eng"),
 		bool find_line_seps = true,
 		bool find_whitespace_seps = true,
 		bool debug = false);
@@ -56,6 +57,7 @@ namespace scribo
     template <typename I>
     line_set<mln_ch_value(I, def::lbl_type)>
     text_in_doc(const Image<I>& input, bool denoise,
+		const std::string& language = std::string("eng"),
 		bool find_line_seps = true,
 		bool find_whitespace_seps = true,
 		bool debug = false)
@@ -65,6 +67,7 @@ namespace scribo
       f.enable_line_seps = find_line_seps;
       f.enable_whitespace_seps = find_whitespace_seps;
       f.enable_debug = debug;
+      f.ocr_language = language;
 
       line_set<mln_ch_value(I, def::lbl_type)> lines = f(input);
 
diff --git a/scribo/src/content_in_doc.cc b/scribo/src/content_in_doc.cc
index 2c31d90..f453f08 100644
--- a/scribo/src/content_in_doc.cc
+++ b/scribo/src/content_in_doc.cc
@@ -1,4 +1,5 @@
-// Copyright (C) 2010 EPITA Research and Development Laboratory (LRDE)
+// Copyright (C) 2010, 2011 EPITA Research and Development Laboratory
+// (LRDE)
 //
 // This file is part of Olena.
 //
@@ -72,6 +73,7 @@ const char *args_desc[][2] =
   { "pmin_col", "Col index of the top left corner of the Region of interest." },
   { "pmax_row", "Row index of the bottom right corner of the Region of interest." },
   { "pmax_col", "Col index of the bottom right corner of the Region of interest." },
+  { "language", "Language to be used for the text recognition. [eng|fra] (Default: eng)" },
   { "find_lines", "Find vertical lines. (Default 1)" },
   { "find_whitespaces", "Find whitespaces separators. (Default 1)" },
   { "K", "Sauvola's binarization threshold parameter. (Default: 0.34)" },
@@ -86,16 +88,16 @@ int main(int argc, char* argv[])
   using namespace scribo;
   using namespace mln;
 
-  if (argc < 3 || (argc > 8 &&  argc != 12))
+  if (argc < 3 || argc > 14)
     return scribo::debug::usage(argv,
 				"Find text lines and elements in a document",
-				"input.* out.xml <denoise_enabled> [<pmin_row> <pmin_col> <pmax_row> <pmax_col>] <find_lines> <find_whitespaces> <K> <debug_dir>",
+				"input.* out.xml <denoise_enabled> [<pmin_row> <pmin_col> <pmax_row> <pmax_col>] [language] [find_lines] [find_whitespaces] [K] [debug_dir]",
 				args_desc);
 
   bool debug = false;
 
   // Enable debug output.
-  if (argc == 8 || argc == 12)
+  if (argc == 9 || argc == 13)
   {
     scribo::make::internal::debug_filename_prefix = argv[argc - 1];
     debug = true;
@@ -113,10 +115,10 @@ int main(int argc, char* argv[])
   image2d<bool> input;
   {
     double K = 0.34;
-    if (argc == 7  || argc == 8 || argc == 11)
+    if (argc == 8  || argc == 12 || argc >= 12)
     {
-      if (argc == 7)
-	K = atof(argv[6]);
+      if (argc == 8)
+	K = atof(argv[7]);
       else
 	K = atof(argv[argc - 2]);
       std::cout << "Using K = " << K << std::endl;
@@ -150,16 +152,21 @@ int main(int argc, char* argv[])
 
   bool denoise = (argc > 3 && atoi(argv[3]) != 0);
 
+  std::string language = "eng";
+  if (argc >= 5 && argc < 13)
+    language = argv[4];
+
   bool find_line_seps = true;
-  if (argc >= 5 && argc < 12)
-    find_line_seps = (atoi(argv[4]) != 0);
+  if (argc >= 6 && argc < 13)
+    find_line_seps = (atoi(argv[5]) != 0);
 
   bool find_whitespace_seps = true;
-  if (argc >= 6 && argc < 12)
-    find_line_seps = (atoi(argv[5]) != 0);
+  if (argc >= 7 && argc < 13)
+    find_line_seps = (atoi(argv[6]) != 0);
 
   std::cout << "Running with the following options :"
-	    << "find_lines_seps = " << find_line_seps
+	    << " ocr_language = " << language
+	    << " | find_lines_seps = " << find_line_seps
 	    << " | find_whitespace_seps = " << find_whitespace_seps
 	    << " | debug = " << debug
 	    << std::endl;
@@ -169,8 +176,9 @@ int main(int argc, char* argv[])
   // Text
   std::cout << "Extracting text" << std::endl;
   line_set<L>
-    lines = scribo::toolchain::text_in_doc(input, denoise, find_line_seps,
-					   find_whitespace_seps, debug);
+    lines = scribo::toolchain::text_in_doc(input, denoise, language,
+					   find_line_seps, find_whitespace_seps,
+					   debug);
   doc.set_text(lines);
 
   // Elements
-- 
1.5.6.5