* scribo/toolchain/nepomuk/text_extraction.hh,
* scribo/toolchain/text_in_doc.hh,
* src/content_in_doc.cc: Here.
---
scribo/ChangeLog | 8 ++++
scribo/scribo/toolchain/nepomuk/text_extraction.hh | 20 +++++++----
scribo/scribo/toolchain/text_in_doc.hh | 3 ++
scribo/src/content_in_doc.cc | 36 ++++++++++++--------
4 files changed, 46 insertions(+), 21 deletions(-)
diff --git a/scribo/ChangeLog b/scribo/ChangeLog
index 360f65d..3ec57f0 100644
--- a/scribo/ChangeLog
+++ b/scribo/ChangeLog
@@ -1,5 +1,13 @@
2011-01-25 Guillaume Lazzara <z(a)lrde.epita.fr>
+ Add an option to choose the recognized language.
+
+ * scribo/toolchain/nepomuk/text_extraction.hh,
+ * scribo/toolchain/text_in_doc.hh,
+ * src/content_in_doc.cc: Here.
+
+2011-01-25 Guillaume Lazzara <z(a)lrde.epita.fr>
+
* scribo/text/recognition.hh: Make use of TessBaseAPI::setImage.
2011-01-25 Guillaume Lazzara <z(a)lrde.epita.fr>
diff --git a/scribo/scribo/toolchain/nepomuk/text_extraction.hh b/scribo/scribo/toolchain/nepomuk/text_extraction.hh
index effb13f..6def090 100644
--- a/scribo/scribo/toolchain/nepomuk/text_extraction.hh
+++ b/scribo/scribo/toolchain/nepomuk/text_extraction.hh
@@ -24,8 +24,8 @@
// executable file might be covered by the GNU General Public License.
-#ifndef SCRIBO_TOOLCHAIN_TEXT_EXTRACTION_HH
-# define SCRIBO_TOOLCHAIN_TEXT_EXTRACTION_HH
+#ifndef SCRIBO_TOOLCHAIN_NEPOMUK_TEXT_EXTRACTION_HH
+# define SCRIBO_TOOLCHAIN_NEPOMUK_TEXT_EXTRACTION_HH
/// \file
///
@@ -77,13 +77,13 @@ namespace scribo
*/
QSet<QString>
- text_extraction(const QImage& input);
+ text_extraction(const QImage& input, const QString& language);
# ifndef MLN_INCLUDE_ONLY
QSet<QString>
- text_extraction(const QImage& input)
+ text_extraction(const QImage& input, const QString& language =
QString("eng"))
{
trace::entering("scribo::toolchain::nepomuk::text_extraction");
@@ -119,13 +119,19 @@ namespace scribo
// Process
{
// Run document toolchain.
- lines_bg = scribo::toolchain::text_in_doc(input_bin, true, false);
+ lines_bg = scribo::toolchain::text_in_doc(input_bin,
+ true,
+ language.toUtf8().data(),
+ false);
// Negate document.
logical::not_inplace(input_bin);
// Run document toolchain.
- lines_fg = scribo::toolchain::text_in_doc(input_bin, true, false);
+ lines_fg = scribo::toolchain::text_in_doc(input_bin,
+ true,
+ language.toUtf8().data(),
+ false);
}
@@ -165,4 +171,4 @@ namespace scribo
} // end of namespace scribo
-#endif // ! SCRIBO_TOOLCHAIN_TEXT_EXTRACTION_HH
+#endif // ! SCRIBO_TOOLCHAIN_NEPOMUK_TEXT_EXTRACTION_HH
diff --git a/scribo/scribo/toolchain/text_in_doc.hh b/scribo/scribo/toolchain/text_in_doc.hh
index 0ad6cf3..e6ba69e 100644
--- a/scribo/scribo/toolchain/text_in_doc.hh
+++ b/scribo/scribo/toolchain/text_in_doc.hh
@@ -45,6 +45,7 @@ namespace scribo
template <typename I>
line_set<mln_ch_value(I, def::lbl_type)>
text_in_doc(const Image<I>& input, bool denoise,
+ const std::string& language = std::string("eng"),
bool find_line_seps = true,
bool find_whitespace_seps = true,
bool debug = false);
@@ -56,6 +57,7 @@ namespace scribo
template <typename I>
line_set<mln_ch_value(I, def::lbl_type)>
text_in_doc(const Image<I>& input, bool denoise,
+ const std::string& language = std::string("eng"),
bool find_line_seps = true,
bool find_whitespace_seps = true,
bool debug = false)
@@ -65,6 +67,7 @@ namespace scribo
f.enable_line_seps = find_line_seps;
f.enable_whitespace_seps = find_whitespace_seps;
f.enable_debug = debug;
+ f.ocr_language = language;
line_set<mln_ch_value(I, def::lbl_type)> lines = f(input);
diff --git a/scribo/src/content_in_doc.cc b/scribo/src/content_in_doc.cc
index 2c31d90..f453f08 100644
--- a/scribo/src/content_in_doc.cc
+++ b/scribo/src/content_in_doc.cc
@@ -1,4 +1,5 @@
-// Copyright (C) 2010 EPITA Research and Development Laboratory (LRDE)
+// Copyright (C) 2010, 2011 EPITA Research and Development Laboratory
+// (LRDE)
//
// This file is part of Olena.
//
@@ -72,6 +73,7 @@ const char *args_desc[][2] =
{ "pmin_col", "Col index of the top left corner of the Region of
interest." },
{ "pmax_row", "Row index of the bottom right corner of the Region of
interest." },
{ "pmax_col", "Col index of the bottom right corner of the Region of
interest." },
+ { "language", "Language to be used for the text recognition. [eng|fra]
(Default: eng)" },
{ "find_lines", "Find vertical lines. (Default 1)" },
{ "find_whitespaces", "Find whitespaces separators. (Default 1)"
},
{ "K", "Sauvola's binarization threshold parameter. (Default:
0.34)" },
@@ -86,16 +88,16 @@ int main(int argc, char* argv[])
using namespace scribo;
using namespace mln;
- if (argc < 3 || (argc > 8 && argc != 12))
+ if (argc < 3 || argc > 14)
return scribo::debug::usage(argv,
"Find text lines and elements in a document",
- "input.* out.xml <denoise_enabled> [<pmin_row> <pmin_col>
<pmax_row> <pmax_col>] <find_lines> <find_whitespaces> <K>
<debug_dir>",
+ "input.* out.xml <denoise_enabled> [<pmin_row> <pmin_col>
<pmax_row> <pmax_col>] [language] [find_lines] [find_whitespaces] [K]
[debug_dir]",
args_desc);
bool debug = false;
// Enable debug output.
- if (argc == 8 || argc == 12)
+ if (argc == 9 || argc == 13)
{
scribo::make::internal::debug_filename_prefix = argv[argc - 1];
debug = true;
@@ -113,10 +115,10 @@ int main(int argc, char* argv[])
image2d<bool> input;
{
double K = 0.34;
- if (argc == 7 || argc == 8 || argc == 11)
+ if (argc == 8 || argc == 12 || argc >= 12)
{
- if (argc == 7)
- K = atof(argv[6]);
+ if (argc == 8)
+ K = atof(argv[7]);
else
K = atof(argv[argc - 2]);
std::cout << "Using K = " << K << std::endl;
@@ -150,16 +152,21 @@ int main(int argc, char* argv[])
bool denoise = (argc > 3 && atoi(argv[3]) != 0);
+ std::string language = "eng";
+ if (argc >= 5 && argc < 13)
+ language = argv[4];
+
bool find_line_seps = true;
- if (argc >= 5 && argc < 12)
- find_line_seps = (atoi(argv[4]) != 0);
+ if (argc >= 6 && argc < 13)
+ find_line_seps = (atoi(argv[5]) != 0);
bool find_whitespace_seps = true;
- if (argc >= 6 && argc < 12)
- find_line_seps = (atoi(argv[5]) != 0);
+ if (argc >= 7 && argc < 13)
+ find_line_seps = (atoi(argv[6]) != 0);
std::cout << "Running with the following options :"
- << "find_lines_seps = " << find_line_seps
+ << " ocr_language = " << language
+ << " | find_lines_seps = " << find_line_seps
<< " | find_whitespace_seps = " << find_whitespace_seps
<< " | debug = " << debug
<< std::endl;
@@ -169,8 +176,9 @@ int main(int argc, char* argv[])
// Text
std::cout << "Extracting text" << std::endl;
line_set<L>
- lines = scribo::toolchain::text_in_doc(input, denoise, find_line_seps,
- find_whitespace_seps, debug);
+ lines = scribo::toolchain::text_in_doc(input, denoise, language,
+ find_line_seps, find_whitespace_seps,
+ debug);
doc.set_text(lines);
// Elements
--
1.5.6.5