* core/line_info.hh: Add a new attribute 'text'.
* text/recognition.hh: Store results in the line_info structure.
---
scribo/ChangeLog | 8 ++++++
scribo/core/line_info.hh | 28 ++++++++++++++++++++++-
scribo/text/recognition.hh | 53 ++++++++++++-------------------------------
3 files changed, 50 insertions(+), 39 deletions(-)
diff --git a/scribo/ChangeLog b/scribo/ChangeLog
index 76decb5..4ce573a 100644
--- a/scribo/ChangeLog
+++ b/scribo/ChangeLog
@@ -1,5 +1,13 @@
2010-05-25 Guillaume Lazzara <z(a)lrde.epita.fr>
+ Store OCR results in the line_info structure.
+
+ * core/line_info.hh: Add a new attribute 'text'.
+
+ * text/recognition.hh: Store results in the line_info structure.
+
+2010-05-25 Guillaume Lazzara <z(a)lrde.epita.fr>
+
Improve debug outputs in Sauvola and make binarization tools
compile again.
diff --git a/scribo/core/line_info.hh b/scribo/core/line_info.hh
index 9ecf600..d0066c0 100644
--- a/scribo/core/line_info.hh
+++ b/scribo/core/line_info.hh
@@ -40,6 +40,7 @@
# include <mln/accu/stat/median_h.hh>
# include <mln/accu/shape/bbox.hh>
# include <mln/util/object_id.hh>
+# include <mln/value/int_u.hh>
# include <scribo/core/tag/component.hh>
# include <scribo/core/tag/line.hh>
@@ -124,6 +125,9 @@ namespace scribo
bool indented() const;
+ const std::string& text() const;
+ void update_text(const std::string& str);
+
bool is_valid() const;
@@ -232,6 +236,8 @@ namespace scribo
bool indented_;
+ std::string text_;
+
// Line set holding this element.
line_set<L> holder_;
@@ -295,6 +301,8 @@ namespace scribo
indented_ = other.indented();
+ text_ = other.text();
+
holder_ = other.holder();
}
@@ -578,6 +586,21 @@ namespace scribo
return indented_;
}
+ template <typename L>
+ const std::string&
+ line_info<L>::text() const
+ {
+ return text_;
+ }
+
+
+ template <typename L>
+ void
+ line_info<L>::update_text(const std::string& str)
+ {
+ text_ = str;
+ }
+
template <typename L>
bool
@@ -833,7 +856,9 @@ namespace scribo
else
char_width_ = char_width.to_result();
-
+ // FIXME: There is a bug here when the input document is too
+ // large. The baseline indexes are too high for the type used
+ // in the median accumulator!
baseline_ = absolute_baseline.to_result();
meanline_ = absolute_meanline.to_result();
x_height_ = absolute_baseline - absolute_meanline + 1;
@@ -890,6 +915,7 @@ namespace scribo
<< ", orientation=" << info.orientation()
<< ", reading_orientation=" << info.reading_orientation()
<< ", indented=" << info.indented()
+ << ", text=" << info.text()
<< ")" << std::endl;
}
diff --git a/scribo/text/recognition.hh b/scribo/text/recognition.hh
index fcc83dd..f8d8f4f 100644
--- a/scribo/text/recognition.hh
+++ b/scribo/text/recognition.hh
@@ -81,9 +81,7 @@ namespace scribo
//
template <typename L>
void
- recognition(const line_set<L>& lines,
- const char *language,
- const char *output_file = 0);
+ recognition(line_set<L>& lines, const char *language);
/// Recognize text from an image.
@@ -91,7 +89,7 @@ namespace scribo
void
recognition(const Image<I>& line,
const char *language,
- const char *output_file = 0);
+ const std::string& output_file = std::string());
@@ -102,9 +100,7 @@ namespace scribo
template <typename L>
void
- recognition(const line_set<L>& lines,
- const char *language,
- const char *output_file = 0)
+ recognition(line_set<L>& lines, const char *language)
{
trace::entering("scribo::text::recognition");
@@ -121,9 +117,6 @@ namespace scribo
0, 9, 0, 9, 0 };
w_window2d_int dmap_win = mln::make::w_window2d_int(vals);
- std::ofstream file;
- if (output_file != 0)
- file.open(output_file);
/// Use text bboxes with Tesseract
for_all_lines(i, lines)
@@ -141,7 +134,7 @@ namespace scribo
box.enlarge(2);
I text_ima(box);
- data::fill(text_ima, true);
+ data::fill(text_ima, false);
// Careful : background is set to 'False'
const component_set<L>& comp_set = lines.components();
@@ -152,7 +145,7 @@ namespace scribo
{
unsigned comp_id = comps(e);
data::fill(((text_ima | comp_set(comp_id).bbox()).rw() | (pw::value(lbl) ==
pw::cst(comp_id))).rw(),
- false);
+ true);
}
/// Improve text quality.
@@ -161,8 +154,6 @@ namespace scribo
I text_ima_cleaned = text::clean(lines(i), text_ima);
// mln::io::pbm::save(text_ima_cleaned, mln::debug::filename("line.pbm",
debug_id++));
- // Setting objects to 'True'
- logical::not_inplace(text_ima_cleaned);
// Make sure there is no border.
border::resize(text_ima_cleaned, 0);
@@ -180,30 +171,16 @@ namespace scribo
if (s != 0)
{
- std::cerr << s << std::endl;
- if (output_file != 0)
- {
- std::string str(s);
- str = str.substr(0, str.length() - 1);
- file << lines(i).bbox().pmin().row()
- << " "
- << lines(i).bbox().pmin().col()
- << " "
- << lines(i).bbox().pmax().row()
- << " "
- << lines(i).bbox().pmax().col()
- << " "
- << str;
- }
+ std::cerr << s << std::endl;
+ std::string str(s);
+ str = str.substr(0, str.length() - 2);
+ lines(i).update_text(str);
}
- // The string has been allocated by Tesseract. We must free it.
+ // The string has been allocated by Tesseract. It must be released.
free(s);
}
- if (output_file != 0)
- file.close();
-
trace::exiting("scribo::text::recognition");
}
@@ -212,7 +189,7 @@ namespace scribo
void
recognition(const Image<I>& line_,
const char *language,
- const char *output_file = 0)
+ const std::string& output_file = std::string())
{
trace::entering("scribo::text::recognition");
@@ -223,8 +200,8 @@ namespace scribo
TessBaseAPI::InitWithLanguage(NULL, NULL, language, NULL, false, 0, NULL);
std::ofstream file;
- if (output_file != 0)
- file.open(output_file);
+ if (!output_file.empty())
+ file.open(output_file.c_str());
mln_domain(I) box = line.domain();
// Make sure characters are isolated from the borders.
@@ -252,7 +229,7 @@ namespace scribo
if (s != 0)
{
std::cout << s << std::endl;
- if (output_file != 0)
+ if (!output_file.empty())
{
std::string str(s);
str = str.substr(0, str.length() - 1);
@@ -271,7 +248,7 @@ namespace scribo
// The string has been allocated by Tesseract. We must free it.
free(s);
- if (output_file != 0)
+ if (!output_file.empty())
file.close();
trace::exiting("scribo::text::recognition");
--
1.5.6.5