last-svn-commit-126-gca67a57 Store OCR results in the line_info structure.

* core/line_info.hh: Add a new attribute 'text'. * text/recognition.hh: Store results in the line_info structure. --- scribo/ChangeLog | 8 ++++++ scribo/core/line_info.hh | 28 ++++++++++++++++++++++- scribo/text/recognition.hh | 53 ++++++++++++------------------------------- 3 files changed, 50 insertions(+), 39 deletions(-) diff --git a/scribo/ChangeLog b/scribo/ChangeLog index 76decb5..4ce573a 100644 --- a/scribo/ChangeLog +++ b/scribo/ChangeLog @@ -1,5 +1,13 @@ 2010-05-25 Guillaume Lazzara <z@lrde.epita.fr> + Store OCR results in the line_info structure. + + * core/line_info.hh: Add a new attribute 'text'. + + * text/recognition.hh: Store results in the line_info structure. + +2010-05-25 Guillaume Lazzara <z@lrde.epita.fr> + Improve debug outputs in Sauvola and make binarization tools compile again. diff --git a/scribo/core/line_info.hh b/scribo/core/line_info.hh index 9ecf600..d0066c0 100644 --- a/scribo/core/line_info.hh +++ b/scribo/core/line_info.hh @@ -40,6 +40,7 @@ # include <mln/accu/stat/median_h.hh> # include <mln/accu/shape/bbox.hh> # include <mln/util/object_id.hh> +# include <mln/value/int_u.hh> # include <scribo/core/tag/component.hh> # include <scribo/core/tag/line.hh> @@ -124,6 +125,9 @@ namespace scribo bool indented() const; + const std::string& text() const; + void update_text(const std::string& str); + bool is_valid() const; @@ -232,6 +236,8 @@ namespace scribo bool indented_; + std::string text_; + // Line set holding this element. 
line_set<L> holder_; @@ -295,6 +301,8 @@ namespace scribo indented_ = other.indented(); + text_ = other.text(); + holder_ = other.holder(); } @@ -578,6 +586,21 @@ namespace scribo return indented_; } + template <typename L> + const std::string& + line_info<L>::text() const + { + return text_; + } + + + template <typename L> + void + line_info<L>::update_text(const std::string& str) + { + text_ = str; + } + template <typename L> bool @@ -833,7 +856,9 @@ namespace scribo else char_width_ = char_width.to_result(); - + // FIXME: There is a bug here when the input document is too + // large. The baselines indexes are too high for the type used + // in the median accumulator! baseline_ = absolute_baseline.to_result(); meanline_ = absolute_meanline.to_result(); x_height_ = absolute_baseline - absolute_meanline + 1; @@ -890,6 +915,7 @@ namespace scribo << ", orientation=" << info.orientation() << ", reading_orientation=" << info.reading_orientation() << ", indented=" << info.indented() + << ", text=" << info.text() << ")" << std::endl; } diff --git a/scribo/text/recognition.hh b/scribo/text/recognition.hh index fcc83dd..f8d8f4f 100644 --- a/scribo/text/recognition.hh +++ b/scribo/text/recognition.hh @@ -81,9 +81,7 @@ namespace scribo // template <typename L> void - recognition(const line_set<L>& lines, - const char *language, - const char *output_file = 0); + recognition(line_set<L>& lines, const char *language); /// Recognize text from an image. 
@@ -91,7 +89,7 @@ namespace scribo void recognition(const Image<I>& line, const char *language, - const char *output_file = 0); + const std::string& output_file = 0); @@ -102,9 +100,7 @@ namespace scribo template <typename L> void - recognition(const line_set<L>& lines, - const char *language, - const char *output_file = 0) + recognition(line_set<L>& lines, const char *language) { trace::entering("scribo::text::recognition"); @@ -121,9 +117,6 @@ namespace scribo 0, 9, 0, 9, 0 }; w_window2d_int dmap_win = mln::make::w_window2d_int(vals); - std::ofstream file; - if (output_file != 0) - file.open(output_file); /// Use text bboxes with Tesseract for_all_lines(i, lines) @@ -141,7 +134,7 @@ namespace scribo box.enlarge(2); I text_ima(box); - data::fill(text_ima, true); + data::fill(text_ima, false); // Careful : background is set to 'False' const component_set<L>& comp_set = lines.components(); @@ -152,7 +145,7 @@ namespace scribo { unsigned comp_id = comps(e); data::fill(((text_ima | comp_set(comp_id).bbox()).rw() | (pw::value(lbl) == pw::cst(comp_id))).rw(), - false); + true); } /// Improve text quality. @@ -161,8 +154,6 @@ namespace scribo I text_ima_cleaned = text::clean(lines(i), text_ima); // mln::io::pbm::save(text_ima_cleaned, mln::debug::filename("line.pbm", debug_id++)); - // Setting objects to 'True' - logical::not_inplace(text_ima_cleaned); // Make sure there is no border. border::resize(text_ima_cleaned, 0); @@ -180,30 +171,16 @@ namespace scribo if (s != 0) { - std::cerr << s << std::endl; - if (output_file != 0) - { - std::string str(s); - str = str.substr(0, str.length() - 1); - file << lines(i).bbox().pmin().row() - << " " - << lines(i).bbox().pmin().col() - << " " - << lines(i).bbox().pmax().row() - << " " - << lines(i).bbox().pmax().col() - << " " - << str; - } + std::cerr << s << std::endl; + std::string str(s); + str = str.substr(0, str.length() - 2); + lines(i).update_text(str); } - // The string has been allocated by Tesseract. We must free it. 
+ // The string has been allocated by Tesseract. It must be released. free(s); } - if (output_file != 0) - file.close(); - trace::exiting("scribo::text::recognition"); } @@ -212,7 +189,7 @@ namespace scribo void recognition(const Image<I>& line_, const char *language, - const char *output_file = 0) + const std::string& output_file = 0) { trace::entering("scribo::text::recognition"); @@ -223,8 +200,8 @@ namespace scribo TessBaseAPI::InitWithLanguage(NULL, NULL, language, NULL, false, 0, NULL); std::ofstream file; - if (output_file != 0) - file.open(output_file); + if (!output_file.empty()) + file.open(output_file.c_str()); mln_domain(I) box = line.domain(); // Make sure characters are isolated from the borders. @@ -252,7 +229,7 @@ namespace scribo if (s != 0) { std::cout << s << std::endl; - if (output_file != 0) + if (!output_file.empty()) { std::string str(s); str = str.substr(0, str.length() - 1); @@ -271,7 +248,7 @@ namespace scribo // The string has been allocated by Tesseract. We must free it. free(s); - if (output_file != 0) + if (!output_file.empty()) file.close(); trace::exiting("scribo::text::recognition"); -- 1.5.6.5
Participants (1):
- Guillaume Lazzara