last-svn-commit-745-g2601ca6 Small fixes in Scribo.

* scribo/core/paragraph_set.hh, * scribo/core/document.hh: Add new methods. * scribo/core/macros.hh: Add a missing macro. * scribo/io/xml/save.hh, * scribo/primitive/extract/elements.hh: Make use of the methods in document class. * src/pbm_text_in_doc.cc: Add recognized language as an option. * tests/toolchain/nepomuk/text_extraction.cc: Make the test not case dependent. --- scribo/ChangeLog | 18 +++++++++ scribo/scribo/core/document.hh | 41 +++++++++++++++------ scribo/scribo/core/macros.hh | 3 ++ scribo/scribo/core/paragraph_set.hh | 20 ++++++++++ scribo/scribo/io/xml/save.hh | 20 +++++----- scribo/scribo/primitive/extract/elements.hh | 5 ++- scribo/src/pbm_text_in_doc.cc | 40 ++++++++++++-------- scribo/tests/toolchain/nepomuk/text_extraction.cc | 4 +- 8 files changed, 110 insertions(+), 41 deletions(-) diff --git a/scribo/ChangeLog b/scribo/ChangeLog index 3ec57f0..c947550 100644 --- a/scribo/ChangeLog +++ b/scribo/ChangeLog @@ -1,5 +1,23 @@ 2011-01-25 Guillaume Lazzara <z@lrde.epita.fr> + Small fixes in Scribo. + + * scribo/core/paragraph_set.hh, + * scribo/core/document.hh: Add new methods. + + * scribo/core/macros.hh: Add a missing macro. + + * scribo/io/xml/save.hh, + * scribo/primitive/extract/elements.hh: Make use of the methods in + document class. + + * src/pbm_text_in_doc.cc: Add recognized language as an option. + + * tests/toolchain/nepomuk/text_extraction.cc: Make the test not + case dependent. + +2011-01-25 Guillaume Lazzara <z@lrde.epita.fr> + Add an option to choose the recognized language. * scribo/toolchain/nepomuk/text_extraction.hh, diff --git a/scribo/scribo/core/document.hh b/scribo/scribo/core/document.hh index f4a78ff..b547da4 100644 --- a/scribo/scribo/core/document.hh +++ b/scribo/scribo/core/document.hh @@ -1,4 +1,5 @@ -// Copyright (C) 2010 EPITA Research and Development Laboratory (LRDE) +// Copyright (C) 2010, 2011 EPITA Research and Development Laboratory +// (LRDE) // // This file is part of Olena. 
// @@ -59,15 +60,24 @@ namespace scribo bool is_valid() const; - const line_set<L>& text() const; + /*! \brief Check whether this document contains text. + + If it returns true, that document contains paragraphs, lines and + text components. + + */ bool has_text() const; - void set_text(const line_set<L>& line); + + mln::def::coord height() const; + mln::def::coord width() const; + + const line_set<L>& lines() const; const paragraph_set<L>& paragraphs() const; void set_paragraphs(const paragraph_set<L>& parset); - const component_set<L>& elements() const; bool has_elements() const; + const component_set<L>& elements() const; void set_elements(const component_set<L>& elements); const mln::image2d<value::rgb8>& image() const; @@ -150,10 +160,18 @@ namespace scribo template <typename L> - const line_set<L>& - document<L>::text() const + mln::def::coord + document<L>::width() const { - return lines_; + return image_.ncols(); + } + + + template <typename L> + mln::def::coord + document<L>::height() const + { + return image_.nrows(); } @@ -161,17 +179,18 @@ namespace scribo bool document<L>::has_text() const { - return lines_.is_valid(); + return parset_.is_valid(); } template <typename L> - void - document<L>::set_text(const line_set<L>& line) + const line_set<L>& + document<L>::lines() const { - lines_ = line; + return parset_.lines(); } + template <typename L> const paragraph_set<L>& document<L>::paragraphs() const diff --git a/scribo/scribo/core/macros.hh b/scribo/scribo/core/macros.hh index 1060358..887539f 100644 --- a/scribo/scribo/core/macros.hh +++ b/scribo/scribo/core/macros.hh @@ -62,4 +62,7 @@ # define for_all_anchors(E, S) \ for_all_elements(E, S) +# define for_all_paragraph_lines(E, S) \ + for_all_elements(E, S) + #endif // ! 
SCRIBO_CORE_MACROS_HH diff --git a/scribo/scribo/core/paragraph_set.hh b/scribo/scribo/core/paragraph_set.hh index afb59c5..355eaa9 100644 --- a/scribo/scribo/core/paragraph_set.hh +++ b/scribo/scribo/core/paragraph_set.hh @@ -53,9 +53,13 @@ namespace scribo paragraph_info<L>& operator()(unsigned i); const paragraph_info<L>& operator()(unsigned i) const; + bool is_valid() const; + + const line_set<L>& lines() const; private: mln::util::array<paragraph_info<L> > pars_; + line_set<L> lines_; }; @@ -82,6 +86,7 @@ namespace scribo paragraph_set<L>::paragraph_set(const line_links<L>& llinks, unsigned npars) : pars_(npars + 1, paragraph_info<L>(llinks)) { + lines_ = llinks.lines(); } template <typename L> @@ -106,6 +111,21 @@ namespace scribo } + template <typename L> + bool + paragraph_set<L>::is_valid() const + { + return !pars_.is_empty(); + } + + + template <typename L> + const line_set<L>& + paragraph_set<L>::lines() const + { + return lines_; + } + namespace make { diff --git a/scribo/scribo/io/xml/save.hh b/scribo/scribo/io/xml/save.hh index 1bcdd6f..41d4fef 100644 --- a/scribo/scribo/io/xml/save.hh +++ b/scribo/scribo/io/xml/save.hh @@ -130,9 +130,6 @@ namespace scribo abort(); } - const line_set<L>& lines = doc.text(); - const paragraph_set<L>& parset = doc.paragraphs(); - std::map<char, std::string> html_map; html_map['\"'] = "&quot;"; html_map['<'] = "&lt;"; @@ -150,13 +147,16 @@ namespace scribo file << " </pcMetadata>" << std::endl; file << " <page image_filename=\"" << doc.filename() - << "\" image_width=\"" << lines.components().labeled_image().ncols() - << "\" image_height=\"" << lines.components().labeled_image().nrows() + << "\" image_width=\"" << doc.width() + << "\" image_height=\"" << doc.height() << "\">" << std::endl; // Text if (doc.has_text()) { + const line_set<L>& lines = doc.lines(); + const paragraph_set<L>& parset = doc.paragraphs(); + for_all_paragraphs(p, parset) { const mln::util::array<line_id_t>& line_ids = parset(p).line_ids(); @@ -224,9 
+224,6 @@ namespace scribo abort(); } - const line_set<L>& lines = doc.text(); - const paragraph_set<L>& parset = doc.paragraphs(); - std::map<char, std::string> html_map; html_map['\"'] = "&quot;"; html_map['<'] = "&lt;"; @@ -244,13 +241,16 @@ namespace scribo file << " </pcMetadata>" << std::endl; file << " <page image_filename=\"" << doc.filename() - << "\" image_width=\"" << lines.components().labeled_image().ncols() - << "\" image_height=\"" << lines.components().labeled_image().nrows() + << "\" image_width=\"" << doc.width() + << "\" image_height=\"" << doc.height() << "\">" << std::endl; // Text if (doc.has_text()) { + const line_set<L>& lines = doc.lines(); + const paragraph_set<L>& parset = doc.paragraphs(); + for_all_paragraphs(p, parset) { const mln::util::array<line_id_t>& line_ids = parset(p).line_ids(); diff --git a/scribo/scribo/primitive/extract/elements.hh b/scribo/scribo/primitive/extract/elements.hh index 2e6a0cb..ddf2c92 100644 --- a/scribo/scribo/primitive/extract/elements.hh +++ b/scribo/scribo/primitive/extract/elements.hh @@ -1,4 +1,5 @@ -// Copyright (C) 2010 EPITA Research and Development Laboratory (LRDE) +// Copyright (C) 2010, 2011 EPITA Research and Development Laboratory +// (LRDE) // // This file is part of Olena. // @@ -114,7 +115,7 @@ namespace scribo mln_precondition(doc.is_valid()); mln_precondition(input.is_valid()); - const line_set<L>& lines = doc.text(); + const line_set<L>& lines = doc.lines(); // Element extraction diff --git a/scribo/src/pbm_text_in_doc.cc b/scribo/src/pbm_text_in_doc.cc index 2240f42..721ff47 100644 --- a/scribo/src/pbm_text_in_doc.cc +++ b/scribo/src/pbm_text_in_doc.cc @@ -1,5 +1,5 @@ -// Copyright (C) 2009, 2010 EPITA Research and Development Laboratory -// (LRDE) +// Copyright (C) 2009, 2010, 2011 EPITA Research and Development +// Laboratory (LRDE) // // This file is part of Olena. 
// @@ -49,7 +49,6 @@ #include <scribo/preprocessing/crop_without_localization.hh> -#include <scribo/io/xml/save.hh> #include <scribo/io/text_boxes/save.hh> @@ -65,6 +64,7 @@ for the background." }, { "pmin_col", "Col index of the top left corner of the Region of interest." }, { "pmax_row", "Row index of the bottom right corner of the Region of interest." }, { "pmax_col", "Col index of the bottom right corner of the Region of interest." }, + { "language", "Language to be used for the text recognition. [eng|fra] (Default: eng)" }, { "find_lines", "Find vertical lines. (Default 1)" }, { "find_whitespaces", "Find whitespaces separators. (Default 1)" }, { "debug_dir", "Output directory for debug image" }, @@ -77,16 +77,16 @@ int main(int argc, char* argv[]) using namespace scribo; using namespace mln; - if (argc != 3 && argc != 4 && argc != 5 && argc != 8 && argc != 9) + if (argc < 3 || argc > 12) return scribo::debug::usage(argv, "Find text lines using left/right validation and display x-height in a binarized article.", - "input.pbm out.txt <denoise_enabled> [<pmin_row> <pmin_col> <pmax_row> <pmax_col>] <find_lines> <find_whitespaces> <debug_dir>", + "input.pbm out.txt <denoise_enabled> [<pmin_row> <pmin_col> <pmax_row> <pmax_col>] <language> <find_lines> <find_whitespaces> <debug_dir>", args_desc); bool debug = false; // Enable debug output. 
- if (argc == 7 || argc == 11) + if (argc == 8 || argc == 12) { scribo::make::internal::debug_filename_prefix = argv[argc - 1]; debug = true; @@ -101,7 +101,7 @@ int main(int argc, char* argv[]) // Optional Cropping point2d crop_shift = literal::origin; - if (argc >= 11) + if (argc >= 12) { mln::def::coord minr = atoi(argv[4]), @@ -120,13 +120,24 @@ int main(int argc, char* argv[]) bool denoise = (argc > 3 && atoi(argv[3]) != 0); + std::string language = "eng"; + if (argc > 4 && argc < 12) + language = argv[4]; + else if (argc == 12) + language = argv[8]; + bool find_line_seps = true; - if (argc >= 4 && argc < 11) - find_line_seps = (atoi(argv[3]) != 0); + if (argc > 5 && argc < 12) + find_line_seps = (atoi(argv[5]) != 0); + else if (argc == 12) + find_line_seps = (atoi(argv[9]) != 0); bool find_whitespace_seps = true; - if (argc >= 5 && argc < 11) - find_line_seps = (atoi(argv[4]) != 0); + if (argc > 6 && argc < 12) + find_whitespace_seps = (atoi(argv[6]) != 0); + else if (argc == 12) + find_whitespace_seps = (atoi(argv[10]) != 0); + std::cout << "Running with the following options :" << "find_lines_seps = " << find_line_seps @@ -136,15 +147,12 @@ int main(int argc, char* argv[]) // Run document toolchain. line_set<L> - lines = scribo::toolchain::text_in_doc(input, denoise, find_line_seps, + lines = scribo::toolchain::text_in_doc(input, denoise, + language, find_line_seps, find_whitespace_seps, debug); scribo::document<L> doc; doc.set_filename(argv[1]); - doc.set_text(lines); - - // Saving results - scribo::io::xml::save(doc, "out.xml", true); // Specify shift due to potential previous crop. 
scribo::io::text_boxes::save(lines, argv[2], crop_shift); diff --git a/scribo/tests/toolchain/nepomuk/text_extraction.cc b/scribo/tests/toolchain/nepomuk/text_extraction.cc index 7191650..eeafd6b 100644 --- a/scribo/tests/toolchain/nepomuk/text_extraction.cc +++ b/scribo/tests/toolchain/nepomuk/text_extraction.cc @@ -43,9 +43,9 @@ int main() QImage ima(SCRIBO_IMG_DIR "/wildly.pbm"); QSet<QString> words = scribo::toolchain::nepomuk::text_extraction(ima); - words = words.toLower(); mln_assertion(words.size() == 1); - mln_assertion(words.contains("wildly")); + QString word = words.toList().at(0).toLower(); + mln_assertion(word == "wildly"); return 0; } -- 1.5.6.5
participants (1)
-
Guillaume Lazzara