last-svn-commit-745-g2601ca6 Small fixes in Scribo.

26 Jan 2011

* scribo/core/paragraph_set.hh,
	* scribo/core/document.hh: Add new methods.

	* scribo/core/macros.hh: Add a missing macro.

	* scribo/io/xml/save.hh,
	* scribo/primitive/extract/elements.hh: Make use of the methods in
	document class.

	* src/pbm_text_in_doc.cc: Add recognized language as an option.

	* tests/toolchain/nepomuk/text_extraction.cc: Make the test not
	case dependent.
---
 scribo/ChangeLog                                  |   18 +++++++++
 scribo/scribo/core/document.hh                    |   41 +++++++++++++++------
 scribo/scribo/core/macros.hh                      |    3 ++
 scribo/scribo/core/paragraph_set.hh               |   20 ++++++++++
 scribo/scribo/io/xml/save.hh                      |   20 +++++-----
 scribo/scribo/primitive/extract/elements.hh       |    5 ++-
 scribo/src/pbm_text_in_doc.cc                     |   40 ++++++++++++--------
 scribo/tests/toolchain/nepomuk/text_extraction.cc |    4 +-
 8 files changed, 110 insertions(+), 41 deletions(-)

diff --git a/scribo/ChangeLog b/scribo/ChangeLog
index 3ec57f0..c947550 100644
--- a/scribo/ChangeLog
+++ b/scribo/ChangeLog
@@ -1,5 +1,23 @@
 2011-01-25  Guillaume Lazzara  <z@lrde.epita.fr>
 
+	Small fixes in Scribo.
+
+	* scribo/core/paragraph_set.hh,
+	* scribo/core/document.hh: Add new methods.
+
+	* scribo/core/macros.hh: Add a missing macro.
+
+	* scribo/io/xml/save.hh,
+	* scribo/primitive/extract/elements.hh: Make use of the methods in
+	document class.
+
+	* src/pbm_text_in_doc.cc: Add recognized language as an option.
+
+	* tests/toolchain/nepomuk/text_extraction.cc: Make the test not
+	case dependent.
+
+2011-01-25  Guillaume Lazzara  <z@lrde.epita.fr>
+
 	Add an option to choose the recognized language.
 
 	* scribo/toolchain/nepomuk/text_extraction.hh,
diff --git a/scribo/scribo/core/document.hh b/scribo/scribo/core/document.hh
index f4a78ff..b547da4 100644
--- a/scribo/scribo/core/document.hh
+++ b/scribo/scribo/core/document.hh
@@ -1,4 +1,5 @@
-// Copyright (C) 2010 EPITA Research and Development Laboratory (LRDE)
+// Copyright (C) 2010, 2011 EPITA Research and Development Laboratory
+// (LRDE)
 //
 // This file is part of Olena.
 //
@@ -59,15 +60,24 @@ namespace scribo
 
     bool is_valid() const;
 
-    const line_set<L>& text() const;
+    /*! \brief Check whether this document contains text.
+
+      If it returns true, that document contains paragraphs, lines and
+      text components.
+
+     */
     bool has_text() const;
-    void set_text(const line_set<L>& line);
+
+    mln::def::coord height() const;
+    mln::def::coord width() const;
+
+    const line_set<L>& lines() const;
 
     const paragraph_set<L>& paragraphs() const;
     void set_paragraphs(const paragraph_set<L>& parset);
 
-    const component_set<L>& elements() const;
     bool has_elements() const;
+    const component_set<L>& elements() const;
     void set_elements(const component_set<L>& elements);
 
     const mln::image2d<value::rgb8>& image() const;
@@ -150,10 +160,18 @@ namespace scribo
 
 
   template <typename L>
-  const line_set<L>&
-  document<L>::text() const
+  mln::def::coord
+  document<L>::width() const
   {
-    return lines_;
+    return image_.ncols();
+  }
+
+
+  template <typename L>
+  mln::def::coord
+  document<L>::height() const
+  {
+    return image_.nrows();
   }
 
 
@@ -161,17 +179,18 @@ namespace scribo
   bool
   document<L>::has_text() const
   {
-    return lines_.is_valid();
+    return parset_.is_valid();
   }
 
 
   template <typename L>
-  void
-  document<L>::set_text(const line_set<L>& line)
+  const line_set<L>&
+  document<L>::lines() const
   {
-    lines_ = line;
+    return parset_.lines();
   }
 
+
   template <typename L>
   const paragraph_set<L>&
   document<L>::paragraphs() const
diff --git a/scribo/scribo/core/macros.hh b/scribo/scribo/core/macros.hh
index 1060358..887539f 100644
--- a/scribo/scribo/core/macros.hh
+++ b/scribo/scribo/core/macros.hh
@@ -62,4 +62,7 @@
 # define for_all_anchors(E, S) \
   for_all_elements(E, S)
 
+# define for_all_paragraph_lines(E, S) \
+  for_all_elements(E, S)
+
 #endif // ! SCRIBO_CORE_MACROS_HH
diff --git a/scribo/scribo/core/paragraph_set.hh b/scribo/scribo/core/paragraph_set.hh
index afb59c5..355eaa9 100644
--- a/scribo/scribo/core/paragraph_set.hh
+++ b/scribo/scribo/core/paragraph_set.hh
@@ -53,9 +53,13 @@ namespace scribo
     paragraph_info<L>& operator()(unsigned i);
     const paragraph_info<L>& operator()(unsigned i) const;
 
+    bool is_valid() const;
+
+    const line_set<L>& lines() const;
 
   private:
     mln::util::array<paragraph_info<L> > pars_;
+    line_set<L> lines_;
   };
 
 
@@ -82,6 +86,7 @@ namespace scribo
   paragraph_set<L>::paragraph_set(const line_links<L>& llinks, unsigned npars)
     : pars_(npars + 1, paragraph_info<L>(llinks))
   {
+    lines_ = llinks.lines();
   }
 
   template <typename L>
@@ -106,6 +111,21 @@ namespace scribo
   }
 
 
+  template <typename L>
+  bool
+  paragraph_set<L>::is_valid() const
+  {
+    return !pars_.is_empty();
+  }
+
+
+  template <typename L>
+  const line_set<L>&
+  paragraph_set<L>::lines() const
+  {
+    return lines_;
+  }
+
 
   namespace make
   {
diff --git a/scribo/scribo/io/xml/save.hh b/scribo/scribo/io/xml/save.hh
index 1bcdd6f..41d4fef 100644
--- a/scribo/scribo/io/xml/save.hh
+++ b/scribo/scribo/io/xml/save.hh
@@ -130,9 +130,6 @@ namespace scribo
 	    abort();
 	  }
 
-	  const line_set<L>& lines = doc.text();
-	  const paragraph_set<L>& parset = doc.paragraphs();
-
 	  std::map<char, std::string> html_map;
 	  html_map['\"'] = "&quot;";
 	  html_map['<'] = "&lt;";
@@ -150,13 +147,16 @@ namespace scribo
 	  file << "  </pcMetadata>" << std::endl;
 
 	  file << "  <page image_filename=\"" << doc.filename()
-	       << "\" image_width=\"" << lines.components().labeled_image().ncols()
-	       << "\" image_height=\"" << lines.components().labeled_image().nrows()
+	       << "\" image_width=\"" << doc.width()
+	       << "\" image_height=\"" << doc.height()
 	       << "\">" << std::endl;
 
 	  // Text
 	  if (doc.has_text())
 	  {
+	    const line_set<L>& lines = doc.lines();
+	    const paragraph_set<L>& parset = doc.paragraphs();
+
 	    for_all_paragraphs(p, parset)
 	    {
 	      const mln::util::array<line_id_t>& line_ids = parset(p).line_ids();
@@ -224,9 +224,6 @@ namespace scribo
 	    abort();
 	  }
 
-	  const line_set<L>& lines = doc.text();
-	  const paragraph_set<L>& parset = doc.paragraphs();
-
 	  std::map<char, std::string> html_map;
 	  html_map['\"'] = "&quot;";
 	  html_map['<'] = "&lt;";
@@ -244,13 +241,16 @@ namespace scribo
 	  file << "  </pcMetadata>" << std::endl;
 
 	  file << "  <page image_filename=\"" << doc.filename()
-	       << "\" image_width=\"" << lines.components().labeled_image().ncols()
-	       << "\" image_height=\"" << lines.components().labeled_image().nrows()
+	       << "\" image_width=\"" << doc.width()
+	       << "\" image_height=\"" << doc.height()
 	       << "\">" << std::endl;
 
 	  // Text
 	  if (doc.has_text())
 	  {
+	    const line_set<L>& lines = doc.lines();
+	    const paragraph_set<L>& parset = doc.paragraphs();
+
 	    for_all_paragraphs(p, parset)
 	    {
 	      const mln::util::array<line_id_t>& line_ids = parset(p).line_ids();
diff --git a/scribo/scribo/primitive/extract/elements.hh b/scribo/scribo/primitive/extract/elements.hh
index 2e6a0cb..ddf2c92 100644
--- a/scribo/scribo/primitive/extract/elements.hh
+++ b/scribo/scribo/primitive/extract/elements.hh
@@ -1,4 +1,5 @@
-// Copyright (C) 2010 EPITA Research and Development Laboratory (LRDE)
+// Copyright (C) 2010, 2011 EPITA Research and Development Laboratory
+// (LRDE)
 //
 // This file is part of Olena.
 //
@@ -114,7 +115,7 @@ namespace scribo
 	mln_precondition(doc.is_valid());
 	mln_precondition(input.is_valid());
 
-	const line_set<L>& lines = doc.text();
+	const line_set<L>& lines = doc.lines();
 
 	// Element extraction
 
diff --git a/scribo/src/pbm_text_in_doc.cc b/scribo/src/pbm_text_in_doc.cc
index 2240f42..721ff47 100644
--- a/scribo/src/pbm_text_in_doc.cc
+++ b/scribo/src/pbm_text_in_doc.cc
@@ -1,5 +1,5 @@
-// Copyright (C) 2009, 2010 EPITA Research and Development Laboratory
-// (LRDE)
+// Copyright (C) 2009, 2010, 2011 EPITA Research and Development
+// Laboratory (LRDE)
 //
 // This file is part of Olena.
 //
@@ -49,7 +49,6 @@
 
 #include <scribo/preprocessing/crop_without_localization.hh>
 
-#include <scribo/io/xml/save.hh>
 #include <scribo/io/text_boxes/save.hh>
 
 
@@ -65,6 +64,7 @@ for the background." },
   { "pmin_col", "Col index of the top left corner of the Region of interest." },
   { "pmax_row", "Row index of the bottom right corner of the Region of interest." },
   { "pmax_col", "Col index of the bottom right corner of the Region of interest." },
+  { "language", "Language to be used for the text recognition. [eng|fra] (Default: eng)" },
   { "find_lines", "Find vertical lines. (Default 1)" },
   { "find_whitespaces", "Find whitespaces separators. (Default 1)" },
   { "debug_dir", "Output directory for debug image" },
@@ -77,16 +77,16 @@ int main(int argc, char* argv[])
   using namespace scribo;
   using namespace mln;
 
-  if (argc != 3 && argc != 4 && argc != 5 && argc != 8 && argc != 9)
+  if (argc < 3 || argc > 12)
     return scribo::debug::usage(argv,
 				"Find text lines using left/right validation and display x-height in a binarized article.",
-				"input.pbm out.txt <denoise_enabled> [<pmin_row> <pmin_col> <pmax_row> <pmax_col>] <find_lines> <find_whitespaces> <debug_dir>",
+				"input.pbm out.txt <denoise_enabled> [<pmin_row> <pmin_col> <pmax_row> <pmax_col>] <language> <find_lines> <find_whitespaces> <debug_dir>",
 				args_desc);
 
   bool debug = false;
 
   // Enable debug output.
-  if (argc == 7 || argc == 11)
+  if (argc == 8 || argc == 12)
   {
     scribo::make::internal::debug_filename_prefix = argv[argc - 1];
     debug = true;
@@ -101,7 +101,7 @@ int main(int argc, char* argv[])
 
   // Optional Cropping
   point2d crop_shift = literal::origin;
-  if (argc >= 11)
+  if (argc >= 12)
   {
     mln::def::coord
       minr = atoi(argv[4]),
@@ -120,13 +120,24 @@ int main(int argc, char* argv[])
 
   bool denoise = (argc > 3 && atoi(argv[3]) != 0);
 
+  std::string language = "eng";
+  if (argc > 4 && argc < 12)
+    language = argv[4];
+  else if (argc == 12)
+    language = argv[8];
+
   bool find_line_seps = true;
-  if (argc >= 4 && argc < 11)
-    find_line_seps = (atoi(argv[3]) != 0);
+  if (argc > 5 && argc < 12)
+    find_line_seps = (atoi(argv[5]) != 0);
+  else if (argc == 12)
+    find_line_seps = (atoi(argv[9]) != 0);
 
   bool find_whitespace_seps = true;
-  if (argc >= 5 && argc < 11)
-    find_line_seps = (atoi(argv[4]) != 0);
+  if (argc > 6 && argc < 12)
+    find_whitespace_seps = (atoi(argv[6]) != 0);
+  else if (argc == 12)
+    find_whitespace_seps = (atoi(argv[10]) != 0);
+
 
   std::cout << "Running with the following options :"
 	    << "find_lines_seps = " << find_line_seps
@@ -136,15 +147,12 @@ int main(int argc, char* argv[])
 
   // Run document toolchain.
   line_set<L>
-    lines = scribo::toolchain::text_in_doc(input, denoise, find_line_seps,
+    lines = scribo::toolchain::text_in_doc(input, denoise,
+					   language, find_line_seps,
 					   find_whitespace_seps, debug);
 
   scribo::document<L> doc;
   doc.set_filename(argv[1]);
-  doc.set_text(lines);
-
-  // Saving results
-  scribo::io::xml::save(doc, "out.xml", true);
 
   // Specify shift due to potential previous crop.
   scribo::io::text_boxes::save(lines, argv[2], crop_shift);
diff --git a/scribo/tests/toolchain/nepomuk/text_extraction.cc b/scribo/tests/toolchain/nepomuk/text_extraction.cc
index 7191650..eeafd6b 100644
--- a/scribo/tests/toolchain/nepomuk/text_extraction.cc
+++ b/scribo/tests/toolchain/nepomuk/text_extraction.cc
@@ -43,9 +43,9 @@ int main()
   QImage ima(SCRIBO_IMG_DIR "/wildly.pbm");
   QSet<QString> words = scribo::toolchain::nepomuk::text_extraction(ima);
 
-  words = words.toLower();
   mln_assertion(words.size() == 1);
-  mln_assertion(words.contains("wildly"));
+  QString word = words.toList().at(0).toLower();
+  mln_assertion(word == "wildly");
 
   return 0;
 }
-- 
1.5.6.5

    

Guillaume Lazzara

tags

participants (1)