* scribo/toolchain/content_in_doc.hh: New.
* scribo/toolchain/internal/content_in_doc_functor.hh: Use
paragraph related routines.
* src/content_in_doc.cc: Update use of content_in_doc_functor.
---
scribo/ChangeLog | 11 ++
.../{text_in_doc.hh => content_in_doc.hh} | 52 +++++-----
.../toolchain/internal/content_in_doc_functor.hh | 106 +++++++++++++++++++-
scribo/src/content_in_doc.cc | 48 +++-------
4 files changed, 155 insertions(+), 62 deletions(-)
copy scribo/scribo/toolchain/{text_in_doc.hh => content_in_doc.hh} (57%)
diff --git a/scribo/ChangeLog b/scribo/ChangeLog
index c947550..809ce7d 100644
--- a/scribo/ChangeLog
+++ b/scribo/ChangeLog
@@ -1,3 +1,14 @@
+2011-01-26 Guillaume Lazzara <z(a)lrde.epita.fr>
+
+ Add paragraph processing in content_in_doc toolchain.
+
+ * scribo/toolchain/content_in_doc.hh: New.
+
+ * scribo/toolchain/internal/content_in_doc_functor.hh: Use
+ paragraph related routines.
+
+ * src/content_in_doc.cc: Update use of content_in_doc_functor.
+
2011-01-25 Guillaume Lazzara <z(a)lrde.epita.fr>
Small fixes in Scribo.
diff --git a/scribo/scribo/toolchain/text_in_doc.hh b/scribo/scribo/toolchain/content_in_doc.hh
similarity index 57%
copy from scribo/scribo/toolchain/text_in_doc.hh
copy to scribo/scribo/toolchain/content_in_doc.hh
index e6ba69e..f2938d9 100644
--- a/scribo/scribo/toolchain/text_in_doc.hh
+++ b/scribo/scribo/toolchain/content_in_doc.hh
@@ -1,5 +1,4 @@
-// Copyright (C) 2009, 2010 EPITA Research and Development Laboratory
-// (LRDE)
+// Copyright (C) 2011 EPITA Research and Development Laboratory (LRDE)
//
// This file is part of Olena.
//
@@ -24,14 +23,14 @@
// exception does not however invalidate any other reasons why the
// executable file might be covered by the GNU General Public License.
-#ifndef SCRIBO_TOOLCHAIN_TEXT_IN_DOC_HH
-# define SCRIBO_TOOLCHAIN_TEXT_IN_DOC_HH
+#ifndef SCRIBO_TOOLCHAIN_CONTENT_IN_DOC_HH
+# define SCRIBO_TOOLCHAIN_CONTENT_IN_DOC_HH
/// \file
///
-/// Extract text from a document.
+/// Analyse a document.
-# include <scribo/toolchain/internal/text_in_doc_functor.hh>
+# include <scribo/toolchain/internal/content_in_doc_functor.hh>
namespace scribo
{
@@ -42,36 +41,41 @@ namespace scribo
using namespace mln;
- template <typename I>
- line_set<mln_ch_value(I, def::lbl_type)>
- text_in_doc(const Image<I>& input, bool denoise,
- const std::string& language = std::string("eng"),
- bool find_line_seps = true,
- bool find_whitespace_seps = true,
- bool debug = false);
+ template <typename I, typename J>
+ document<mln_ch_value(I, def::lbl_type)>
+ content_in_doc(const Image<I>& input, const Image<J>& input_preproc,
+ bool denoise,
+ const std::string& language = std::string("eng"),
+ bool find_line_seps = true,
+ bool find_whitespace_seps = true,
+ bool debug = false);
# ifndef MLN_INCLUDE_ONLY
- template <typename I>
- line_set<mln_ch_value(I, def::lbl_type)>
- text_in_doc(const Image<I>& input, bool denoise,
- const std::string& language = std::string("eng"),
- bool find_line_seps = true,
- bool find_whitespace_seps = true,
- bool debug = false)
+ template <typename I, typename J>
+ document<mln_ch_value(I, def::lbl_type)>
+ content_in_doc(const Image<I>& input, const Image<J>& input_preproc,
+ bool denoise,
+ const std::string& language = std::string("eng"),
+ bool find_line_seps = true,
+ bool find_whitespace_seps = true,
+ bool debug = false)
{
- internal::text_in_doc_functor<I> f;
+ mln_precondition(input.is_valid());
+ mln_precondition(input_preproc.is_valid());
+
+ internal::content_in_doc_functor<J> f("noname");
f.enable_denoising = denoise;
f.enable_line_seps = find_line_seps;
f.enable_whitespace_seps = find_whitespace_seps;
f.enable_debug = debug;
f.ocr_language = language;
- line_set<mln_ch_value(I, def::lbl_type)> lines = f(input);
+ document<mln_ch_value(I, def::lbl_type)> doc = f(input, input_preproc);
- return lines;
+ return doc;
}
@@ -83,5 +87,5 @@ namespace scribo
} // end of namespace scribo
-#endif // SCRIBO_TOOLCHAIN_TEXT_IN_DOC_HH
+#endif // SCRIBO_TOOLCHAIN_CONTENT_IN_DOC_HH
diff --git a/scribo/scribo/toolchain/internal/content_in_doc_functor.hh b/scribo/scribo/toolchain/internal/content_in_doc_functor.hh
index 12e5137..7c665e5 100644
--- a/scribo/scribo/toolchain/internal/content_in_doc_functor.hh
+++ b/scribo/scribo/toolchain/internal/content_in_doc_functor.hh
@@ -30,14 +30,20 @@
# include <scribo/core/def/lbl_type.hh>
# include <scribo/core/document.hh>
+# include <scribo/core/line_set.hh>
+# include <scribo/core/paragraph_set.hh>
# include <scribo/primitive/extract/elements.hh>
# include <scribo/primitive/extract/components.hh>
# include <scribo/primitive/extract/vertical_separators.hh>
# include <scribo/primitive/extract/separators_nonvisible.hh>
+# include <scribo/primitive/extract/elements.hh>
+
+# include <scribo/primitive/identify.hh>
# include <scribo/primitive/remove/separators.hh>
+# include <scribo/filter/line_links_x_height.hh>
# include <scribo/filter/object_links_bbox_h_ratio.hh>
# include <scribo/filter/objects_small.hh>
@@ -52,6 +58,7 @@
# include <scribo/text/recognition.hh>
# include <scribo/text/merging.hh>
+# include <scribo/text/link_lines.hh>
# include <scribo/make/debug_filename.hh>
@@ -336,6 +343,7 @@ namespace scribo
lines = scribo::text::merging(lines);
+ //===== DEBUG =====
if (enable_debug)
{
@@ -353,24 +361,116 @@ namespace scribo
}
+ //===== END OF DEBUG =====
on_progress();
- on_new_progress_label("Recognizing text");
+ // Text recognition
+ on_new_progress_label("Recognizing text");
scribo::text::recognition(lines, ocr_language.c_str());
- doc.set_text(lines);
on_progress();
+
+ // Link text lines
+ on_new_progress_label("Linking text lines");
+ line_links<L> llinks = scribo::text::link_lines(lines);
+
+
+ //===== DEBUG =====
+ if (enable_debug)
+ {
+ image2d<value::rgb8> debug = data::convert(value::rgb8(), original_image);
+ for_all_lines(l, lines)
+ {
+ if (! lines(l).is_valid() || lines(l).is_hidden() || lines(l).type() != line::Text)
+ continue;
+
+ mln::draw::box(debug, lines(l).bbox(), literal::blue);
+ mln::draw::line(debug, lines(l).bbox().pcenter(), lines(llinks(l)).bbox().pcenter(), literal::green);
+ }
+
+ mln::io::ppm::save(debug, scribo::make::debug_filename("links_raw.ppm"));
+ }
+ //===== END OF DEBUG =====
+
+ on_progress();
+
+
+ // Filter line links.
+ on_new_progress_label("Filter line links");
+ llinks = scribo::filter::line_links_x_height(llinks);
+
+ //===== DEBUG =====
+ if (enable_debug)
+ {
+ image2d<value::rgb8> debug = data::convert(value::rgb8(), original_image);
+ for_all_links(i, llinks)
+ if (llinks(i) && llinks(i) != i)
+ mln::draw::line(debug, lines(i).bbox().pcenter(),
+ lines(llinks(i)).bbox().pcenter(), literal::red);
+
+ mln::io::ppm::save(debug, scribo::make::debug_filename("links.ppm"));
+
+
+ for (unsigned i = 1; i < llinks.nelements(); ++i)
+ llinks(i) = scribo::make::internal::find_root(llinks, i);
+
+ debug = data::convert(value::rgb8(), original_image);
+ mln::util::array<accu::shape::bbox<point2d> > nbbox(llinks.nelements());
+ for_all_lines(i, lines)
+ {
+ if (! lines(i).is_valid() || lines(i).is_hidden() || lines(i).type() != line::Text)
+ continue;
+
+ mln::draw::box(debug, lines(i).bbox(), literal::red);
+ nbbox(llinks(i)).take(lines(i).bbox());
+ }
+
+ for (unsigned i = 1; i < nbbox.nelements(); ++i)
+ if (nbbox(i).is_valid())
+ {
+ box2d b = nbbox(i).to_result();
+ mln::draw::box(debug, b, literal::green);
+ b.enlarge(1);
+ mln::draw::box(debug, b, literal::green);
+ b.enlarge(1);
+ mln::draw::box(debug, b, literal::green);
+ }
+
+ mln::io::ppm::save(debug, scribo::make::debug_filename("par.ppm"));
+ }
+ //===== END OF DEBUG =====
+
+ on_progress();
+
+
+ // Construct paragraphs
+ on_new_progress_label("Constructing paragraphs");
+ scribo::paragraph_set<L> parset = scribo::make::paragraph(llinks);
+ doc.set_paragraphs(parset);
+
+ on_progress();
+
+
// Extract other Elements
on_new_progress_label("Extracting Elements");
component_set<L>
elements = scribo::primitive::extract::elements(doc, original_image);
+
+ on_progress();
+
+
+ // Identify other Elements
+ on_new_progress_label("Identifying Elements");
+ elements = scribo::primitive::identify(elements);
doc.set_elements(elements);
on_progress();
+
+
// Saving results
if (save_doc_as_xml)
{
@@ -391,7 +491,7 @@ namespace scribo
int
content_in_doc_functor<I>::nsteps() const
{
- return 7 + enable_denoising + enable_line_seps
+ return 11 + enable_denoising + enable_line_seps
+ enable_whitespace_seps + save_doc_as_xml;
}
diff --git a/scribo/src/content_in_doc.cc b/scribo/src/content_in_doc.cc
index f453f08..8cd262b 100644
--- a/scribo/src/content_in_doc.cc
+++ b/scribo/src/content_in_doc.cc
@@ -30,38 +30,23 @@
#include <iostream>
#include <mln/core/image/image2d.hh>
-#include <mln/core/alias/neighb2d.hh>
#include <mln/io/pbm/save.hh>
#include <mln/io/magick/load.hh>
-#include <mln/value/label_8.hh>
-
-#include <mln/core/var.hh>
-
-#include <mln/accu/count_value.hh>
-
-#include <mln/draw/box_plain.hh>
-
-
-#include <scribo/toolchain/text_in_doc.hh>
+#include <scribo/toolchain/content_in_doc.hh>
#include <scribo/toolchain/text_in_doc_preprocess.hh>
#include <scribo/core/document.hh>
-#include <scribo/core/line_set.hh>
#include <scribo/debug/usage.hh>
#include <scribo/make/debug_filename.hh>
-#include <scribo/primitive/extract/elements.hh>
-
#include <scribo/preprocessing/crop_without_localization.hh>
#include <scribo/preprocessing/crop.hh>
#include <scribo/io/xml/save.hh>
-#include <scribo/io/text_boxes/save.hh>
-
const char *args_desc[][2] =
@@ -108,11 +93,11 @@ int main(int argc, char* argv[])
Magick::InitializeMagick(*argv);
typedef image2d<scribo::def::lbl_type> L;
- scribo::document<L> doc(argv[1]);
- doc.open();
+ image2d<value::rgb8> input;
+ mln::io::magick::load(input, argv[1]);
// Preprocess document
- image2d<bool> input;
+ image2d<bool> input_preproc;
{
double K = 0.34;
if (argc == 8 || argc == 12 || argc >= 12)
@@ -125,7 +110,7 @@ int main(int argc, char* argv[])
}
image2d<bool> tmp_fg;
- input = toolchain::text_in_doc_preprocess(doc.image(), false, K);
+ input_preproc = toolchain::text_in_doc_preprocess(input, false, K);
}
// Optional Cropping
@@ -142,12 +127,12 @@ int main(int argc, char* argv[])
<< " to (" << maxr << "," << maxc
<< ")" << std::endl;
box2d roi = mln::make::box2d(minr, minc, maxr, maxc);
- input = preprocessing::crop_without_localization(input, roi);
+ input_preproc = preprocessing::crop_without_localization(input_preproc, roi);
crop_shift = point2d(minr, minc);
if (debug)
- mln::io::pbm::save(input,
- scribo::make::debug_filename("input_cropped.pbm"));
+ mln::io::pbm::save(input_preproc,
+ scribo::make::debug_filename("input_preproc_cropped.pbm"));
}
bool denoise = (argc > 3 && atoi(argv[3]) != 0);
@@ -174,18 +159,11 @@ int main(int argc, char* argv[])
// Run document toolchain.
// Text
- std::cout << "Extracting text" << std::endl;
- line_set<L>
- lines = scribo::toolchain::text_in_doc(input, denoise, language,
- find_line_seps, find_whitespace_seps,
- debug);
- doc.set_text(lines);
-
- // Elements
- std::cout << "Extracting Elements" << std::endl;
- component_set<L> elements = scribo::primitive::extract::elements(doc, input);
- doc.set_elements(elements);
-
+ std::cout << "Analysing document..." << std::endl;
+ document<L>
+ doc = scribo::toolchain::content_in_doc(input, input_preproc, denoise, language,
+ find_line_seps, find_whitespace_seps,
+ debug);
// Saving results
scribo::io::xml::save(doc, argv[2], true);
--
1.5.6.5