* scribo/toolchain/content_in_hdoc.hh,
* scribo/toolchain/internal/content_in_hdoc_functor.hh,
* src/content_in_hdoc.cc: New.
---
scribo/ChangeLog | 8 ++
.../{content_in_doc.hh => content_in_hdoc.hh} | 14 ++--
...n_doc_functor.hh => content_in_hdoc_functor.hh} | 69 +++++++++++++------
.../src/{content_in_doc.cc => content_in_hdoc.cc} | 47 ++++++++++++--
4 files changed, 103 insertions(+), 35 deletions(-)
copy scribo/scribo/toolchain/{content_in_doc.hh => content_in_hdoc.hh} (85%)
copy scribo/scribo/toolchain/internal/{content_in_doc_functor.hh => content_in_hdoc_functor.hh} (87%)
copy scribo/src/{content_in_doc.cc => content_in_hdoc.cc} (80%)
diff --git a/scribo/ChangeLog b/scribo/ChangeLog
index 0b6042c..d3d7f90 100644
--- a/scribo/ChangeLog
+++ b/scribo/ChangeLog
@@ -1,3 +1,11 @@
+2011-05-16 Guillaume Lazzara <lazzara(a)fidji.lrde.epita.fr>
+
+ New specific toolchain for historical documents.
+
+ * scribo/toolchain/content_in_hdoc.hh,
+ * scribo/toolchain/internal/content_in_hdoc_functor.hh,
+ * src/content_in_hdoc.cc: New.
+
2011-05-17 Guillaume Lazzara <z(a)lrde.epita.fr>
Fix use of skeleton_constrained.
diff --git a/scribo/scribo/toolchain/content_in_doc.hh b/scribo/scribo/toolchain/content_in_hdoc.hh
similarity index 85%
copy from scribo/scribo/toolchain/content_in_doc.hh
copy to scribo/scribo/toolchain/content_in_hdoc.hh
index 8f6f7a4..97233d5 100644
--- a/scribo/scribo/toolchain/content_in_doc.hh
+++ b/scribo/scribo/toolchain/content_in_hdoc.hh
@@ -23,14 +23,14 @@
// exception does not however invalidate any other reasons why the
// executable file might be covered by the GNU General Public License.
-#ifndef SCRIBO_TOOLCHAIN_CONTENT_IN_DOC_HH
-# define SCRIBO_TOOLCHAIN_CONTENT_IN_DOC_HH
+#ifndef SCRIBO_TOOLCHAIN_CONTENT_IN_HDOC_HH
+# define SCRIBO_TOOLCHAIN_CONTENT_IN_HDOC_HH
/// \file
///
/// Analyse a document.
-# include <scribo/toolchain/internal/content_in_doc_functor.hh>
+# include <scribo/toolchain/internal/content_in_hdoc_functor.hh>
namespace scribo
{
@@ -43,7 +43,7 @@ namespace scribo
template <typename I, typename J>
document<mln_ch_value(I, def::lbl_type)>
- content_in_doc(const Image<I>& input, const Image<J>& input_preproc,
+ content_in_hdoc(const Image<I>& input, const Image<J>& input_preproc,
bool denoise,
bool find_line_seps = true,
bool find_whitespace_seps = true,
@@ -56,7 +56,7 @@ namespace scribo
template <typename I, typename J>
document<mln_ch_value(I, def::lbl_type)>
- content_in_doc(const Image<I>& input, const Image<J>& input_preproc,
+ content_in_hdoc(const Image<I>& input, const Image<J>& input_preproc,
bool denoise,
bool find_line_seps = true,
bool find_whitespace_seps = true,
@@ -66,7 +66,7 @@ namespace scribo
mln_precondition(exact(input).is_valid());
mln_precondition(exact(input_preproc).is_valid());
- internal::content_in_doc_functor<J> f("noname");
+ internal::content_in_hdoc_functor<J> f("noname");
f.enable_denoising = denoise;
f.enable_line_seps = find_line_seps;
f.enable_whitespace_seps = find_whitespace_seps;
@@ -87,5 +87,5 @@ namespace scribo
} // end of namespace scribo
-#endif // SCRIBO_TOOLCHAIN_CONTENT_IN_DOC_HH
+#endif // SCRIBO_TOOLCHAIN_CONTENT_IN_HDOC_HH
diff --git a/scribo/scribo/toolchain/internal/content_in_doc_functor.hh b/scribo/scribo/toolchain/internal/content_in_hdoc_functor.hh
similarity index 87%
copy from scribo/scribo/toolchain/internal/content_in_doc_functor.hh
copy to scribo/scribo/toolchain/internal/content_in_hdoc_functor.hh
index d60f3cc..92db8a7 100644
--- a/scribo/scribo/toolchain/internal/content_in_doc_functor.hh
+++ b/scribo/scribo/toolchain/internal/content_in_hdoc_functor.hh
@@ -24,19 +24,20 @@
// exception does not however invalidate any other reasons why the
// executable file might be covered by the GNU General Public License.
-#ifndef SCRIBO_TOOLCHAIN_INTERNAL_CONTENT_IN_DOC_FUNCTOR_HH
-# define SCRIBO_TOOLCHAIN_INTERNAL_CONTENT_IN_DOC_FUNCTOR_HH
+#ifndef SCRIBO_TOOLCHAIN_INTERNAL_CONTENT_IN_HDOC_FUNCTOR_HH
+# define SCRIBO_TOOLCHAIN_INTERNAL_CONTENT_IN_HDOC_FUNCTOR_HH
# include <scribo/core/def/lbl_type.hh>
# include <scribo/core/document.hh>
# include <scribo/core/line_set.hh>
# include <scribo/core/paragraph_set.hh>
-# include <scribo/primitive/extract/non_text.hh>
+# include <scribo/primitive/extract/non_text_hdoc.hh>
# include <scribo/primitive/extract/components.hh>
-# include <scribo/primitive/extract/separators.hh>
-# include <scribo/primitive/extract/vertical_separators.hh>
-# include <scribo/primitive/extract/horizontal_separators.hh>
+// # include <scribo/primitive/extract/separators.hh>
+// # include <scribo/primitive/extract/vertical_separators.hh>
+// # include <scribo/primitive/extract/horizontal_separators.hh>
+# include <scribo/primitive/extract/lines_h_thick_and_thin.hh>
# include <scribo/primitive/extract/alignments.hh>
@@ -44,6 +45,8 @@
# include <scribo/primitive/remove/separators.hh>
+# include <scribo/preprocessing/rotate_90.hh>
+
# include <scribo/filter/line_links_x_height.hh>
# include <scribo/filter/object_links_bbox_h_ratio.hh>
# include <scribo/filter/objects_small.hh>
@@ -87,13 +90,13 @@ namespace scribo
template <typename I>
- struct content_in_doc_functor
+ struct content_in_hdoc_functor
: public Toolchain_Functor
{
typedef scribo::def::lbl_type V;
typedef mln_ch_value(I,V) L;
- content_in_doc_functor(const char *doc_filename);
+ content_in_hdoc_functor(const char *doc_filename);
virtual int nsteps() const;
@@ -135,7 +138,7 @@ namespace scribo
# ifndef MLN_INCLUDE_ONLY
template <typename I>
- content_in_doc_functor<I>::content_in_doc_functor(const char *doc_filename)
+ content_in_hdoc_functor<I>::content_in_hdoc_functor(const char *doc_filename)
: enable_denoising(true),
enable_line_seps(true),
enable_whitespace_seps(true),
@@ -155,9 +158,9 @@ namespace scribo
template <typename I>
template <typename J>
- scribo::document<typename content_in_doc_functor<I>::L>
- content_in_doc_functor<I>::operator()(const Image<J>& original_image,
- const Image<I>& processed_image)
+ scribo::document<typename content_in_hdoc_functor<I>::L>
+ content_in_hdoc_functor<I>::operator()(const Image<J>& original_image,
+ const Image<I>& processed_image)
{
mln_precondition(exact(original_image).is_valid());
mln_precondition(exact(processed_image).is_valid());
@@ -177,8 +180,11 @@ namespace scribo
// Vertical and horizontal separators
{
mln_ch_value(I,bool)
- vseparators = primitive::extract::vertical_separators(processed_image, 81),
- hseparators = primitive::extract::horizontal_separators(processed_image, 81);
+ vseparators = preprocessing::rotate_90(
+ primitive::extract::lines_h_thick_and_thin(
+ preprocessing::rotate_90(processed_image), 101, 3, 0.05, 0.80, 2), false),
+ hseparators = primitive::extract::lines_h_thick_and_thin(
+ processed_image, 101, 3);
doc.set_vline_separators(vseparators);
doc.set_hline_separators(hseparators);
@@ -196,6 +202,7 @@ namespace scribo
input_cleaned = primitive::remove::separators(processed_image,
separators);
+ doc.set_binary_image_wo_seps(input_cleaned);
on_progress();
}
@@ -204,15 +211,15 @@ namespace scribo
// Debug
if (enable_line_seps)
{
- debug::logger().log_image(debug::AuxiliaryResults,
+ debug::logger().log_image(debug::Special,
doc.vline_seps(),
"vseparators");
- debug::logger().log_image(debug::AuxiliaryResults,
+ debug::logger().log_image(debug::Special,
doc.hline_seps(),
"hseparators");
- debug::logger().log_image(debug::AuxiliaryResults,
+ debug::logger().log_image(debug::Special,
input_cleaned,
"input_wo_separators");
}
@@ -247,6 +254,7 @@ namespace scribo
on_progress();
+
/// Set separator components.
if (enable_line_seps)
components.add_separators(separators);
@@ -371,7 +379,7 @@ namespace scribo
if (debug::logger().is_enabled())
{
if (enable_whitespace_seps)
- debug::logger().log_image(debug::AuxiliaryResults,
+ debug::logger().log_image(debug::Special,
whitespaces, "whitespaces");
// Bboxes image.
@@ -435,6 +443,22 @@ namespace scribo
on_progress();
+ //===== DEBUG =====
+# ifndef SCRIBO_NDEBUG
+ {
+ image2d<bool> tmp = duplicate(input_cleaned);
+ for_all_lines(l, lines)
+ if (lines(l).is_textline())
+ mln::draw::box_plain(tmp, lines(l).bbox(), false);
+
+ debug::logger().log_image(
+ debug::AuxiliaryResults,
+ tmp,
+ "input_wo_text");
+ }
+# endif // ! SCRIBO_NDEBUG
+ //===== END OF DEBUG =====
+
// Text recognition
if (enable_ocr)
{
@@ -543,7 +567,7 @@ namespace scribo
// Extract other Elements
on_new_progress_label("Extracting Elements");
component_set<L>
- elements = scribo::primitive::extract::non_text(doc, 3);
+ elements = scribo::primitive::extract::non_text_hdoc(doc, 31);
on_progress();
@@ -551,6 +575,7 @@ namespace scribo
// Identify other Elements
on_new_progress_label("Identifying Elements");
elements = scribo::primitive::identify(elements);
+
doc.set_elements(elements);
on_progress();
@@ -575,7 +600,7 @@ namespace scribo
template<typename I>
int
- content_in_doc_functor<I>::nsteps() const
+ content_in_hdoc_functor<I>::nsteps() const
{
return 10 + enable_denoising + enable_line_seps
+ enable_whitespace_seps + enable_ocr + save_doc_as_xml;
@@ -584,7 +609,7 @@ namespace scribo
template<typename I>
void
- content_in_doc_functor<I>::on_xml_saved()
+ content_in_hdoc_functor<I>::on_xml_saved()
{
// Nothing
}
@@ -598,4 +623,4 @@ namespace scribo
} // end of namespace scribo
-#endif // ! SCRIBO_TOOLCHAIN_INTERNAL_CONTENT_IN_DOC_FUNCTOR_HH
+#endif // ! SCRIBO_TOOLCHAIN_INTERNAL_CONTENT_IN_HDOC_FUNCTOR_HH
diff --git a/scribo/src/content_in_doc.cc b/scribo/src/content_in_hdoc.cc
similarity index 80%
copy from scribo/src/content_in_doc.cc
copy to scribo/src/content_in_hdoc.cc
index c879504..e0d2258 100644
--- a/scribo/src/content_in_doc.cc
+++ b/scribo/src/content_in_hdoc.cc
@@ -34,7 +34,7 @@
#include <mln/io/pbm/save.hh>
#include <mln/io/magick/load.hh>
-#include <scribo/toolchain/content_in_doc.hh>
+#include <scribo/toolchain/content_in_hdoc.hh>
#include <scribo/toolchain/text_in_doc_preprocess.hh>
#include <scribo/core/document.hh>
@@ -46,6 +46,14 @@
#include <scribo/preprocessing/crop.hh>
#include <scribo/io/xml/save.hh>
+#include <scribo/io/img/save.hh>
+
+
+#include <mln/core/alias/neighb2d.hh>
+#include <mln/labeling/compute.hh>
+#include <mln/labeling/foreground.hh>
+#include <mln/util/timer.hh>
+
const char *args_desc[][2] =
@@ -87,10 +95,15 @@ int main(int argc, char* argv[])
scribo::make::internal::debug_filename_prefix = argv[argc - 1];
}
+ scribo::debug::logger().set_level(scribo::debug::None);
+
trace::entering("main");
Magick::InitializeMagick(*argv);
+ mln::util::timer t;
+ t.start();
+
typedef image2d<scribo::def::lbl_type> L;
image2d<value::rgb8> input;
mln::io::magick::load(input, argv[1]);
@@ -109,6 +122,23 @@ int main(int argc, char* argv[])
}
input_preproc = toolchain::text_in_doc_preprocess(input, false, K);
+
+ // Cleanup components on borders
+ {
+ typedef scribo::def::lbl_type V;
+ V nlabels;
+ image2d<V> lbl = labeling::foreground(input_preproc, c8(), nlabels);
+ mln::util::array<box2d>
+ bbox = labeling::compute(accu::shape::bbox<point2d>(), lbl, nlabels);
+
+ const box2d& b = input.domain();
+ for_all_ncomponents(e, nlabels)
+ if (bbox(e).pmin().row() == b.pmin().row()
+ || bbox(e).pmax().row() == b.pmax().row()
+ || bbox(e).pmin().col() == b.pmin().col()
+ || bbox(e).pmax().col() == b.pmax().col())
+ data::fill(((input_preproc | bbox(e)).rw() | (pw::value(lbl) == pw::cst(e))).rw(), false);
+ }
}
// Optional Cropping
@@ -164,15 +194,20 @@ int main(int argc, char* argv[])
// Text
std::cout << "Analysing document..." << std::endl;
document<L>
- doc = scribo::toolchain::content_in_doc(input, input_preproc, denoise,
- find_line_seps, find_whitespace_seps,
- !language.empty(), language);
+ doc = scribo::toolchain::content_in_hdoc(input, input_preproc, denoise,
+ find_line_seps, find_whitespace_seps,
+ !language.empty(), language);
// Saving results
std::cout << "Saving results..." << std::endl;
- scribo::io::xml::save(doc, argv[2], scribo::io::xml::PageExtended);
scribo::io::xml::save(doc, "page.xml", scribo::io::xml::Page);
- scribo::io::xml::save(doc, "full.xml", scribo::io::xml::Full);
+
+ std::cout << "End of process - " << t << std::endl;
+
+ scribo::io::xml::save(doc, argv[2], scribo::io::xml::PageExtended);
+ scribo::io::img::save(doc, "debug.png", scribo::io::img::DebugWoImage);
+ scribo::io::img::save(doc, "full.png", scribo::io::img::Full);
+ // scribo::io::xml::save(doc, "full.xml", scribo::io::xml::Full);
trace::exiting("main");
}
--
1.5.6.5