last-svn-commit-872-g316f515 New specific toolchain for historical documents.

* scribo/toolchain/content_in_hdoc.hh, * scribo/toolchain/internal/content_in_hdoc_functor.hh, * src/content_in_hdoc.cc: New. --- scribo/ChangeLog | 8 ++ .../{content_in_doc.hh => content_in_hdoc.hh} | 14 ++-- ...n_doc_functor.hh => content_in_hdoc_functor.hh} | 69 +++++++++++++------ .../src/{content_in_doc.cc => content_in_hdoc.cc} | 47 ++++++++++++-- 4 files changed, 103 insertions(+), 35 deletions(-) copy scribo/scribo/toolchain/{content_in_doc.hh => content_in_hdoc.hh} (85%) copy scribo/scribo/toolchain/internal/{content_in_doc_functor.hh => content_in_hdoc_functor.hh} (87%) copy scribo/src/{content_in_doc.cc => content_in_hdoc.cc} (80%) diff --git a/scribo/ChangeLog b/scribo/ChangeLog index 0b6042c..d3d7f90 100644 --- a/scribo/ChangeLog +++ b/scribo/ChangeLog @@ -1,3 +1,11 @@ +2011-05-16 Guillaume Lazzara <lazzara@fidji.lrde.epita.fr> + + New specific toolchain for historical documents. + + * scribo/toolchain/content_in_hdoc.hh, + * scribo/toolchain/internal/content_in_hdoc_functor.hh, + * src/content_in_hdoc.cc: New. + 2011-05-17 Guillaume Lazzara <z@lrde.epita.fr> Fix use of skeleton_constrained. diff --git a/scribo/scribo/toolchain/content_in_doc.hh b/scribo/scribo/toolchain/content_in_hdoc.hh similarity index 85% copy from scribo/scribo/toolchain/content_in_doc.hh copy to scribo/scribo/toolchain/content_in_hdoc.hh index 8f6f7a4..97233d5 100644 --- a/scribo/scribo/toolchain/content_in_doc.hh +++ b/scribo/scribo/toolchain/content_in_hdoc.hh @@ -23,14 +23,14 @@ // exception does not however invalidate any other reasons why the // executable file might be covered by the GNU General Public License. -#ifndef SCRIBO_TOOLCHAIN_CONTENT_IN_DOC_HH -# define SCRIBO_TOOLCHAIN_CONTENT_IN_DOC_HH +#ifndef SCRIBO_TOOLCHAIN_CONTENT_IN_HDOC_HH +# define SCRIBO_TOOLCHAIN_CONTENT_IN_HDOC_HH /// \file /// /// Analyse a document. -# include <scribo/toolchain/internal/content_in_doc_functor.hh> +# include <scribo/toolchain/internal/content_in_hdoc_functor.hh> namespace scribo { @@ -43,7 +43,7 @@ namespace scribo template <typename I, typename J> document<mln_ch_value(I, def::lbl_type)> - content_in_doc(const Image<I>& input, const Image<J>& input_preproc, + content_in_hdoc(const Image<I>& input, const Image<J>& input_preproc, bool denoise, bool find_line_seps = true, bool find_whitespace_seps = true, @@ -56,7 +56,7 @@ namespace scribo template <typename I, typename J> document<mln_ch_value(I, def::lbl_type)> - content_in_doc(const Image<I>& input, const Image<J>& input_preproc, + content_in_hdoc(const Image<I>& input, const Image<J>& input_preproc, bool denoise, bool find_line_seps = true, bool find_whitespace_seps = true, @@ -66,7 +66,7 @@ namespace scribo mln_precondition(exact(input).is_valid()); mln_precondition(exact(input_preproc).is_valid()); - internal::content_in_doc_functor<J> f("noname"); + internal::content_in_hdoc_functor<J> f("noname"); f.enable_denoising = denoise; f.enable_line_seps = find_line_seps; f.enable_whitespace_seps = find_whitespace_seps; @@ -87,5 +87,5 @@ namespace scribo } // end of namespace scribo -#endif // SCRIBO_TOOLCHAIN_CONTENT_IN_DOC_HH +#endif // SCRIBO_TOOLCHAIN_CONTENT_IN_HDOC_HH diff --git a/scribo/scribo/toolchain/internal/content_in_doc_functor.hh b/scribo/scribo/toolchain/internal/content_in_hdoc_functor.hh similarity index 87% copy from scribo/scribo/toolchain/internal/content_in_doc_functor.hh copy to scribo/scribo/toolchain/internal/content_in_hdoc_functor.hh index d60f3cc..92db8a7 100644 --- a/scribo/scribo/toolchain/internal/content_in_doc_functor.hh +++ b/scribo/scribo/toolchain/internal/content_in_hdoc_functor.hh @@ -24,19 +24,20 @@ // exception does not however invalidate any other reasons why the // executable file might be covered by the GNU General Public License. -#ifndef SCRIBO_TOOLCHAIN_INTERNAL_CONTENT_IN_DOC_FUNCTOR_HH -# define SCRIBO_TOOLCHAIN_INTERNAL_CONTENT_IN_DOC_FUNCTOR_HH +#ifndef SCRIBO_TOOLCHAIN_INTERNAL_CONTENT_IN_HDOC_FUNCTOR_HH +# define SCRIBO_TOOLCHAIN_INTERNAL_CONTENT_IN_HDOC_FUNCTOR_HH # include <scribo/core/def/lbl_type.hh> # include <scribo/core/document.hh> # include <scribo/core/line_set.hh> # include <scribo/core/paragraph_set.hh> -# include <scribo/primitive/extract/non_text.hh> +# include <scribo/primitive/extract/non_text_hdoc.hh> # include <scribo/primitive/extract/components.hh> -# include <scribo/primitive/extract/separators.hh> -# include <scribo/primitive/extract/vertical_separators.hh> -# include <scribo/primitive/extract/horizontal_separators.hh> +// # include <scribo/primitive/extract/separators.hh> +// # include <scribo/primitive/extract/vertical_separators.hh> +// # include <scribo/primitive/extract/horizontal_separators.hh> +# include <scribo/primitive/extract/lines_h_thick_and_thin.hh> # include <scribo/primitive/extract/alignments.hh> @@ -44,6 +45,8 @@ # include <scribo/primitive/remove/separators.hh> +# include <scribo/preprocessing/rotate_90.hh> + # include <scribo/filter/line_links_x_height.hh> # include <scribo/filter/object_links_bbox_h_ratio.hh> # include <scribo/filter/objects_small.hh> @@ -87,13 +90,13 @@ namespace scribo template <typename I> - struct content_in_doc_functor + struct content_in_hdoc_functor : public Toolchain_Functor { typedef scribo::def::lbl_type V; typedef mln_ch_value(I,V) L; - content_in_doc_functor(const char *doc_filename); + content_in_hdoc_functor(const char *doc_filename); virtual int nsteps() const; @@ -135,7 +138,7 @@ namespace scribo # ifndef MLN_INCLUDE_ONLY template <typename I> - content_in_doc_functor<I>::content_in_doc_functor(const char *doc_filename) + content_in_hdoc_functor<I>::content_in_hdoc_functor(const char *doc_filename) : enable_denoising(true), enable_line_seps(true), enable_whitespace_seps(true), @@ -155,9 +158,9 @@ namespace scribo template <typename I> template <typename J> - scribo::document<typename content_in_doc_functor<I>::L> - content_in_doc_functor<I>::operator()(const Image<J>& original_image, - const Image<I>& processed_image) + scribo::document<typename content_in_hdoc_functor<I>::L> + content_in_hdoc_functor<I>::operator()(const Image<J>& original_image, + const Image<I>& processed_image) { mln_precondition(exact(original_image).is_valid()); mln_precondition(exact(processed_image).is_valid()); @@ -177,8 +180,11 @@ namespace scribo // Vertical and horizontal separators { mln_ch_value(I,bool) - vseparators = primitive::extract::vertical_separators(processed_image, 81), - hseparators = primitive::extract::horizontal_separators(processed_image, 81); + vseparators = preprocessing::rotate_90( + primitive::extract::lines_h_thick_and_thin( + preprocessing::rotate_90(processed_image), 101, 3, 0.05, 0.80, 2), false), + hseparators = primitive::extract::lines_h_thick_and_thin( + processed_image, 101, 3); doc.set_vline_separators(vseparators); doc.set_hline_separators(hseparators); @@ -196,6 +202,7 @@ namespace scribo input_cleaned = primitive::remove::separators(processed_image, separators); + doc.set_binary_image_wo_seps(input_cleaned); on_progress(); } @@ -204,15 +211,15 @@ namespace scribo // Debug if (enable_line_seps) { - debug::logger().log_image(debug::AuxiliaryResults, + debug::logger().log_image(debug::Special, doc.vline_seps(), "vseparators"); - debug::logger().log_image(debug::AuxiliaryResults, + debug::logger().log_image(debug::Special, doc.hline_seps(), "hseparators"); - debug::logger().log_image(debug::AuxiliaryResults, + debug::logger().log_image(debug::Special, input_cleaned, "input_wo_separators"); } @@ -247,6 +254,7 @@ namespace scribo on_progress(); + /// Set separator components. if (enable_line_seps) components.add_separators(separators); @@ -371,7 +379,7 @@ namespace scribo if (debug::logger().is_enabled()) { if (enable_whitespace_seps) - debug::logger().log_image(debug::AuxiliaryResults, + debug::logger().log_image(debug::Special, whitespaces, "whitespaces"); // Bboxes image. @@ -435,6 +443,22 @@ namespace scribo on_progress(); + //===== DEBUG ===== +# ifndef SCRIBO_NDEBUG + { + image2d<bool> tmp = duplicate(input_cleaned); + for_all_lines(l, lines) + if (lines(l).is_textline()) + mln::draw::box_plain(tmp, lines(l).bbox(), false); + + debug::logger().log_image( + debug::AuxiliaryResults, + tmp, + "input_wo_text"); + } +# endif // ! SCRIBO_NDEBUG + //===== END OF DEBUG ===== + // Text recognition if (enable_ocr) { @@ -543,7 +567,7 @@ namespace scribo // Extract other Elements on_new_progress_label("Extracting Elements"); component_set<L> - elements = scribo::primitive::extract::non_text(doc, 3); + elements = scribo::primitive::extract::non_text_hdoc(doc, 31); on_progress(); @@ -551,6 +575,7 @@ namespace scribo // Identify other Elements on_new_progress_label("Identifying Elements"); elements = scribo::primitive::identify(elements); + doc.set_elements(elements); on_progress(); @@ -575,7 +600,7 @@ namespace scribo template<typename I> int - content_in_doc_functor<I>::nsteps() const + content_in_hdoc_functor<I>::nsteps() const { return 10 + enable_denoising + enable_line_seps + enable_whitespace_seps + enable_ocr + save_doc_as_xml; @@ -584,7 +609,7 @@ namespace scribo template<typename I> void - content_in_doc_functor<I>::on_xml_saved() + content_in_hdoc_functor<I>::on_xml_saved() { // Nothing } @@ -598,4 +623,4 @@ namespace scribo } // end of namespace scribo -#endif // ! SCRIBO_TOOLCHAIN_INTERNAL_CONTENT_IN_DOC_FUNCTOR_HH +#endif // ! SCRIBO_TOOLCHAIN_INTERNAL_CONTENT_IN_HDOC_FUNCTOR_HH diff --git a/scribo/src/content_in_doc.cc b/scribo/src/content_in_hdoc.cc similarity index 80% copy from scribo/src/content_in_doc.cc copy to scribo/src/content_in_hdoc.cc index c879504..e0d2258 100644 --- a/scribo/src/content_in_doc.cc +++ b/scribo/src/content_in_hdoc.cc @@ -34,7 +34,7 @@ #include <mln/io/pbm/save.hh> #include <mln/io/magick/load.hh> -#include <scribo/toolchain/content_in_doc.hh> +#include <scribo/toolchain/content_in_hdoc.hh> #include <scribo/toolchain/text_in_doc_preprocess.hh> #include <scribo/core/document.hh> @@ -46,6 +46,14 @@ #include <scribo/preprocessing/crop.hh> #include <scribo/io/xml/save.hh> +#include <scribo/io/img/save.hh> + + +#include <mln/core/alias/neighb2d.hh> +#include <mln/labeling/compute.hh> +#include <mln/labeling/foreground.hh> +#include <mln/util/timer.hh> + const char *args_desc[][2] = @@ -87,10 +95,15 @@ int main(int argc, char* argv[]) scribo::make::internal::debug_filename_prefix = argv[argc - 1]; } + scribo::debug::logger().set_level(scribo::debug::None); + trace::entering("main"); Magick::InitializeMagick(*argv); + mln::util::timer t; + t.start(); + typedef image2d<scribo::def::lbl_type> L; image2d<value::rgb8> input; mln::io::magick::load(input, argv[1]); @@ -109,6 +122,23 @@ int main(int argc, char* argv[]) } input_preproc = toolchain::text_in_doc_preprocess(input, false, K); + + // Cleanup components on borders + { + typedef scribo::def::lbl_type V; + V nlabels; + image2d<V> lbl = labeling::foreground(input_preproc, c8(), nlabels); + mln::util::array<box2d> + bbox = labeling::compute(accu::shape::bbox<point2d>(), lbl, nlabels); + + const box2d& b = input.domain(); + for_all_ncomponents(e, nlabels) + if (bbox(e).pmin().row() == b.pmin().row() + || bbox(e).pmax().row() == b.pmax().row() + || bbox(e).pmin().col() == b.pmin().col() + || bbox(e).pmax().col() == b.pmax().col()) + data::fill(((input_preproc | bbox(e)).rw() | (pw::value(lbl) == pw::cst(e))).rw(), false); + } } // Optional Cropping @@ -164,15 +194,20 @@ int main(int argc, char* argv[]) // Text std::cout << "Analysing document..." << std::endl; document<L> - doc = scribo::toolchain::content_in_doc(input, input_preproc, denoise, - find_line_seps, find_whitespace_seps, - !language.empty(), language); + doc = scribo::toolchain::content_in_hdoc(input, input_preproc, denoise, + find_line_seps, find_whitespace_seps, + !language.empty(), language); // Saving results std::cout << "Saving results..." << std::endl; - scribo::io::xml::save(doc, argv[2], scribo::io::xml::PageExtended); scribo::io::xml::save(doc, "page.xml", scribo::io::xml::Page); - scribo::io::xml::save(doc, "full.xml", scribo::io::xml::Full); + + std::cout << "End of process - " << t << std::endl; + + scribo::io::xml::save(doc, argv[2], scribo::io::xml::PageExtended); + scribo::io::img::save(doc, "debug.png", scribo::io::img::DebugWoImage); + scribo::io::img::save(doc, "full.png", scribo::io::img::Full); + // scribo::io::xml::save(doc, "full.xml", scribo::io::xml::Full); trace::exiting("main"); } -- 1.5.6.5
participants (1)
-
Guillaume Lazzara