last-svn-commit-905-ged11b3b Improve output cleanup for historical document toolchain.

* scribo/filter/paragraphs_in_borders.hh, * scribo/filter/separators_in_borders.hh, * scribo/filter/separators_vert_in_borders.hh: New. * scribo/filter/images_in_paragraph.hh, * scribo/filter/paragraphs_bbox_overlap.hh, * scribo/filter/paragraphs_in_image.hh, * scribo/filter/separators_in_element.hh, * scribo/filter/separators_in_paragraph.hh: Improve filtering. * scribo/toolchain/internal/content_in_hdoc_functor.hh: Make use of new filters. --- scribo/ChangeLog | 17 ++ scribo/scribo/filter/images_in_paragraph.hh | 8 +- scribo/scribo/filter/paragraphs_bbox_overlap.hh | 175 ++++++++++++----- scribo/scribo/filter/paragraphs_in_borders.hh | 140 +++++++++++++ scribo/scribo/filter/paragraphs_in_image.hh | 29 +++- scribo/scribo/filter/separators_in_borders.hh | 206 ++++++++++++++++++++ scribo/scribo/filter/separators_in_element.hh | 84 ++++---- scribo/scribo/filter/separators_in_paragraph.hh | 92 +++++---- scribo/scribo/filter/separators_vert_in_borders.hh | 143 ++++++++++++++ .../toolchain/internal/content_in_hdoc_functor.hh | 58 +++++- 10 files changed, 799 insertions(+), 153 deletions(-) create mode 100644 scribo/scribo/filter/paragraphs_in_borders.hh create mode 100644 scribo/scribo/filter/separators_in_borders.hh create mode 100644 scribo/scribo/filter/separators_vert_in_borders.hh diff --git a/scribo/ChangeLog b/scribo/ChangeLog index 84564da..450c4d5 100644 --- a/scribo/ChangeLog +++ b/scribo/ChangeLog @@ -1,5 +1,22 @@ 2011-06-07 Guillaume Lazzara <z@lrde.epita.fr> + Improve output cleanup for historical document toolchain. + + * scribo/filter/paragraphs_in_borders.hh, + * scribo/filter/separators_in_borders.hh, + * scribo/filter/separators_vert_in_borders.hh: New. + + * scribo/filter/images_in_paragraph.hh, + * scribo/filter/paragraphs_bbox_overlap.hh, + * scribo/filter/paragraphs_in_image.hh, + * scribo/filter/separators_in_element.hh, + * scribo/filter/separators_in_paragraph.hh: Improve filtering. 
+ + * scribo/toolchain/internal/content_in_hdoc_functor.hh: Make use + of new filters. + +2011-06-07 Guillaume Lazzara <z@lrde.epita.fr> + * scribo/util/component_precise_outline.hh: Add support for labeled_image. diff --git a/scribo/scribo/filter/images_in_paragraph.hh b/scribo/scribo/filter/images_in_paragraph.hh index e05b202..3cf64e1 100644 --- a/scribo/scribo/filter/images_in_paragraph.hh +++ b/scribo/scribo/filter/images_in_paragraph.hh @@ -101,12 +101,12 @@ namespace scribo // => Ignore it. if (tl && tr && ml && mc && mr && bl && br) elts(c).update_tag(component::Ignored); - - // FIXME: warning this call may produce inconsistent data - // Ignored components are still in the separator image... - doc.set_elements(elts); } + // FIXME: warning this call may produce inconsistent data + // Ignored components are still in the separator image... + doc.set_elements(elts); + trace::exiting("scribo::filter::images_in_paragraph"); } diff --git a/scribo/scribo/filter/paragraphs_bbox_overlap.hh b/scribo/scribo/filter/paragraphs_bbox_overlap.hh index aa1c8ac..188a77e 100644 --- a/scribo/scribo/filter/paragraphs_bbox_overlap.hh +++ b/scribo/scribo/filter/paragraphs_bbox_overlap.hh @@ -41,6 +41,7 @@ # include <scribo/core/paragraph_set.hh> +#include <mln/labeling/colorize.hh> namespace scribo { @@ -59,7 +60,7 @@ namespace scribo /// Paragraph::Ignored. 
template <typename L> paragraph_set<L> - paragraphs_bbox_overlap(const paragraph_set<L>& paragraphs); + paragraphs_bbox_overlap(const paragraph_set<L>& parset); # ifndef MLN_INCLUDE_ONLY @@ -70,23 +71,23 @@ namespace scribo template <typename L> struct order_paragraphs_id { - order_paragraphs_id(const scribo::paragraph_set<L>& paragraphs) - : paragraphs_(paragraphs) + order_paragraphs_id(const scribo::paragraph_set<L>& parset) + : parset_(parset) { } bool operator()(const scribo::paragraph_id_t& l1, const scribo::paragraph_id_t& l2) const { - const unsigned l1_nsites = paragraphs_(l1).bbox().nsites(); - const unsigned l2_nsites = paragraphs_(l2).bbox().nsites(); + const unsigned l1_nsites = parset_(l1).bbox().nsites(); + const unsigned l2_nsites = parset_(l2).bbox().nsites(); if (l1_nsites == l2_nsites) return l1 > l2; return l1_nsites > l2_nsites; } - scribo::paragraph_set<L> paragraphs_; + scribo::paragraph_set<L> parset_; }; } // end of namespace scribo::filter::internal @@ -94,74 +95,150 @@ namespace scribo template <typename L> paragraph_set<L> - paragraphs_bbox_overlap(const paragraph_set<L>& paragraphs) + paragraphs_bbox_overlap(const paragraph_set<L>& parset) { trace::entering("scribo::filter::paragraphs_bbox_overlap"); - mln_precondition(paragraphs.is_valid()); + mln_precondition(parset.is_valid()); - L billboard; - initialize(billboard, paragraphs.lines().components().labeled_image()); + mln_ch_value(L, paragraph_id_t) billboard; + initialize(billboard, parset.lines().components().labeled_image()); data::fill(billboard, 0); - mln::util::array<bool> not_to_ignore(paragraphs.nelements() + 1, true); + mln::util::array<bool> not_to_ignore(parset.nelements() + 1, true); not_to_ignore(0) = false; - for_all_paragraphs(cur_id, paragraphs) + paragraph_set<L> output = parset.duplicate(); + + mln::util::array<paragraph_id_t> candidate; + candidate.reserve(parset.nelements()); + for_all_paragraphs(cur_id, parset) + if (parset(cur_id).is_valid()) + 
candidate.append(cur_id); + + std::sort(candidate.hook_std_vector_().begin(), + candidate.hook_std_vector_().end(), + internal::order_paragraphs_id<L>(parset)); + + for_all_elements(e, candidate) { - const box2d& b_ = paragraphs(cur_id).bbox(); + paragraph_id_t cur_id = candidate(e); + + const box2d& b_ = parset(cur_id).bbox(); - if (paragraphs(cur_id).nlines() > 1) + if (parset(cur_id).nlines() > 3) { mln::draw::box_plain(billboard, b_, cur_id); continue; } - const unsigned tl = billboard(b_.pmin()); - const unsigned tr = billboard.at_(b_.pmin().row(), b_.pmax().col()); - const unsigned ml = billboard.at_(b_.pcenter().row(), b_.pmin().col()); const unsigned mc = billboard.at_(b_.pcenter().row(), b_.pcenter().col()); - const unsigned mr = billboard.at_(b_.pcenter().row(), b_.pmax().col()); - const unsigned bl = billboard.at_(b_.pmax().row(), b_.pmin().col()); - const unsigned br = billboard(b_.pmax()); - - typedef std::set<unsigned> set_t; - set_t labels; - labels.insert(tl); - labels.insert(tl); - labels.insert(tr); - labels.insert(ml); - labels.insert(mc); - labels.insert(mr); - labels.insert(bl); - labels.insert(br); - - for (set_t::const_iterator it = labels.begin(); - it != labels.end(); - ++it) - if (not_to_ignore(*it)) + + // Box is mostly in the background => do nothing. + if (mc == 0) + { + mln::draw::box_plain(billboard, b_, cur_id); + continue; + } + else // Bbox center is inside another box. Check if we can + // merge the current box with it. + { + // Consider other potential overlapping bboxes. 
+ const unsigned tl = billboard(b_.pmin()); + const unsigned tr = billboard.at_(b_.pmin().row(), b_.pmax().col()); + const unsigned bl = billboard.at_(b_.pmax().row(), b_.pmin().col()); + const unsigned br = billboard(b_.pmax()); + + typedef std::set<unsigned> set_t; + set_t labels; + labels.insert(tl); + labels.insert(tr); + labels.insert(mc); + labels.insert(bl); + labels.insert(br); + + // FIXME: check that there are at least 3 points (including + // the center) in another paragraph. + + // The potential merged bbox is already ignored or the + // current bbox overlaps with several bboxes. + // => Ignore current bbox . + // + if (!not_to_ignore(mc) + || (labels.size() > 1 && labels.find(0) == labels.end())) { - box2d b2 = paragraphs(*it).bbox(); - box2d b_i = scribo::util::box_intersection(b_, b2); + mln::draw::box_plain(billboard, b_, cur_id); // Really? + not_to_ignore(cur_id) = false; + continue; + } - // si b_ est inclus dans une boite donc le nombre de comp > 1 => invalid juste b_ - // sinon => invalid b_ et b2 - if ((b_i.nsites() / (float)b_.nsites() > 0.4 - || (b_i.nsites() / (float)b2.nsites()) > 0.9)) + for (set_t::const_iterator it = labels.begin(); + it != labels.end(); ++it) + if (*it) { - not_to_ignore(cur_id) = false; - - if (paragraphs(*it).nlines() < 4) - not_to_ignore(*it) = false; + mln_assertion(*it != mc); + + box2d b2 = output(*it).bbox(); + box2d b_i = scribo::util::box_intersection(b_, b2); + volatile float + b_ratio = b_i.nsites() / (float)b_.nsites(); + + // If the bbox is widely included in another box. 
+ if (b_ratio > 0.8) + { + output(mc).fast_merge(output(cur_id)); + mln::draw::box_plain(billboard, parset(mc).bbox(), mc); + } + else + mln::draw::box_plain(billboard, parset(cur_id).bbox(), cur_id); + break; } - } - mln::draw::box_plain(billboard, b_, cur_id); + } } - paragraph_set<L> output = paragraphs.duplicate(); + // if (not_to_ignore(*it)) + // { + // box2d b2 = output(*it).bbox(); + // box2d b_i = scribo::util::box_intersection(b_, b2); + + // volatile float + // b_ratio = b_i.nsites() / (float)b_.nsites(), + // b2_ratio = b_i.nsites() / (float)b2.nsites(); + + // if (b2_ratio == 1) + // { + // // Merge paragraphs and redraw the new bbox. + // output(cur_id).fast_merge(output(*it)); + // mln::draw::box_plain(billboard, output(cur_id).bbox(), cur_id); + // } + // else if (b_ratio == 1) + // { + // // Merge paragraphs and redraw the new bbox. + // output(*it).fast_merge(output(cur_id)); + // mln::draw::box_plain(billboard, output(*it).bbox(), *it); + // } + // else if ((b_ratio > 0.4 || b2_ratio > 0.9)) + // { + // // si b_ est inclus dans une boite dont le nombre de + // // comp > 4 => invalid juste b_ sinon => invalid b_ et + // // b2 + // not_to_ignore(cur_id) = false; + + // if (parset(*it).nlines() < 4) + // not_to_ignore(*it) = false; + // } + // } + + // mln::draw::box_plain(billboard, b_, cur_id); + // } + output.invalidate(not_to_ignore); + for_all_paragraphs(p, output) + if (output(p).is_valid()) + output(p).force_stats_update(); + trace::exiting("scribo::filter::paragraphs_bbox_overlap"); return output; } diff --git a/scribo/scribo/filter/paragraphs_in_borders.hh b/scribo/scribo/filter/paragraphs_in_borders.hh new file mode 100644 index 0000000..8953282 --- /dev/null +++ b/scribo/scribo/filter/paragraphs_in_borders.hh @@ -0,0 +1,140 @@ +// Copyright (C) 2011 EPITA Research and Development Laboratory (LRDE) +// +// This file is part of Olena. 
+// +// Olena is free software: you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free +// Software Foundation, version 2 of the License. +// +// Olena is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Olena. If not, see <http://www.gnu.org/licenses/>. +// +// As a special exception, you may use this file as part of a free +// software project without restriction. Specifically, if other files +// instantiate templates or use macros or inline functions from this +// file, or you compile this file and link it with other files to produce +// an executable, this file does not by itself cause the resulting +// executable to be covered by the GNU General Public License. This +// exception does not however invalidate any other reasons why the +// executable file might be covered by the GNU General Public License. + +#ifndef SCRIBO_FILTER_PARAGRAPHS_IN_BORDERS_HH +# define SCRIBO_FILTER_PARAGRAPHS_IN_BORDERS_HH + +/// \file +/// +/// Invalidate false positive paragraphs. +/// \fixme Share same test canvas as text::merging. + + +# include <mln/core/concept/image.hh> +# include <scribo/core/component_set.hh> +# include <scribo/core/document.hh> +# include <scribo/util/box_is_included.hh> + +namespace scribo +{ + + namespace filter + { + + using namespace mln; + + + /// Invalidate paragraphs located close to the image borders. + /// + /// \param[in,out] doc A document structure. + /// + /// Warning: it does not remove paragraphs from separator + /// image. It only invalidate separator components in their + /// respective component_set. + /// + /// \verbatim + /// + /// ----------- + /// |_!____!__| + /// | ! ! 
<--------- Paragraphs located in this area are + /// | ! ! | invalidated. + /// | ! ! | + /// |_!____!__| + /// | ! ! | + /// ----------- + /// + /// \endverbatim + // + template <typename L> + void + paragraphs_in_borders(document<L>& doc); + + +# ifndef MLN_INCLUDE_ONLY + + template <typename L> + void + paragraphs_in_borders(document<L>& doc) + { + trace::entering("scribo::filter::paragraphs_in_borders"); + + mln_precondition(doc.is_valid()); + + const mln::image2d<mln::value::rgb8>& ima = doc.image(); + + unsigned border_size = std::min(43., 0.02 * ima.domain().width()); + + /// pt + /// ptl X------X--- + /// |_!____!__X ptr + /// | ! ! | + /// | ! ! | + /// | ! ! | + /// pbl X_!____!__| + /// | ! ! | + /// --X-------X + /// pb pbr + /// + point2d + ptl = ima.domain().pmin(), + pt(geom::min_row(ima), geom::max_col(ima) - border_size), + ptr(border_size, geom::max_col(ima)), + pbr = ima.domain().pmax(), + pb(geom::max_row(ima), border_size), + pbl(geom::max_row(ima) - border_size, geom::min_col(ima)); + + box2d + bt(ptl, ptr), + br(pt, pbr), + bb(pbl, pbr), + bl(ptl, pb); + + // Horizontal paragraphs + if (doc.has_text()) + { + paragraph_set<L> parset = doc.paragraphs(); + for_all_paragraphs(p, parset) + if (parset(p).is_valid()) + if (util::box_is_included(parset(p).bbox(), bt) + || util::box_is_included(parset(p).bbox(), br) + || util::box_is_included(parset(p).bbox(), bb) + || util::box_is_included(parset(p).bbox(), bl)) + { + parset(p).invalidate(); + } + + doc.set_paragraphs(parset); + } + + trace::exiting("scribo::filter::paragraphs_in_borders"); + } + +# endif // ! MLN_INCLUDE_ONLY + + } // end of namespace scribo::filter + +} // end of namespace scribo + +#endif // ! 
SCRIBO_FILTER_PARAGRAPHS_IN_BORDERS_HH diff --git a/scribo/scribo/filter/paragraphs_in_image.hh b/scribo/scribo/filter/paragraphs_in_image.hh index 1029430..f67b863 100644 --- a/scribo/scribo/filter/paragraphs_in_image.hh +++ b/scribo/scribo/filter/paragraphs_in_image.hh @@ -89,8 +89,6 @@ namespace scribo && doc.elements()(e).type() == component::Image) mln::draw::box_plain(billboard, doc.elements()(e).bbox(), true); - mln::io::pbm::save(billboard, "billboard_parimage.pbm"); - const paragraph_set<L>& parset = doc.paragraphs(); mln::util::array<bool> not_to_ignore(parset.nelements() + 1, true); not_to_ignore(0) = false; @@ -101,15 +99,34 @@ namespace scribo const bool tl = billboard(b_.pmin()), tr = billboard.at_(b_.pmin().row(), b_.pmax().col()), - ml = billboard.at_(b_.pcenter().row(), b_.pmin().col()), mc = billboard.at_(b_.pcenter().row(), b_.pcenter().col()), - mr = billboard.at_(b_.pcenter().row(), b_.pmax().col()), bl = billboard.at_(b_.pmax().row(), b_.pmin().col()), br = billboard(b_.pmax()); + typedef mln::util::set<int> set_t; + set_t s; + s.insert(tl); + s.insert(tr); + s.insert(mc); + s.insert(bl); + s.insert(br); + + if (s.nelements() > 2 || (s.nelements() == 2 && !s.has(0))) + continue; + // The paragraph is fully included in an image. 
- if (tl && tr && ml && mc && mr && bl && br) - not_to_ignore(cur_id) = false; + for_all_elements(e, s) + if (s[e] != 0 + && (mc != 0 && mc == s[e] + && ((tl == mc && bl == mc) + || (tr == mc && br == mc) + || (tl == mc && tr == mc) + || (bl == mc && br == mc)))) + { +// if (tl && tr && ml && mc && mr && bl && br) + not_to_ignore(cur_id) = false; + break; + } } paragraph_set<L> output = parset.duplicate(); diff --git a/scribo/scribo/filter/separators_in_borders.hh b/scribo/scribo/filter/separators_in_borders.hh new file mode 100644 index 0000000..8ccb6b1 --- /dev/null +++ b/scribo/scribo/filter/separators_in_borders.hh @@ -0,0 +1,206 @@ +// Copyright (C) 2011 EPITA Research and Development Laboratory (LRDE) +// +// This file is part of Olena. +// +// Olena is free software: you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free +// Software Foundation, version 2 of the License. +// +// Olena is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Olena. If not, see <http://www.gnu.org/licenses/>. +// +// As a special exception, you may use this file as part of a free +// software project without restriction. Specifically, if other files +// instantiate templates or use macros or inline functions from this +// file, or you compile this file and link it with other files to produce +// an executable, this file does not by itself cause the resulting +// executable to be covered by the GNU General Public License. This +// exception does not however invalidate any other reasons why the +// executable file might be covered by the GNU General Public License. 
+ +#ifndef SCRIBO_FILTER_SEPARATORS_IN_BORDERS_HH +# define SCRIBO_FILTER_SEPARATORS_IN_BORDERS_HH + +/// \file +/// +/// Invalidate false positive separators. +/// \fixme Share same test canvas as text::merging. + + +# include <mln/core/concept/image.hh> +# include <scribo/core/component_set.hh> +# include <scribo/core/document.hh> +# include <scribo/util/box_is_included.hh> + +namespace scribo +{ + + namespace filter + { + + using namespace mln; + + + /// Invalidate separators located close to the image borders. + /// + /// \param[in,out] doc A document structure. + /// + /// Warning: it does not remove separators from separator + /// image. It only invalidate separator components in their + /// respective component_set. + /// + /// \verbatim + /// + /// ----------- + /// |_!____!__| + /// | ! ! <--------- Separators located in this area are + /// | ! ! | invalidated. + /// | ! ! | + /// |_!____!__| + /// | ! ! | + /// ----------- + /// + /// \endverbatim + // + template <typename L> + void + separators_in_borders(document<L>& doc, float vratio, float hratio); + + +# ifndef MLN_INCLUDE_ONLY + + template <typename L> + void + separators_in_borders(document<L>& doc, float vratio, float hratio) + { + trace::entering("scribo::filter::separators_in_borders"); + + mln_precondition(doc.is_valid()); + + const mln::image2d<mln::value::rgb8>& ima = doc.image(); + + // Horizontal separators + if (doc.has_hline_seps()) + { + unsigned border_size = hratio * std::min(ima.domain().width(), ima.domain().height()); + + /// pt + /// ptl X------X--- + /// |_!____!__X ptr + /// | ! ! | + /// | ! ! | + /// | ! ! | + /// pbl X_!____!__| + /// | ! ! 
| + /// --X-------X + /// pb pbr + /// + point2d + ptl = ima.domain().pmin(), + pt(geom::min_row(ima), geom::max_col(ima) - border_size), + ptr(border_size, geom::max_col(ima)), + pbr = ima.domain().pmax(), + pb(geom::max_row(ima), border_size), + pbl(geom::max_row(ima) - border_size, geom::min_col(ima)); + + box2d + bt(ptl, ptr), + br(pt, pbr), + bb(pbl, pbr), + bl(ptl, pb); + + + component_set<L> hline = doc.hline_seps_comps().duplicate(); + for_all_comps(c, hline) + if (hline(c).is_valid()) + if (util::box_is_included(hline(c).bbox(), bt) + || util::box_is_included(hline(c).bbox(), br) + || util::box_is_included(hline(c).bbox(), bb) + || util::box_is_included(hline(c).bbox(), bl)) + { + hline(c).update_tag(component::Ignored); + } + + // FIXME: warning this call may produce inconsistent data + // Ignored components are still in the separator image... + doc.set_hline_separators(doc.hline_seps(), hline); + } + + + // Vertical separators + if (doc.has_vline_seps()) + { + unsigned border_size = vratio * std::min(ima.domain().width(), ima.domain().height()); + + /// pt + /// ptl X------X--- + /// |_!____!__X ptr + /// | ! ! | + /// | ! ! | + /// | ! ! | + /// pbl X_!____!__| + /// | ! ! 
| + /// --X-------X + /// pb pbr + /// + point2d + ptl = ima.domain().pmin(), + pt(geom::min_row(ima), geom::max_col(ima) - border_size), + ptr(border_size, geom::max_col(ima)), + pbr = ima.domain().pmax(), + pb(geom::max_row(ima), border_size), + pbl(geom::max_row(ima) - border_size, geom::min_col(ima)); + + box2d + bt(ptl, ptr), + br(pt, pbr), + bb(pbl, pbr), + bl(ptl, pb); + + + component_set<L> vline = doc.vline_seps_comps().duplicate(); + for_all_comps(c, vline) + if (vline(c).is_valid()) + { + if (util::box_is_included(vline(c).bbox(), bt) + || util::box_is_included(vline(c).bbox(), br) + || util::box_is_included(vline(c).bbox(), bb) + || util::box_is_included(vline(c).bbox(), bl)) + { + // std::cout << vline(c).bbox() << " is included in "; + // if (util::box_is_included(vline(c).bbox(), bt)) + // std::cout << bt << std::endl; + // if (util::box_is_included(vline(c).bbox(), br)) + // std::cout << br << std::endl; + // if (util::box_is_included(vline(c).bbox(), bb)) + // std::cout << bb << std::endl; + // if (util::box_is_included(vline(c).bbox(), bl)) + // std::cout << bl << std::endl; + + vline(c).update_tag(component::Ignored); + } + // else + // { + // std::cout << vline(c).bbox() << " is not included in " << bt << " - " << br << " - " << bb << " - " << bl << std::endl; + // } + } + // FIXME: warning this call may produce inconsistent data + // Ignored components are still in the separator image... + doc.set_vline_separators(doc.vline_seps(), vline); + } + + trace::exiting("scribo::filter::separators_in_borders"); + } + +# endif // ! MLN_INCLUDE_ONLY + + } // end of namespace scribo::filter + +} // end of namespace scribo + +#endif // ! 
SCRIBO_FILTER_SEPARATORS_IN_BORDERS_HH diff --git a/scribo/scribo/filter/separators_in_element.hh b/scribo/scribo/filter/separators_in_element.hh index 228d82f..a8b0ebb 100644 --- a/scribo/scribo/filter/separators_in_element.hh +++ b/scribo/scribo/filter/separators_in_element.hh @@ -90,26 +90,26 @@ namespace scribo { component_set<L> hline = doc.hline_seps_comps().duplicate(); for_all_comps(c, hline) - { - const mln_box(L)& b_ = hline(c).bbox(); - - const bool tl = billboard(b_.pmin()); - const bool tr = billboard.at_(b_.pmin().row(), b_.pmax().col()); - const bool ml = billboard.at_(b_.pcenter().row(), b_.pmin().col()); - const bool mc = billboard.at_(b_.pcenter().row(), b_.pcenter().col()); - const bool mr = billboard.at_(b_.pcenter().row(), b_.pmax().col()); - const bool bl = billboard.at_(b_.pmax().row(), b_.pmin().col()); - const bool br = billboard(b_.pmax()); - - // This separator is included in an element (picture, drawing...) - // => Ignore it. - if (tl && tr && ml && mc && mr && bl && br) - hline(c).update_tag(component::Ignored); - - // FIXME: warning this call may produce inconsistent data - // Ignored components are still in the separator image... - doc.set_hline_separators(doc.hline_seps(), hline); - } + if (hline(c).is_valid()) + { + const mln_box(L)& b_ = hline(c).bbox(); + + const bool tl = billboard(b_.pmin()); + const bool tr = billboard.at_(b_.pmin().row(), b_.pmax().col()); + const bool ml = billboard.at_(b_.pcenter().row(), b_.pmin().col()); + const bool mc = billboard.at_(b_.pcenter().row(), b_.pcenter().col()); + const bool mr = billboard.at_(b_.pcenter().row(), b_.pmax().col()); + const bool bl = billboard.at_(b_.pmax().row(), b_.pmin().col()); + const bool br = billboard(b_.pmax()); + + // This separator is included in an element (picture, drawing...) + // => Ignore it. 
+ if (tl && tr && ml && mc && mr && bl && br) + hline(c).update_tag(component::Ignored); + } + // FIXME: warning this call may produce inconsistent data + // Ignored components are still in the separator image... + doc.set_hline_separators(doc.hline_seps(), hline); } // Vertical separators @@ -117,29 +117,29 @@ namespace scribo { component_set<L> vline = doc.vline_seps_comps().duplicate(); for_all_comps(c, vline) - { - const mln_box(L)& b_ = vline(c).bbox(); - - const bool tl = billboard(b_.pmin()); - const bool tr = billboard.at_(b_.pmin().row(), b_.pmax().col()); - const bool ml = billboard.at_(b_.pcenter().row(), b_.pmin().col()); - const bool mc = billboard.at_(b_.pcenter().row(), b_.pcenter().col()); - const bool mr = billboard.at_(b_.pcenter().row(), b_.pmax().col()); - const bool bl = billboard.at_(b_.pmax().row(), b_.pmin().col()); - const bool br = billboard(b_.pmax()); - - // This separator is included in an element (picture, drawing...) - // => Ignore it. - if (tl && tr && ml && mc && mr && bl && br) - vline(c).update_tag(component::Ignored); - - // FIXME: warning this call may produce inconsistent data - // Ignored components are still in the separator image... - doc.set_vline_separators(doc.vline_seps(), vline); - } - - trace::exiting("scribo::filter::separators_in_element"); + if (vline(c).is_valid()) + { + const mln_box(L)& b_ = vline(c).bbox(); + + const bool tl = billboard(b_.pmin()); + const bool tr = billboard.at_(b_.pmin().row(), b_.pmax().col()); + const bool ml = billboard.at_(b_.pcenter().row(), b_.pmin().col()); + const bool mc = billboard.at_(b_.pcenter().row(), b_.pcenter().col()); + const bool mr = billboard.at_(b_.pcenter().row(), b_.pmax().col()); + const bool bl = billboard.at_(b_.pmax().row(), b_.pmin().col()); + const bool br = billboard(b_.pmax()); + + // This separator is included in an element (picture, drawing...) + // => Ignore it. 
+ if (tl && tr && ml && mc && mr && bl && br) + vline(c).update_tag(component::Ignored); + } + // FIXME: warning this call may produce inconsistent data + // Ignored components are still in the separator image... + doc.set_vline_separators(doc.vline_seps(), vline); } + + trace::exiting("scribo::filter::separators_in_element"); } # endif // ! MLN_INCLUDE_ONLY diff --git a/scribo/scribo/filter/separators_in_paragraph.hh b/scribo/scribo/filter/separators_in_paragraph.hh index 3e7a150..7c157be 100644 --- a/scribo/scribo/filter/separators_in_paragraph.hh +++ b/scribo/scribo/filter/separators_in_paragraph.hh @@ -58,14 +58,14 @@ namespace scribo /// template <typename L> void - separators_in_paragraph(document<L>& doc); + separators_in_paragraph(document<L>& doc, unsigned hmax_size, unsigned vmax_size); # ifndef MLN_INCLUDE_ONLY template <typename L> void - separators_in_paragraph(document<L>& doc) + separators_in_paragraph(document<L>& doc, unsigned hmax_size, unsigned vmax_size) { trace::entering("scribo::filter::separators_in_paragraph"); @@ -90,26 +90,28 @@ namespace scribo { component_set<L> hline = doc.hline_seps_comps().duplicate(); for_all_comps(c, hline) - { - const mln_box(L)& b_ = hline(c).bbox(); - - const bool tl = billboard(b_.pmin()); - const bool tr = billboard.at_(b_.pmin().row(), b_.pmax().col()); - const bool ml = billboard.at_(b_.pcenter().row(), b_.pmin().col()); - const bool mc = billboard.at_(b_.pcenter().row(), b_.pcenter().col()); - const bool mr = billboard.at_(b_.pcenter().row(), b_.pmax().col()); - const bool bl = billboard.at_(b_.pmax().row(), b_.pmin().col()); - const bool br = billboard(b_.pmax()); - - // This separator is included in an element (picture, drawing...) - // => Ignore it. - if (tl && tr && ml && mc && mr && bl && br) - hline(c).update_tag(component::Ignored); - - // FIXME: warning this call may produce inconsistent data - // Ignored components are still in the separator image... 
- doc.set_hline_separators(doc.hline_seps(), hline); - } + if (hline(c).is_valid()) + { + const mln_box(L)& b_ = hline(c).bbox(); + + const bool tl = billboard(b_.pmin()); + const bool tr = billboard.at_(b_.pmin().row(), b_.pmax().col()); + const bool ml = billboard.at_(b_.pcenter().row(), b_.pmin().col()); + const bool mc = billboard.at_(b_.pcenter().row(), b_.pcenter().col()); + const bool mr = billboard.at_(b_.pcenter().row(), b_.pmax().col()); + const bool bl = billboard.at_(b_.pmax().row(), b_.pmin().col()); + const bool br = billboard(b_.pmax()); + + // This separator is included in an element (picture, drawing...) + // => Ignore it. + if (tl && tr && ml && mc && mr && bl && br + && hline(c).bbox().width() < hmax_size) + hline(c).update_tag(component::Ignored); + } + + // FIXME: warning this call may produce inconsistent data + // Ignored components are still in the separator image... + doc.set_hline_separators(doc.hline_seps(), hline); } // Vertical separators @@ -117,29 +119,31 @@ namespace scribo { component_set<L> vline = doc.vline_seps_comps().duplicate(); for_all_comps(c, vline) - { - const mln_box(L)& b_ = vline(c).bbox(); - - const bool tl = billboard(b_.pmin()); - const bool tr = billboard.at_(b_.pmin().row(), b_.pmax().col()); - const bool ml = billboard.at_(b_.pcenter().row(), b_.pmin().col()); - const bool mc = billboard.at_(b_.pcenter().row(), b_.pcenter().col()); - const bool mr = billboard.at_(b_.pcenter().row(), b_.pmax().col()); - const bool bl = billboard.at_(b_.pmax().row(), b_.pmin().col()); - const bool br = billboard(b_.pmax()); - - // This separator is included in an element (picture, drawing...) - // => Ignore it. - if (tl && tr && ml && mc && mr && bl && br) - vline(c).update_tag(component::Ignored); - - // FIXME: warning this call may produce inconsistent data - // Ignored components are still in the separator image... 
- doc.set_vline_separators(doc.vline_seps(), vline); - } - - trace::exiting("scribo::filter::separators_in_paragraph"); + if (vline(c).is_valid()) + { + const mln_box(L)& b_ = vline(c).bbox(); + + const bool tl = billboard(b_.pmin()); + const bool tr = billboard.at_(b_.pmin().row(), b_.pmax().col()); + const bool ml = billboard.at_(b_.pcenter().row(), b_.pmin().col()); + const bool mc = billboard.at_(b_.pcenter().row(), b_.pcenter().col()); + const bool mr = billboard.at_(b_.pcenter().row(), b_.pmax().col()); + const bool bl = billboard.at_(b_.pmax().row(), b_.pmin().col()); + const bool br = billboard(b_.pmax()); + + // This separator is included in an element (picture, drawing...) + // => Ignore it. + if (tl && tr && ml && mc && mr && bl && br + && vline(c).bbox().height() < vmax_size) + vline(c).update_tag(component::Ignored); + } + + // FIXME: warning this call may produce inconsistent data + // Ignored components are still in the separator image... + doc.set_vline_separators(doc.vline_seps(), vline); } + + trace::exiting("scribo::filter::separators_in_paragraph"); } # endif // ! MLN_INCLUDE_ONLY diff --git a/scribo/scribo/filter/separators_vert_in_borders.hh b/scribo/scribo/filter/separators_vert_in_borders.hh new file mode 100644 index 0000000..4a9e806 --- /dev/null +++ b/scribo/scribo/filter/separators_vert_in_borders.hh @@ -0,0 +1,143 @@ +// Copyright (C) 2011 EPITA Research and Development Laboratory (LRDE) +// +// This file is part of Olena. +// +// Olena is free software: you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free +// Software Foundation, version 2 of the License. +// +// Olena is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. 
+// +// You should have received a copy of the GNU General Public License +// along with Olena. If not, see <http://www.gnu.org/licenses/>. +// +// As a special exception, you may use this file as part of a free +// software project without restriction. Specifically, if other files +// instantiate templates or use macros or inline functions from this +// file, or you compile this file and link it with other files to produce +// an executable, this file does not by itself cause the resulting +// executable to be covered by the GNU General Public License. This +// exception does not however invalidate any other reasons why the +// executable file might be covered by the GNU General Public License. + +#ifndef SCRIBO_FILTER_SEPARATORS_VERT_IN_BORDERS_HH +# define SCRIBO_FILTER_SEPARATORS_VERT_IN_BORDERS_HH + +/// \file +/// +/// Invalidate false positive separators. +/// \fixme Share same test canvas as text::merging. + + +# include <mln/core/concept/image.hh> +# include <scribo/core/component_set.hh> +# include <scribo/core/document.hh> +# include <scribo/util/box_is_included.hh> + + +namespace scribo +{ + + namespace filter + { + + using namespace mln; + + + /// Invalidate separators located close to the image borders. + /// + /// \param[in,out] doc A document structure. + /// + /// Warning: it does not remove separators from separator + /// image. It only invalidate separator components in their + /// respective component_set. + /// + /// \verbatim + /// + /// ----------- + /// |_!____!__| + /// | ! ! <--------- Separators located in this area are + /// | ! ! | invalidated. + /// | ! ! | + /// |_!____!__| + /// | ! ! 
|
+    ///  -----------
+    ///
+    /// \endverbatim
+    //
+    template <typename L>
+    void
+    separators_vert_in_borders(document<L>& doc);
+
+
+# ifndef MLN_INCLUDE_ONLY
+
+    template <typename L>
+    void
+    separators_vert_in_borders(document<L>& doc)
+    {
+      trace::entering("scribo::filter::separators_vert_in_borders");
+
+      mln_precondition(doc.is_valid());
+
+      const mln::image2d<mln::value::rgb8>& ima = doc.image();
+
+      // Border thickness: 5% of the image width, at most 43 columns.
+      const float border_size = std::min(43., 0.05 * ima.domain().width());
+
+      // Corner points of the four border strips.  Offsets are taken from
+      // the domain bounds, so the domain need not start at (0,0).
+      //             pt
+      //  ptl X------X---
+      //      |_!____!__X ptr
+      //      | !    !  |
+      //  pbl X_!____!__|
+      //      | !    !  |
+      //      --X-------X
+      //        pb      pbr
+      point2d
+        ptl = ima.domain().pmin(),
+        pt(geom::min_row(ima), geom::max_col(ima) - border_size),
+        ptr(geom::min_row(ima) + border_size, geom::max_col(ima)),
+        pbr = ima.domain().pmax(),
+        pb(geom::max_row(ima), geom::min_col(ima) + border_size),
+        pbl(geom::max_row(ima) - border_size, geom::min_col(ima));
+
+      box2d
+        bt(ptl, ptr),  // top strip
+        br(pt, pbr),   // right strip
+        bb(pbl, pbr),  // bottom strip
+        bl(ptl, pb);   // left strip
+
+      // Ignore vertical separators fully included in a border strip.
+      if (doc.has_vline_seps())
+      {
+        component_set<L> vline = doc.vline_seps_comps().duplicate();
+        for_all_comps(c, vline)
+          if (vline(c).is_valid())
+            if (util::box_is_included(vline(c).bbox(), bt)
+                || util::box_is_included(vline(c).bbox(), br)
+                || util::box_is_included(vline(c).bbox(), bb)
+                || util::box_is_included(vline(c).bbox(), bl))
+            {
+              vline(c).update_tag(component::Ignored);
+            }
+
+        // FIXME: warning this call may produce inconsistent data
+        // Ignored components are still in the separator image...
+        doc.set_vline_separators(doc.vline_seps(), vline);
+      }
+
+      trace::exiting("scribo::filter::separators_vert_in_borders");
+    }
+
+# endif // ! MLN_INCLUDE_ONLY
+
+  } // end of namespace scribo::filter
+
+} // end of namespace scribo
+
+#endif // ! 
SCRIBO_FILTER_SEPARATORS_VERT_IN_BORDERS_HH diff --git a/scribo/scribo/toolchain/internal/content_in_hdoc_functor.hh b/scribo/scribo/toolchain/internal/content_in_hdoc_functor.hh index e0c5b50..24d24a3 100644 --- a/scribo/scribo/toolchain/internal/content_in_hdoc_functor.hh +++ b/scribo/scribo/toolchain/internal/content_in_hdoc_functor.hh @@ -53,8 +53,10 @@ # include <scribo/filter/objects_small.hh> # include <scribo/filter/paragraphs_bbox_overlap.hh> # include <scribo/filter/paragraphs_in_image.hh> +# include <scribo/filter/paragraphs_in_borders.hh> # include <scribo/filter/separators_in_element.hh> # include <scribo/filter/separators_in_paragraph.hh> +# include <scribo/filter/separators_in_borders.hh> # include <scribo/filter/images_in_paragraph.hh> # include <scribo/primitive/group/from_single_link.hh> @@ -66,6 +68,8 @@ # include <scribo/preprocessing/denoise_fg.hh> +# include <scribo/postprocessing/images_to_drop_capital.hh> + # include <scribo/text/recognition.hh> # include <scribo/text/merging.hh> # include <scribo/text/link_lines.hh> @@ -84,6 +88,7 @@ # include <scribo/io/xml/save.hh> +#include <scribo/io/img/save.hh> namespace scribo { @@ -201,12 +206,22 @@ namespace scribo // Vertical and horizontal separators { + unsigned closing_size = std::min(0.01 * doc.image().domain().width(), + 0.01 * doc.image().domain().height()); + win::hline2d hl(closing_size); + + // Apply a closing::structural in order to connect disconnected + // parts of a single separator. 
mln_ch_value(I,bool) vseparators = preprocessing::rotate_90( - primitive::extract::lines_h_thick_and_thin( - preprocessing::rotate_90(processed_image), 101, 3, 0.2, 0.6, 10), false), - hseparators = primitive::extract::lines_h_thick_and_thin( - processed_image, 101, 3); + morpho::closing::structural( + primitive::extract::lines_h_thick_and_thin( + preprocessing::rotate_90(processed_image), + 101, 3, 0.2, 0.6, 10), hl), false), + + hseparators = morpho::closing::structural( + primitive::extract::lines_h_thick_and_thin( + processed_image, 101, 3), hl); doc.set_vline_separators(vseparators); doc.set_hline_separators(hseparators); @@ -509,9 +524,11 @@ namespace scribo on_new_progress_label("Filtering paragraphs"); - parset = filter::paragraphs_bbox_overlap(parset); + paragraph_set<L> parset_f = filter::paragraphs_bbox_overlap(parset); + doc.set_paragraphs(parset_f); - doc.set_paragraphs(parset); + // parset = filter::paragraphs_bbox_overlap(parset); + // doc.set_paragraphs(parset); on_progress(); @@ -540,16 +557,38 @@ namespace scribo on_progress(); +// TEMPORARY DEBUG + on_new_progress_label("Saving debug data"); + doc.set_paragraphs(parset); + scribo::io::img::save(doc, "debug_wo_filter.png", scribo::io::img::DebugWoImage); + scribo::io::img::save(doc, "full_wo_filter.png", scribo::io::img::DebugWithImage); + doc.set_paragraphs(parset_f); + on_progress(); +// END OF TEMPORARY DEBUG + on_new_progress_label("Cleanup miscellaneous false positive"); filter::separators_in_element(doc); - filter::separators_in_paragraph(doc); + filter::separators_in_paragraph(doc, 81, 121); + filter::separators_in_borders(doc, 0.05, 0.02); + filter::paragraphs_in_image(doc); - filter::images_in_paragraph(doc); + filter::paragraphs_in_borders(doc); on_progress(); + on_new_progress_label("Rebuild extracted images"); + elements = scribo::primitive::extract::non_text_hdoc(doc, closing_size); + doc.set_elements(elements); + + on_progress(); + + on_new_progress_label("Tag images as drop 
capital"); + + postprocessing::images_to_drop_capital(doc); + + on_progress(); // Saving results if (save_doc_as_xml) @@ -564,6 +603,9 @@ namespace scribo on_end(); + + sleep(10); + return doc; } -- 1.5.6.5
participants (1)
-
Guillaume Lazzara