* scribo/filter/paragraphs_in_borders.hh,
* scribo/filter/separators_in_borders.hh,
* scribo/filter/separators_vert_in_borders.hh: New.
* scribo/filter/images_in_paragraph.hh,
* scribo/filter/paragraphs_bbox_overlap.hh,
* scribo/filter/paragraphs_in_image.hh,
* scribo/filter/separators_in_element.hh,
* scribo/filter/separators_in_paragraph.hh: Improve filtering.
* scribo/toolchain/internal/content_in_hdoc_functor.hh: Make use
of new filters.
---
scribo/ChangeLog | 17 ++
scribo/scribo/filter/images_in_paragraph.hh | 8 +-
scribo/scribo/filter/paragraphs_bbox_overlap.hh | 175 ++++++++++++-----
scribo/scribo/filter/paragraphs_in_borders.hh | 140 +++++++++++++
scribo/scribo/filter/paragraphs_in_image.hh | 29 +++-
scribo/scribo/filter/separators_in_borders.hh | 206 ++++++++++++++++++++
scribo/scribo/filter/separators_in_element.hh | 84 ++++----
scribo/scribo/filter/separators_in_paragraph.hh | 92 +++++----
scribo/scribo/filter/separators_vert_in_borders.hh | 143 ++++++++++++++
.../toolchain/internal/content_in_hdoc_functor.hh | 58 +++++-
10 files changed, 799 insertions(+), 153 deletions(-)
create mode 100644 scribo/scribo/filter/paragraphs_in_borders.hh
create mode 100644 scribo/scribo/filter/separators_in_borders.hh
create mode 100644 scribo/scribo/filter/separators_vert_in_borders.hh
diff --git a/scribo/ChangeLog b/scribo/ChangeLog
index 84564da..450c4d5 100644
--- a/scribo/ChangeLog
+++ b/scribo/ChangeLog
@@ -1,5 +1,22 @@
2011-06-07 Guillaume Lazzara <z(a)lrde.epita.fr>
+ Improve output cleanup for historical document toolchain.
+
+ * scribo/filter/paragraphs_in_borders.hh,
+ * scribo/filter/separators_in_borders.hh,
+ * scribo/filter/separators_vert_in_borders.hh: New.
+
+ * scribo/filter/images_in_paragraph.hh,
+ * scribo/filter/paragraphs_bbox_overlap.hh,
+ * scribo/filter/paragraphs_in_image.hh,
+ * scribo/filter/separators_in_element.hh,
+ * scribo/filter/separators_in_paragraph.hh: Improve filtering.
+
+ * scribo/toolchain/internal/content_in_hdoc_functor.hh: Make use
+ of new filters.
+
+2011-06-07 Guillaume Lazzara <z(a)lrde.epita.fr>
+
* scribo/util/component_precise_outline.hh: Add support for
labeled_image.
diff --git a/scribo/scribo/filter/images_in_paragraph.hh b/scribo/scribo/filter/images_in_paragraph.hh
index e05b202..3cf64e1 100644
--- a/scribo/scribo/filter/images_in_paragraph.hh
+++ b/scribo/scribo/filter/images_in_paragraph.hh
@@ -101,12 +101,12 @@ namespace scribo
// => Ignore it.
if (tl && tr && ml && mc && mr && bl
&& br)
elts(c).update_tag(component::Ignored);
-
- // FIXME: warning this call may produce inconsistent data
- // Ignored components are still in the separator image...
- doc.set_elements(elts);
}
+ // FIXME: warning this call may produce inconsistent data
+ // Ignored components are still in the separator image...
+ doc.set_elements(elts);
+
trace::exiting("scribo::filter::images_in_paragraph");
}
diff --git a/scribo/scribo/filter/paragraphs_bbox_overlap.hh b/scribo/scribo/filter/paragraphs_bbox_overlap.hh
index aa1c8ac..188a77e 100644
--- a/scribo/scribo/filter/paragraphs_bbox_overlap.hh
+++ b/scribo/scribo/filter/paragraphs_bbox_overlap.hh
@@ -41,6 +41,7 @@
# include <scribo/core/paragraph_set.hh>
+#include <mln/labeling/colorize.hh>
namespace scribo
{
@@ -59,7 +60,7 @@ namespace scribo
/// Paragraph::Ignored.
template <typename L>
paragraph_set<L>
- paragraphs_bbox_overlap(const paragraph_set<L>& paragraphs);
+ paragraphs_bbox_overlap(const paragraph_set<L>& parset);
# ifndef MLN_INCLUDE_ONLY
@@ -70,23 +71,23 @@ namespace scribo
template <typename L>
struct order_paragraphs_id
{
- order_paragraphs_id(const scribo::paragraph_set<L>& paragraphs)
- : paragraphs_(paragraphs)
+ order_paragraphs_id(const scribo::paragraph_set<L>& parset)
+ : parset_(parset)
{
}
bool operator()(const scribo::paragraph_id_t& l1,
const scribo::paragraph_id_t& l2) const
{
- const unsigned l1_nsites = paragraphs_(l1).bbox().nsites();
- const unsigned l2_nsites = paragraphs_(l2).bbox().nsites();
+ const unsigned l1_nsites = parset_(l1).bbox().nsites();
+ const unsigned l2_nsites = parset_(l2).bbox().nsites();
if (l1_nsites == l2_nsites)
return l1 > l2;
return l1_nsites > l2_nsites;
}
- scribo::paragraph_set<L> paragraphs_;
+ scribo::paragraph_set<L> parset_;
};
} // end of namespace scribo::filter::internal
@@ -94,74 +95,150 @@ namespace scribo
template <typename L>
paragraph_set<L>
- paragraphs_bbox_overlap(const paragraph_set<L>& paragraphs)
+ paragraphs_bbox_overlap(const paragraph_set<L>& parset)
{
trace::entering("scribo::filter::paragraphs_bbox_overlap");
- mln_precondition(paragraphs.is_valid());
+ mln_precondition(parset.is_valid());
- L billboard;
- initialize(billboard, paragraphs.lines().components().labeled_image());
+ mln_ch_value(L, paragraph_id_t) billboard;
+ initialize(billboard, parset.lines().components().labeled_image());
data::fill(billboard, 0);
- mln::util::array<bool> not_to_ignore(paragraphs.nelements() + 1, true);
+ mln::util::array<bool> not_to_ignore(parset.nelements() + 1, true);
not_to_ignore(0) = false;
- for_all_paragraphs(cur_id, paragraphs)
+ paragraph_set<L> output = parset.duplicate();
+
+ mln::util::array<paragraph_id_t> candidate;
+ candidate.reserve(parset.nelements());
+ for_all_paragraphs(cur_id, parset)
+ if (parset(cur_id).is_valid())
+ candidate.append(cur_id);
+
+ std::sort(candidate.hook_std_vector_().begin(),
+ candidate.hook_std_vector_().end(),
+ internal::order_paragraphs_id<L>(parset));
+
+ for_all_elements(e, candidate)
{
- const box2d& b_ = paragraphs(cur_id).bbox();
+ paragraph_id_t cur_id = candidate(e);
+
+ const box2d& b_ = parset(cur_id).bbox();
- if (paragraphs(cur_id).nlines() > 1)
+ if (parset(cur_id).nlines() > 3)
{
mln::draw::box_plain(billboard, b_, cur_id);
continue;
}
- const unsigned tl = billboard(b_.pmin());
- const unsigned tr = billboard.at_(b_.pmin().row(), b_.pmax().col());
- const unsigned ml = billboard.at_(b_.pcenter().row(), b_.pmin().col());
const unsigned mc = billboard.at_(b_.pcenter().row(), b_.pcenter().col());
- const unsigned mr = billboard.at_(b_.pcenter().row(), b_.pmax().col());
- const unsigned bl = billboard.at_(b_.pmax().row(), b_.pmin().col());
- const unsigned br = billboard(b_.pmax());
-
- typedef std::set<unsigned> set_t;
- set_t labels;
- labels.insert(tl);
- labels.insert(tl);
- labels.insert(tr);
- labels.insert(ml);
- labels.insert(mc);
- labels.insert(mr);
- labels.insert(bl);
- labels.insert(br);
-
- for (set_t::const_iterator it = labels.begin();
- it != labels.end();
- ++it)
- if (not_to_ignore(*it))
+
+ // Box is mostly in the background => do nothing.
+ if (mc == 0)
+ {
+ mln::draw::box_plain(billboard, b_, cur_id);
+ continue;
+ }
+ else // Bbox center is inside another box. Check if we can
+ // merge the current box with it.
+ {
+ // Consider other potential overlapping bboxes.
+ const unsigned tl = billboard(b_.pmin());
+ const unsigned tr = billboard.at_(b_.pmin().row(), b_.pmax().col());
+ const unsigned bl = billboard.at_(b_.pmax().row(), b_.pmin().col());
+ const unsigned br = billboard(b_.pmax());
+
+ typedef std::set<unsigned> set_t;
+ set_t labels;
+ labels.insert(tl);
+ labels.insert(tr);
+ labels.insert(mc);
+ labels.insert(bl);
+ labels.insert(br);
+
+ // FIXME: check that there are at least 3 points (including
+ // the center) in another paragraph.
+
+ // The potential merged bbox is already ignored or the
+ // current bbox overlaps with several bboxes.
+ // => Ignore current bbox .
+ //
+ if (!not_to_ignore(mc)
+ || (labels.size() > 1 && labels.find(0) == labels.end()))
{
- box2d b2 = paragraphs(*it).bbox();
- box2d b_i = scribo::util::box_intersection(b_, b2);
+ mln::draw::box_plain(billboard, b_, cur_id); // Really?
+ not_to_ignore(cur_id) = false;
+ continue;
+ }
- // si b_ est inclus dans une boite donc le nombre de comp > 1 => invalid juste b_
- // sinon => invalid b_ et b2
- if ((b_i.nsites() / (float)b_.nsites() > 0.4
- || (b_i.nsites() / (float)b2.nsites()) > 0.9))
+ for (set_t::const_iterator it = labels.begin();
+ it != labels.end(); ++it)
+ if (*it)
{
- not_to_ignore(cur_id) = false;
-
- if (paragraphs(*it).nlines() < 4)
- not_to_ignore(*it) = false;
+ mln_assertion(*it != mc);
+
+ box2d b2 = output(*it).bbox();
+ box2d b_i = scribo::util::box_intersection(b_, b2);
+ volatile float
+ b_ratio = b_i.nsites() / (float)b_.nsites();
+
+ // If the bbox is widely included in another box.
+ if (b_ratio > 0.8)
+ {
+ output(mc).fast_merge(output(cur_id));
+ mln::draw::box_plain(billboard, parset(mc).bbox(), mc);
+ }
+ else
+ mln::draw::box_plain(billboard, parset(cur_id).bbox(), cur_id);
+ break;
}
- }
- mln::draw::box_plain(billboard, b_, cur_id);
+ }
}
- paragraph_set<L> output = paragraphs.duplicate();
+ // if (not_to_ignore(*it))
+ // {
+ // box2d b2 = output(*it).bbox();
+ // box2d b_i = scribo::util::box_intersection(b_, b2);
+
+ // volatile float
+ // b_ratio = b_i.nsites() / (float)b_.nsites(),
+ // b2_ratio = b_i.nsites() / (float)b2.nsites();
+
+ // if (b2_ratio == 1)
+ // {
+ // // Merge paragraphs and redraw the new bbox.
+ // output(cur_id).fast_merge(output(*it));
+ // mln::draw::box_plain(billboard, output(cur_id).bbox(), cur_id);
+ // }
+ // else if (b_ratio == 1)
+ // {
+ // // Merge paragraphs and redraw the new bbox.
+ // output(*it).fast_merge(output(cur_id));
+ // mln::draw::box_plain(billboard, output(*it).bbox(), *it);
+ // }
+ // else if ((b_ratio > 0.4 || b2_ratio > 0.9))
+ // {
+ // // si b_ est inclus dans une boite dont le nombre de
+ // // comp > 4 => invalid juste b_ sinon => invalid b_ et
+ // // b2
+ // not_to_ignore(cur_id) = false;
+
+ // if (parset(*it).nlines() < 4)
+ // not_to_ignore(*it) = false;
+ // }
+ // }
+
+ // mln::draw::box_plain(billboard, b_, cur_id);
+ // }
+
output.invalidate(not_to_ignore);
+ for_all_paragraphs(p, output)
+ if (output(p).is_valid())
+ output(p).force_stats_update();
+
trace::exiting("scribo::filter::paragraphs_bbox_overlap");
return output;
}
diff --git a/scribo/scribo/filter/paragraphs_in_borders.hh b/scribo/scribo/filter/paragraphs_in_borders.hh
new file mode 100644
index 0000000..8953282
--- /dev/null
+++ b/scribo/scribo/filter/paragraphs_in_borders.hh
@@ -0,0 +1,140 @@
+// Copyright (C) 2011 EPITA Research and Development Laboratory (LRDE)
+//
+// This file is part of Olena.
+//
+// Olena is free software: you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free
+// Software Foundation, version 2 of the License.
+//
+// Olena is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with Olena. If not, see <http://www.gnu.org/licenses/>.
+//
+// As a special exception, you may use this file as part of a free
+// software project without restriction. Specifically, if other files
+// instantiate templates or use macros or inline functions from this
+// file, or you compile this file and link it with other files to produce
+// an executable, this file does not by itself cause the resulting
+// executable to be covered by the GNU General Public License. This
+// exception does not however invalidate any other reasons why the
+// executable file might be covered by the GNU General Public License.
+
+#ifndef SCRIBO_FILTER_PARAGRAPHS_IN_BORDERS_HH
+# define SCRIBO_FILTER_PARAGRAPHS_IN_BORDERS_HH
+
+/// \file
+///
+/// Invalidate false positive paragraphs.
+/// \fixme Share same test canvas as text::merging.
+
+
+# include <mln/core/concept/image.hh>
+# include <scribo/core/component_set.hh>
+# include <scribo/core/document.hh>
+# include <scribo/util/box_is_included.hh>
+
+namespace scribo
+{
+
+ namespace filter
+ {
+
+ using namespace mln;
+
+
+ /// Invalidate paragraphs located close to the image borders.
+ ///
+ /// \param[in,out] doc A document structure.
+ ///
+ /// Warning: paragraphs are not removed from the document. The
+ /// paragraphs included in a border area are only invalidated in
+ /// the paragraph set, which is then stored back in the document.
+ ///
+ /// \verbatim
+ ///
+ /// -----------
+ /// |_!____!__|
+ /// | !    ! <--------- Paragraphs located in this area are
+ /// | !    !  |         invalidated.
+ /// | !    !  |
+ /// |_!____!__|
+ /// | !    !  |
+ /// -----------
+ ///
+ /// \endverbatim
+ //
+ template <typename L>
+ void
+ paragraphs_in_borders(document<L>& doc);
+
+
+# ifndef MLN_INCLUDE_ONLY
+
+ template <typename L>
+ void
+ paragraphs_in_borders(document<L>& doc)
+ {
+ trace::entering("scribo::filter::paragraphs_in_borders");
+
+ mln_precondition(doc.is_valid());
+
+ const mln::image2d<mln::value::rgb8>& ima = doc.image();
+
+ unsigned border_size = std::min(43., 0.02 * ima.domain().width());
+
+ ///            pt
+ /// ptl X------X---
+ ///     |_!____!__X ptr
+ ///     | !    !  |
+ ///     | !    !  |
+ ///     | !    !  |
+ /// pbl X_!____!__|
+ ///     | !    !  |
+ ///     --X-------X
+ ///       pb     pbr
+ ///
+ point2d
+ ptl = ima.domain().pmin(),
+ pt(geom::min_row(ima), geom::max_col(ima) - border_size),
+ ptr(border_size, geom::max_col(ima)),
+ pbr = ima.domain().pmax(),
+ pb(geom::max_row(ima), border_size),
+ pbl(geom::max_row(ima) - border_size, geom::min_col(ima));
+
+ box2d
+ bt(ptl, ptr),
+ br(pt, pbr),
+ bb(pbl, pbr),
+ bl(ptl, pb);
+
+ // Horizontal paragraphs
+ if (doc.has_text())
+ {
+ paragraph_set<L> parset = doc.paragraphs();
+ for_all_paragraphs(p, parset)
+ if (parset(p).is_valid())
+ if (util::box_is_included(parset(p).bbox(), bt)
+ || util::box_is_included(parset(p).bbox(), br)
+ || util::box_is_included(parset(p).bbox(), bb)
+ || util::box_is_included(parset(p).bbox(), bl))
+ {
+ parset(p).invalidate();
+ }
+
+ doc.set_paragraphs(parset);
+ }
+
+ trace::exiting("scribo::filter::paragraphs_in_borders");
+ }
+
+# endif // ! MLN_INCLUDE_ONLY
+
+ } // end of namespace scribo::filter
+
+} // end of namespace scribo
+
+#endif // ! SCRIBO_FILTER_PARAGRAPHS_IN_BORDERS_HH
diff --git a/scribo/scribo/filter/paragraphs_in_image.hh b/scribo/scribo/filter/paragraphs_in_image.hh
index 1029430..f67b863 100644
--- a/scribo/scribo/filter/paragraphs_in_image.hh
+++ b/scribo/scribo/filter/paragraphs_in_image.hh
@@ -89,8 +89,6 @@ namespace scribo
&& doc.elements()(e).type() == component::Image)
mln::draw::box_plain(billboard, doc.elements()(e).bbox(), true);
- mln::io::pbm::save(billboard, "billboard_parimage.pbm");
-
const paragraph_set<L>& parset = doc.paragraphs();
mln::util::array<bool> not_to_ignore(parset.nelements() + 1, true);
not_to_ignore(0) = false;
@@ -101,15 +99,34 @@ namespace scribo
const bool
tl = billboard(b_.pmin()),
tr = billboard.at_(b_.pmin().row(), b_.pmax().col()),
- ml = billboard.at_(b_.pcenter().row(), b_.pmin().col()),
mc = billboard.at_(b_.pcenter().row(), b_.pcenter().col()),
- mr = billboard.at_(b_.pcenter().row(), b_.pmax().col()),
bl = billboard.at_(b_.pmax().row(), b_.pmin().col()),
br = billboard(b_.pmax());
+ typedef mln::util::set<int> set_t;
+ set_t s;
+ s.insert(tl);
+ s.insert(tr);
+ s.insert(mc);
+ s.insert(bl);
+ s.insert(br);
+
+ if (s.nelements() > 2 || (s.nelements() == 2 && !s.has(0)))
+ continue;
+
// The paragraph is fully included in an image.
- if (tl && tr && ml && mc && mr && bl &&
br)
- not_to_ignore(cur_id) = false;
+ for_all_elements(e, s)
+ if (s[e] != 0
+ && (mc != 0 && mc == s[e]
+ && ((tl == mc && bl == mc)
+ || (tr == mc && br == mc)
+ || (tl == mc && tr == mc)
+ || (bl == mc && br == mc))))
+ {
+// if (tl && tr && ml && mc && mr && bl && br)
+ not_to_ignore(cur_id) = false;
+ break;
+ }
}
paragraph_set<L> output = parset.duplicate();
diff --git a/scribo/scribo/filter/separators_in_borders.hh b/scribo/scribo/filter/separators_in_borders.hh
new file mode 100644
index 0000000..8ccb6b1
--- /dev/null
+++ b/scribo/scribo/filter/separators_in_borders.hh
@@ -0,0 +1,206 @@
+// Copyright (C) 2011 EPITA Research and Development Laboratory (LRDE)
+//
+// This file is part of Olena.
+//
+// Olena is free software: you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free
+// Software Foundation, version 2 of the License.
+//
+// Olena is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with Olena. If not, see <http://www.gnu.org/licenses/>.
+//
+// As a special exception, you may use this file as part of a free
+// software project without restriction. Specifically, if other files
+// instantiate templates or use macros or inline functions from this
+// file, or you compile this file and link it with other files to produce
+// an executable, this file does not by itself cause the resulting
+// executable to be covered by the GNU General Public License. This
+// exception does not however invalidate any other reasons why the
+// executable file might be covered by the GNU General Public License.
+
+#ifndef SCRIBO_FILTER_SEPARATORS_IN_BORDERS_HH
+# define SCRIBO_FILTER_SEPARATORS_IN_BORDERS_HH
+
+/// \file
+///
+/// Invalidate false positive separators.
+/// \fixme Share same test canvas as text::merging.
+
+
+# include <mln/core/concept/image.hh>
+# include <scribo/core/component_set.hh>
+# include <scribo/core/document.hh>
+# include <scribo/util/box_is_included.hh>
+
+namespace scribo
+{
+
+ namespace filter
+ {
+
+ using namespace mln;
+
+
+ /// Invalidate separators located close to the image borders.
+ ///
+ /// \param[in,out] doc A document structure.
+ ///
+ /// Warning: it does not remove separators from the separator
+ /// image. It only invalidates separator components in their
+ /// respective component_set.
+ ///
+ /// \verbatim
+ ///
+ /// -----------
+ /// |_!____!__|
+ /// | !    ! <--------- Separators located in this area are
+ /// | !    !  |         invalidated.
+ /// | !    !  |
+ /// |_!____!__|
+ /// | !    !  |
+ /// -----------
+ ///
+ /// \endverbatim
+ //
+ template <typename L>
+ void
+ separators_in_borders(document<L>& doc, float vratio, float hratio);
+
+
+# ifndef MLN_INCLUDE_ONLY
+
+ template <typename L>
+ void
+ separators_in_borders(document<L>& doc, float vratio, float hratio)
+ {
+ trace::entering("scribo::filter::separators_in_borders");
+
+ mln_precondition(doc.is_valid());
+
+ const mln::image2d<mln::value::rgb8>& ima = doc.image();
+
+ // Horizontal separators
+ if (doc.has_hline_seps())
+ {
+ unsigned border_size = hratio * std::min(ima.domain().width(), ima.domain().height());
+
+ ///            pt
+ /// ptl X------X---
+ ///     |_!____!__X ptr
+ ///     | !    !  |
+ ///     | !    !  |
+ ///     | !    !  |
+ /// pbl X_!____!__|
+ ///     | !    !  |
+ ///     --X-------X
+ ///       pb     pbr
+ ///
+ point2d
+ ptl = ima.domain().pmin(),
+ pt(geom::min_row(ima), geom::max_col(ima) - border_size),
+ ptr(border_size, geom::max_col(ima)),
+ pbr = ima.domain().pmax(),
+ pb(geom::max_row(ima), border_size),
+ pbl(geom::max_row(ima) - border_size, geom::min_col(ima));
+
+ box2d
+ bt(ptl, ptr),
+ br(pt, pbr),
+ bb(pbl, pbr),
+ bl(ptl, pb);
+
+
+ component_set<L> hline = doc.hline_seps_comps().duplicate();
+ for_all_comps(c, hline)
+ if (hline(c).is_valid())
+ if (util::box_is_included(hline(c).bbox(), bt)
+ || util::box_is_included(hline(c).bbox(), br)
+ || util::box_is_included(hline(c).bbox(), bb)
+ || util::box_is_included(hline(c).bbox(), bl))
+ {
+ hline(c).update_tag(component::Ignored);
+ }
+
+ // FIXME: warning this call may produce inconsistent data
+ // Ignored components are still in the separator image...
+ doc.set_hline_separators(doc.hline_seps(), hline);
+ }
+
+
+ // Vertical separators
+ if (doc.has_vline_seps())
+ {
+ unsigned border_size = vratio * std::min(ima.domain().width(), ima.domain().height());
+
+ ///            pt
+ /// ptl X------X---
+ ///     |_!____!__X ptr
+ ///     | !    !  |
+ ///     | !    !  |
+ ///     | !    !  |
+ /// pbl X_!____!__|
+ ///     | !    !  |
+ ///     --X-------X
+ ///       pb     pbr
+ ///
+ point2d
+ ptl = ima.domain().pmin(),
+ pt(geom::min_row(ima), geom::max_col(ima) - border_size),
+ ptr(border_size, geom::max_col(ima)),
+ pbr = ima.domain().pmax(),
+ pb(geom::max_row(ima), border_size),
+ pbl(geom::max_row(ima) - border_size, geom::min_col(ima));
+
+ box2d
+ bt(ptl, ptr),
+ br(pt, pbr),
+ bb(pbl, pbr),
+ bl(ptl, pb);
+
+
+ component_set<L> vline = doc.vline_seps_comps().duplicate();
+ for_all_comps(c, vline)
+ if (vline(c).is_valid())
+ {
+ if (util::box_is_included(vline(c).bbox(), bt)
+ || util::box_is_included(vline(c).bbox(), br)
+ || util::box_is_included(vline(c).bbox(), bb)
+ || util::box_is_included(vline(c).bbox(), bl))
+ {
+ // std::cout << vline(c).bbox() << " is included in ";
+ // if (util::box_is_included(vline(c).bbox(), bt))
+ // std::cout << bt << std::endl;
+ // if (util::box_is_included(vline(c).bbox(), br))
+ // std::cout << br << std::endl;
+ // if (util::box_is_included(vline(c).bbox(), bb))
+ // std::cout << bb << std::endl;
+ // if (util::box_is_included(vline(c).bbox(), bl))
+ // std::cout << bl << std::endl;
+
+ vline(c).update_tag(component::Ignored);
+ }
+ // else
+ // {
+ // std::cout << vline(c).bbox() << " is not included in " << bt << " - " << br << " - " << bb << " - " << bl << std::endl;
+ // }
+ }
+ // FIXME: warning this call may produce inconsistent data
+ // Ignored components are still in the separator image...
+ doc.set_vline_separators(doc.vline_seps(), vline);
+ }
+
+ trace::exiting("scribo::filter::separators_in_borders");
+ }
+
+# endif // ! MLN_INCLUDE_ONLY
+
+ } // end of namespace scribo::filter
+
+} // end of namespace scribo
+
+#endif // ! SCRIBO_FILTER_SEPARATORS_IN_BORDERS_HH
diff --git a/scribo/scribo/filter/separators_in_element.hh b/scribo/scribo/filter/separators_in_element.hh
index 228d82f..a8b0ebb 100644
--- a/scribo/scribo/filter/separators_in_element.hh
+++ b/scribo/scribo/filter/separators_in_element.hh
@@ -90,26 +90,26 @@ namespace scribo
{
component_set<L> hline = doc.hline_seps_comps().duplicate();
for_all_comps(c, hline)
- {
- const mln_box(L)& b_ = hline(c).bbox();
-
- const bool tl = billboard(b_.pmin());
- const bool tr = billboard.at_(b_.pmin().row(), b_.pmax().col());
- const bool ml = billboard.at_(b_.pcenter().row(), b_.pmin().col());
- const bool mc = billboard.at_(b_.pcenter().row(), b_.pcenter().col());
- const bool mr = billboard.at_(b_.pcenter().row(), b_.pmax().col());
- const bool bl = billboard.at_(b_.pmax().row(), b_.pmin().col());
- const bool br = billboard(b_.pmax());
-
- // This separator is included in an element (picture, drawing...)
- // => Ignore it.
- if (tl && tr && ml && mc && mr && bl
&& br)
- hline(c).update_tag(component::Ignored);
-
- // FIXME: warning this call may produce inconsistent data
- // Ignored components are still in the separator image...
- doc.set_hline_separators(doc.hline_seps(), hline);
- }
+ if (hline(c).is_valid())
+ {
+ const mln_box(L)& b_ = hline(c).bbox();
+
+ const bool tl = billboard(b_.pmin());
+ const bool tr = billboard.at_(b_.pmin().row(), b_.pmax().col());
+ const bool ml = billboard.at_(b_.pcenter().row(), b_.pmin().col());
+ const bool mc = billboard.at_(b_.pcenter().row(), b_.pcenter().col());
+ const bool mr = billboard.at_(b_.pcenter().row(), b_.pmax().col());
+ const bool bl = billboard.at_(b_.pmax().row(), b_.pmin().col());
+ const bool br = billboard(b_.pmax());
+
+ // This separator is included in an element (picture, drawing...)
+ // => Ignore it.
+ if (tl && tr && ml && mc && mr && bl
&& br)
+ hline(c).update_tag(component::Ignored);
+ }
+ // FIXME: warning this call may produce inconsistent data
+ // Ignored components are still in the separator image...
+ doc.set_hline_separators(doc.hline_seps(), hline);
}
// Vertical separators
@@ -117,29 +117,29 @@ namespace scribo
{
component_set<L> vline = doc.vline_seps_comps().duplicate();
for_all_comps(c, vline)
- {
- const mln_box(L)& b_ = vline(c).bbox();
-
- const bool tl = billboard(b_.pmin());
- const bool tr = billboard.at_(b_.pmin().row(), b_.pmax().col());
- const bool ml = billboard.at_(b_.pcenter().row(), b_.pmin().col());
- const bool mc = billboard.at_(b_.pcenter().row(), b_.pcenter().col());
- const bool mr = billboard.at_(b_.pcenter().row(), b_.pmax().col());
- const bool bl = billboard.at_(b_.pmax().row(), b_.pmin().col());
- const bool br = billboard(b_.pmax());
-
- // This separator is included in an element (picture, drawing...)
- // => Ignore it.
- if (tl && tr && ml && mc && mr && bl
&& br)
- vline(c).update_tag(component::Ignored);
-
- // FIXME: warning this call may produce inconsistent data
- // Ignored components are still in the separator image...
- doc.set_vline_separators(doc.vline_seps(), vline);
- }
-
- trace::exiting("scribo::filter::separators_in_element");
+ if (vline(c).is_valid())
+ {
+ const mln_box(L)& b_ = vline(c).bbox();
+
+ const bool tl = billboard(b_.pmin());
+ const bool tr = billboard.at_(b_.pmin().row(), b_.pmax().col());
+ const bool ml = billboard.at_(b_.pcenter().row(), b_.pmin().col());
+ const bool mc = billboard.at_(b_.pcenter().row(), b_.pcenter().col());
+ const bool mr = billboard.at_(b_.pcenter().row(), b_.pmax().col());
+ const bool bl = billboard.at_(b_.pmax().row(), b_.pmin().col());
+ const bool br = billboard(b_.pmax());
+
+ // This separator is included in an element (picture, drawing...)
+ // => Ignore it.
+ if (tl && tr && ml && mc && mr && bl
&& br)
+ vline(c).update_tag(component::Ignored);
+ }
+ // FIXME: warning this call may produce inconsistent data
+ // Ignored components are still in the separator image...
+ doc.set_vline_separators(doc.vline_seps(), vline);
}
+
+ trace::exiting("scribo::filter::separators_in_element");
}
# endif // ! MLN_INCLUDE_ONLY
diff --git a/scribo/scribo/filter/separators_in_paragraph.hh b/scribo/scribo/filter/separators_in_paragraph.hh
index 3e7a150..7c157be 100644
--- a/scribo/scribo/filter/separators_in_paragraph.hh
+++ b/scribo/scribo/filter/separators_in_paragraph.hh
@@ -58,14 +58,14 @@ namespace scribo
///
template <typename L>
void
- separators_in_paragraph(document<L>& doc);
+ separators_in_paragraph(document<L>& doc, unsigned hmax_size, unsigned
vmax_size);
# ifndef MLN_INCLUDE_ONLY
template <typename L>
void
- separators_in_paragraph(document<L>& doc)
+ separators_in_paragraph(document<L>& doc, unsigned hmax_size, unsigned
vmax_size)
{
trace::entering("scribo::filter::separators_in_paragraph");
@@ -90,26 +90,28 @@ namespace scribo
{
component_set<L> hline = doc.hline_seps_comps().duplicate();
for_all_comps(c, hline)
- {
- const mln_box(L)& b_ = hline(c).bbox();
-
- const bool tl = billboard(b_.pmin());
- const bool tr = billboard.at_(b_.pmin().row(), b_.pmax().col());
- const bool ml = billboard.at_(b_.pcenter().row(), b_.pmin().col());
- const bool mc = billboard.at_(b_.pcenter().row(), b_.pcenter().col());
- const bool mr = billboard.at_(b_.pcenter().row(), b_.pmax().col());
- const bool bl = billboard.at_(b_.pmax().row(), b_.pmin().col());
- const bool br = billboard(b_.pmax());
-
- // This separator is included in an element (picture, drawing...)
- // => Ignore it.
- if (tl && tr && ml && mc && mr && bl
&& br)
- hline(c).update_tag(component::Ignored);
-
- // FIXME: warning this call may produce inconsistent data
- // Ignored components are still in the separator image...
- doc.set_hline_separators(doc.hline_seps(), hline);
- }
+ if (hline(c).is_valid())
+ {
+ const mln_box(L)& b_ = hline(c).bbox();
+
+ const bool tl = billboard(b_.pmin());
+ const bool tr = billboard.at_(b_.pmin().row(), b_.pmax().col());
+ const bool ml = billboard.at_(b_.pcenter().row(), b_.pmin().col());
+ const bool mc = billboard.at_(b_.pcenter().row(), b_.pcenter().col());
+ const bool mr = billboard.at_(b_.pcenter().row(), b_.pmax().col());
+ const bool bl = billboard.at_(b_.pmax().row(), b_.pmin().col());
+ const bool br = billboard(b_.pmax());
+
+ // This separator is included in an element (picture, drawing...)
+ // => Ignore it.
+ if (tl && tr && ml && mc && mr && bl
&& br
+ && hline(c).bbox().width() < hmax_size)
+ hline(c).update_tag(component::Ignored);
+ }
+
+ // FIXME: warning this call may produce inconsistent data
+ // Ignored components are still in the separator image...
+ doc.set_hline_separators(doc.hline_seps(), hline);
}
// Vertical separators
@@ -117,29 +119,31 @@ namespace scribo
{
component_set<L> vline = doc.vline_seps_comps().duplicate();
for_all_comps(c, vline)
- {
- const mln_box(L)& b_ = vline(c).bbox();
-
- const bool tl = billboard(b_.pmin());
- const bool tr = billboard.at_(b_.pmin().row(), b_.pmax().col());
- const bool ml = billboard.at_(b_.pcenter().row(), b_.pmin().col());
- const bool mc = billboard.at_(b_.pcenter().row(), b_.pcenter().col());
- const bool mr = billboard.at_(b_.pcenter().row(), b_.pmax().col());
- const bool bl = billboard.at_(b_.pmax().row(), b_.pmin().col());
- const bool br = billboard(b_.pmax());
-
- // This separator is included in an element (picture, drawing...)
- // => Ignore it.
- if (tl && tr && ml && mc && mr && bl
&& br)
- vline(c).update_tag(component::Ignored);
-
- // FIXME: warning this call may produce inconsistent data
- // Ignored components are still in the separator image...
- doc.set_vline_separators(doc.vline_seps(), vline);
- }
-
- trace::exiting("scribo::filter::separators_in_paragraph");
+ if (vline(c).is_valid())
+ {
+ const mln_box(L)& b_ = vline(c).bbox();
+
+ const bool tl = billboard(b_.pmin());
+ const bool tr = billboard.at_(b_.pmin().row(), b_.pmax().col());
+ const bool ml = billboard.at_(b_.pcenter().row(), b_.pmin().col());
+ const bool mc = billboard.at_(b_.pcenter().row(), b_.pcenter().col());
+ const bool mr = billboard.at_(b_.pcenter().row(), b_.pmax().col());
+ const bool bl = billboard.at_(b_.pmax().row(), b_.pmin().col());
+ const bool br = billboard(b_.pmax());
+
+ // This separator is included in an element (picture, drawing...)
+ // => Ignore it.
+ if (tl && tr && ml && mc && mr && bl
&& br
+ && vline(c).bbox().height() < vmax_size)
+ vline(c).update_tag(component::Ignored);
+ }
+
+ // FIXME: warning this call may produce inconsistent data
+ // Ignored components are still in the separator image...
+ doc.set_vline_separators(doc.vline_seps(), vline);
}
+
+ trace::exiting("scribo::filter::separators_in_paragraph");
}
# endif // ! MLN_INCLUDE_ONLY
diff --git a/scribo/scribo/filter/separators_vert_in_borders.hh b/scribo/scribo/filter/separators_vert_in_borders.hh
new file mode 100644
index 0000000..4a9e806
--- /dev/null
+++ b/scribo/scribo/filter/separators_vert_in_borders.hh
@@ -0,0 +1,143 @@
+// Copyright (C) 2011 EPITA Research and Development Laboratory (LRDE)
+//
+// This file is part of Olena.
+//
+// Olena is free software: you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free
+// Software Foundation, version 2 of the License.
+//
+// Olena is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with Olena. If not, see <http://www.gnu.org/licenses/>.
+//
+// As a special exception, you may use this file as part of a free
+// software project without restriction. Specifically, if other files
+// instantiate templates or use macros or inline functions from this
+// file, or you compile this file and link it with other files to produce
+// an executable, this file does not by itself cause the resulting
+// executable to be covered by the GNU General Public License. This
+// exception does not however invalidate any other reasons why the
+// executable file might be covered by the GNU General Public License.
+
+#ifndef SCRIBO_FILTER_SEPARATORS_VERT_IN_BORDERS_HH
+# define SCRIBO_FILTER_SEPARATORS_VERT_IN_BORDERS_HH
+
+/// \file
+///
+/// Invalidate false positive separators.
+/// \fixme Share same test canvas as text::merging.
+
+
+# include <mln/core/concept/image.hh>
+# include <scribo/core/component_set.hh>
+# include <scribo/core/document.hh>
+# include <scribo/util/box_is_included.hh>
+
+
+namespace scribo
+{
+
+ namespace filter
+ {
+
+ using namespace mln;
+
+
+ /// Invalidate vertical separators located close to the image borders.
+ ///
+ /// \param[in,out] doc A document structure.
+ ///
+ /// Warning: it does not remove separators from the separator
+ /// image. It only invalidates separator components in their
+ /// respective component_set.
+ ///
+ /// \verbatim
+ ///
+ /// -----------
+ /// |_!____!__|
+ /// | ! ! <--------- Separators located in this area are
+ /// | ! ! | invalidated.
+ /// | ! ! |
+ /// |_!____!__|
+ /// | ! ! |
+ /// -----------
+ ///
+ /// \endverbatim
+ //
+ template <typename L>
+ void
+ separators_vert_in_borders(document<L>& doc);
+
+
+# ifndef MLN_INCLUDE_ONLY
+
+ template <typename L>
+ void
+ separators_vert_in_borders(document<L>& doc)
+ {
+ trace::entering("scribo::filter::separators_vert_in_borders");
+
+ mln_precondition(doc.is_valid());
+
+ const mln::image2d<mln::value::rgb8>& ima = doc.image();
+ // Border thickness: 5% of the image width, capped at 43 (presumably pixels).
+ float border_size = std::min(43., 0.05 * ima.domain().width());
+
+ /// pt
+ /// ptl X------X---
+ /// |_!____!__X ptr
+ /// | ! ! |
+ /// | ! ! |
+ /// | ! ! |
+ /// pbl X_!____!__|
+ /// | ! ! |
+ /// --X-------X
+ /// pb pbr
+ /// Corner points (see sketch above) delimiting the four border bands.
+ point2d
+ ptl = ima.domain().pmin(),
+ pt(geom::min_row(ima), geom::max_col(ima) - border_size),
+ ptr(border_size, geom::max_col(ima)),
+ pbr = ima.domain().pmax(),
+ pb(geom::max_row(ima), border_size),
+ pbl(geom::max_row(ima) - border_size, geom::min_col(ima));
+ // bt, br, bb, bl: top, right, bottom and left border bands.
+ box2d
+ bt(ptl, ptr),
+ br(pt, pbr),
+ bb(pbl, pbr),
+ bl(ptl, pb);
+
+ // Vertical separators: tag as Ignored those whose bbox is in a border band.
+ if (doc.has_vline_seps())
+ {
+ component_set<L> vline = doc.vline_seps_comps().duplicate();
+ for_all_comps(c, vline)
+ if (vline(c).is_valid())
+ if (util::box_is_included(vline(c).bbox(), bt)
+ || util::box_is_included(vline(c).bbox(), br)
+ || util::box_is_included(vline(c).bbox(), bb)
+ || util::box_is_included(vline(c).bbox(), bl))
+ {
+ vline(c).update_tag(component::Ignored);
+ }
+ // Store the retagged copy of the component set back into the document.
+ // FIXME: warning this call may produce inconsistent data
+ // Ignored components are still in the separator image...
+ doc.set_vline_separators(doc.vline_seps(), vline);
+ }
+
+ trace::exiting("scribo::filter::separators_vert_in_borders");
+ }
+
+# endif // ! MLN_INCLUDE_ONLY
+
+ } // end of namespace scribo::filter
+
+} // end of namespace scribo
+
+#endif // ! SCRIBO_FILTER_SEPARATORS_VERT_IN_BORDERS_HH
diff --git a/scribo/scribo/toolchain/internal/content_in_hdoc_functor.hh
b/scribo/scribo/toolchain/internal/content_in_hdoc_functor.hh
index e0c5b50..24d24a3 100644
--- a/scribo/scribo/toolchain/internal/content_in_hdoc_functor.hh
+++ b/scribo/scribo/toolchain/internal/content_in_hdoc_functor.hh
@@ -53,8 +53,10 @@
# include <scribo/filter/objects_small.hh>
# include <scribo/filter/paragraphs_bbox_overlap.hh>
# include <scribo/filter/paragraphs_in_image.hh>
+# include <scribo/filter/paragraphs_in_borders.hh>
# include <scribo/filter/separators_in_element.hh>
# include <scribo/filter/separators_in_paragraph.hh>
+# include <scribo/filter/separators_in_borders.hh>
# include <scribo/filter/images_in_paragraph.hh>
# include <scribo/primitive/group/from_single_link.hh>
@@ -66,6 +68,8 @@
# include <scribo/preprocessing/denoise_fg.hh>
+# include <scribo/postprocessing/images_to_drop_capital.hh>
+
# include <scribo/text/recognition.hh>
# include <scribo/text/merging.hh>
# include <scribo/text/link_lines.hh>
@@ -84,6 +88,7 @@
# include <scribo/io/xml/save.hh>
+#include <scribo/io/img/save.hh>
namespace scribo
{
@@ -201,12 +206,22 @@ namespace scribo
// Vertical and horizontal separators
{
+ unsigned closing_size = std::min(0.01 * doc.image().domain().width(),
+ 0.01 * doc.image().domain().height());
+ win::hline2d hl(closing_size);
+
+ // Apply a closing::structural in order to reconnect the
+ // disconnected parts of a single separator.
mln_ch_value(I,bool)
vseparators = preprocessing::rotate_90(
- primitive::extract::lines_h_thick_and_thin(
- preprocessing::rotate_90(processed_image), 101, 3, 0.2, 0.6, 10), false),
- hseparators = primitive::extract::lines_h_thick_and_thin(
- processed_image, 101, 3);
+ morpho::closing::structural(
+ primitive::extract::lines_h_thick_and_thin(
+ preprocessing::rotate_90(processed_image),
+ 101, 3, 0.2, 0.6, 10), hl), false),
+
+ hseparators = morpho::closing::structural(
+ primitive::extract::lines_h_thick_and_thin(
+ processed_image, 101, 3), hl);
doc.set_vline_separators(vseparators);
doc.set_hline_separators(hseparators);
@@ -509,9 +524,11 @@ namespace scribo
on_new_progress_label("Filtering paragraphs");
- parset = filter::paragraphs_bbox_overlap(parset);
+ paragraph_set<L> parset_f = filter::paragraphs_bbox_overlap(parset);
+ doc.set_paragraphs(parset_f);
- doc.set_paragraphs(parset);
+ // parset = filter::paragraphs_bbox_overlap(parset);
+ // doc.set_paragraphs(parset);
on_progress();
@@ -540,16 +557,38 @@ namespace scribo
on_progress();
+// TEMPORARY DEBUG
+ on_new_progress_label("Saving debug data");
+ doc.set_paragraphs(parset);
+ scribo::io::img::save(doc, "debug_wo_filter.png",
scribo::io::img::DebugWoImage);
+ scribo::io::img::save(doc, "full_wo_filter.png",
scribo::io::img::DebugWithImage);
+ doc.set_paragraphs(parset_f);
+ on_progress();
+// END OF TEMPORARY DEBUG
+
on_new_progress_label("Cleanup miscellaneous false positive");
filter::separators_in_element(doc);
- filter::separators_in_paragraph(doc);
+ filter::separators_in_paragraph(doc, 81, 121);
+ filter::separators_in_borders(doc, 0.05, 0.02);
+
filter::paragraphs_in_image(doc);
- filter::images_in_paragraph(doc);
+ filter::paragraphs_in_borders(doc);
on_progress();
+ on_new_progress_label("Rebuild extracted images");
+ elements = scribo::primitive::extract::non_text_hdoc(doc, closing_size);
+ doc.set_elements(elements);
+
+ on_progress();
+
+ on_new_progress_label("Tag images as drop capital");
+
+ postprocessing::images_to_drop_capital(doc);
+
+ on_progress();
// Saving results
if (save_doc_as_xml)
@@ -564,6 +603,9 @@ namespace scribo
on_end();
+
+ sleep(10);
+
return doc;
}
--
1.5.6.5