* scribo/primitive/extract/non_text_hdoc.hh: Make parameters
depend on the image size.
* scribo/toolchain/internal/content_in_hdoc_functor.hh: Add new
filters.
---
scribo/ChangeLog | 10 ++
scribo/scribo/primitive/extract/non_text_hdoc.hh | 34 ++++-
.../toolchain/internal/content_in_hdoc_functor.hh | 148 +++++++-------------
3 files changed, 86 insertions(+), 106 deletions(-)
diff --git a/scribo/ChangeLog b/scribo/ChangeLog
index b1be73d..ecece77 100644
--- a/scribo/ChangeLog
+++ b/scribo/ChangeLog
@@ -1,5 +1,15 @@
2011-05-26 Guillaume Lazzara <z(a)lrde.epita.fr>
+ Improve and clean up Results in hdoc toolchain.
+
+ * scribo/primitive/extract/non_text_hdoc.hh: Make parameters
+ depend on the image size.
+
+ * scribo/toolchain/internal/content_in_hdoc_functor.hh: Add new
+ filters.
+
+2011-05-26 Guillaume Lazzara <z(a)lrde.epita.fr>
+
* scribo/primitive/extract/lines_h_thick_and_thin.hh: Improve
result quality.
diff --git a/scribo/scribo/primitive/extract/non_text_hdoc.hh b/scribo/scribo/primitive/extract/non_text_hdoc.hh
index 97e1f0e..b851bde 100644
--- a/scribo/scribo/primitive/extract/non_text_hdoc.hh
+++ b/scribo/scribo/primitive/extract/non_text_hdoc.hh
@@ -98,27 +98,47 @@ namespace scribo
mln_ch_value(L,bool)
element_image = duplicate(doc.binary_image_wo_seps());
- for_all_lines(l, doc.lines())
- if (doc.lines()(l).is_textline())
- mln::draw::box_plain(element_image, doc.lines()(l).bbox(), false);
+ // Mask text areas.
+ const paragraph_set<L>& parset = doc.paragraphs();
+ for_all_paragraphs(p, parset)
+ if (parset(p).is_valid())
+ for_all_paragraph_lines(l, parset(p).line_ids())
+ {
+ line_id_t lid = parset(p).line_ids()(l);
+ mln::draw::box_plain(element_image, doc.lines()(lid).bbox(), false);
+ }
element_image = morpho::closing::structural(element_image,
win::rectangle2d(closing_size,
closing_size));
+ // Debug
+ {
+ debug::logger().log_image(debug::AuxiliaryResults,
+ element_image,
+ "non_text_hdoc_element_image");
+ }
+
mln_value(L) ncomps;
+
+ // FIXME: we should not tag elements as image here since we
+ // just don't know!
component_set<L>
elements = primitive::extract::components(element_image,
- c8(), ncomps);
+ c8(), ncomps,
+ component::Image);
elements = scribo::filter::components_small(elements, 200);
elements = scribo::filter::components_on_border(elements);
- elements = scribo::filter::objects_v_thin(elements, 100);
- elements = scribo::filter::objects_h_thin(elements, 100);
+
+ elements = scribo::filter::objects_v_thin(elements,
+ 0.03 * doc.image().domain().height());
+ elements = scribo::filter::objects_h_thin(elements,
+ 0.03 * doc.image().domain().width());
// Debug
{
- debug::logger().log_image(debug::Special,
+ debug::logger().log_image(debug::Results,
elements.labeled_image(),
"non_text_hdoc_components");
}
diff --git a/scribo/scribo/toolchain/internal/content_in_hdoc_functor.hh b/scribo/scribo/toolchain/internal/content_in_hdoc_functor.hh
index ef33b31..e0c5b50 100644
--- a/scribo/scribo/toolchain/internal/content_in_hdoc_functor.hh
+++ b/scribo/scribo/toolchain/internal/content_in_hdoc_functor.hh
@@ -27,6 +27,10 @@
#ifndef SCRIBO_TOOLCHAIN_INTERNAL_CONTENT_IN_HDOC_FUNCTOR_HH
# define SCRIBO_TOOLCHAIN_INTERNAL_CONTENT_IN_HDOC_FUNCTOR_HH
+# ifndef SCRIBO_NDEBUG
+# include <mln/util/timer.hh>
+# endif // ! SCRIBO_NDEBUG
+
# include <scribo/core/def/lbl_type.hh>
# include <scribo/core/document.hh>
# include <scribo/core/line_set.hh>
@@ -34,9 +38,6 @@
# include <scribo/primitive/extract/non_text_hdoc.hh>
# include <scribo/primitive/extract/components.hh>
-// # include <scribo/primitive/extract/separators.hh>
-// # include <scribo/primitive/extract/vertical_separators.hh>
-// # include <scribo/primitive/extract/horizontal_separators.hh>
# include <scribo/primitive/extract/lines_h_thick_and_thin.hh>
# include <scribo/primitive/extract/alignments.hh>
@@ -50,6 +51,11 @@
# include <scribo/filter/line_links_x_height.hh>
# include <scribo/filter/object_links_bbox_h_ratio.hh>
# include <scribo/filter/objects_small.hh>
+# include <scribo/filter/paragraphs_bbox_overlap.hh>
+# include <scribo/filter/paragraphs_in_image.hh>
+# include <scribo/filter/separators_in_element.hh>
+# include <scribo/filter/separators_in_paragraph.hh>
+# include <scribo/filter/images_in_paragraph.hh>
# include <scribo/primitive/group/from_single_link.hh>
@@ -198,7 +204,7 @@ namespace scribo
mln_ch_value(I,bool)
vseparators = preprocessing::rotate_90(
primitive::extract::lines_h_thick_and_thin(
- preprocessing::rotate_90(processed_image), 101, 3, 0.2, 0.6, 1), false),
+ preprocessing::rotate_90(processed_image), 101, 3, 0.2, 0.6, 10), false),
hseparators = primitive::extract::lines_h_thick_and_thin(
processed_image, 101, 3);
@@ -226,27 +232,31 @@ namespace scribo
// Debug
if (enable_line_seps)
{
- debug::logger().log_image(debug::Special,
+ debug::logger().log_image(debug::AuxiliaryResults,
doc.vline_seps(),
"vseparators");
- debug::logger().log_image(debug::Special,
+ debug::logger().log_image(debug::AuxiliaryResults,
doc.hline_seps(),
"hseparators");
- debug::logger().log_image(debug::Special,
+ debug::logger().log_image(debug::AuxiliaryResults,
input_cleaned,
"input_wo_separators");
}
# endif // ! SCRIBO_NDEBUG
+ unsigned min_area = std::min(0.005 * doc.image().domain().width(),
+ 0.005 * doc.image().domain().height());
// Denoise
if (enable_denoising)
{
on_new_progress_label("Denoise...");
- input_cleaned = preprocessing::denoise_fg(input_cleaned, c8(), 10);
+ std::cout << ">> min_area = " << min_area <<
std::endl;
+
+ input_cleaned = preprocessing::denoise_fg(input_cleaned, c8(), min_area);
// Debug
# ifndef SCRIBO_NDEBUG
@@ -286,7 +296,7 @@ namespace scribo
on_new_progress_label("Filtering components");
- components = scribo::filter::components_small(components, 10);
+ components = scribo::filter::components_small(components, min_area);
on_progress();
@@ -297,6 +307,7 @@ namespace scribo
object_links<L> left_link
= primitive::link::with_single_left_link_dmax_ratio(
components,
+// primitive::link::internal::dmax_width_and_height(1),
primitive::link::internal::dmax_default(1),
anchor::MassCenter);
@@ -304,6 +315,7 @@ namespace scribo
object_links<L> right_link
= primitive::link::with_single_right_link_dmax_ratio(
components,
+// primitive::link::internal::dmax_width_and_height(1),
primitive::link::internal::dmax_default(1),
anchor::MassCenter);
@@ -398,7 +410,7 @@ namespace scribo
if (debug::logger().is_enabled())
{
if (enable_whitespace_seps)
- debug::logger().log_image(debug::Special,
+ debug::logger().log_image(debug::AuxiliaryResults,
whitespaces, "whitespaces");
// Bboxes image.
@@ -428,7 +440,7 @@ namespace scribo
# endif // ! SCRIBO_NDEBUG
//===== END OF DEBUG =====
-
+ on_new_progress_label("Merging segmented lines");
lines = scribo::text::merging(lines);
@@ -488,96 +500,17 @@ namespace scribo
on_progress();
}
-// // Link text lines
-// on_new_progress_label("Linking text lines");
-// line_links<L> llinks = scribo::text::link_lines(lines);
-
-
-// //===== DEBUG =====
-// # ifndef SCRIBO_NDEBUG
-// if (debug::logger().is_enabled())
-// {
-// image2d<value::rgb8>
-// debug = data::convert(value::rgb8(), original_image);
-// for_all_lines(l, lines)
-// {
-// if (! lines(l).is_textline())
-// continue;
-
-// mln::draw::box(debug, lines(l).bbox(), literal::blue);
-// mln::draw::line(debug, lines(l).bbox().pcenter(),
-// lines(llinks(l)).bbox().pcenter(), literal::green);
-// }
-
-// debug::logger().log_image(debug::AuxiliaryResults,
-// debug, "links_raw");
-// }
-// # endif // ! SCRIBO_NDEBUG
-// //===== END OF DEBUG =====
-
-// on_progress();
-
-
-// // Filter line links.
-// on_new_progress_label("Filter line links");
-// llinks = scribo::filter::line_links_x_height(llinks);
-
-// //===== DEBUG =====
-// # ifndef SCRIBO_NDEBUG
-// if (debug::logger().is_enabled())
-// {
-// image2d<value::rgb8>
-// debug = data::convert(value::rgb8(), original_image);
-// for_all_links(i, llinks)
-// if (llinks(i) && llinks(i) != i)
-// mln::draw::line(debug, lines(i).bbox().pcenter(),
-// lines(llinks(i)).bbox().pcenter(), literal::red);
-
-// debug::logger().log_image(debug::AuxiliaryResults,
-// debug, "links");
-
-// for (unsigned i = 1; i < llinks.nelements(); ++i)
-// llinks(i) = scribo::make::internal::find_root(llinks, i);
-
-// debug = data::convert(value::rgb8(), original_image);
-// mln::util::array<accu::shape::bbox<point2d> >
-// nbbox(llinks.nelements());
-
-// for_all_lines(i, lines)
-// {
-// if (! lines(i).is_textline())
-// continue;
-
-// mln::draw::box(debug, lines(i).bbox(), literal::red);
-// nbbox(llinks(i)).take(lines(i).bbox());
-// }
-
-// for (unsigned i = 1; i < nbbox.nelements(); ++i)
-// if (nbbox(i).is_valid())
-// {
-// box2d b = nbbox(i).to_result();
-// mln::draw::box(debug, b, literal::green);
-// b.enlarge(1);
-// mln::draw::box(debug, b, literal::green);
-// b.enlarge(1);
-// mln::draw::box(debug, b, literal::green);
-// }
-
-// debug::logger().log_image(debug::AuxiliaryResults,
-// debug, "par");
-// }
-// # endif // ! SCRIBO_NDEBUG
-// //===== END OF DEBUG =====
-
-// on_progress();
-
-
-// // Construct paragraphs
-// on_new_progress_label("Constructing paragraphs");
-// scribo::paragraph_set<L> parset = scribo::make::paragraph(llinks);
+ on_new_progress_label("Extracting paragraphs");
scribo::paragraph_set<L>
parset = extract_paragraphs(lines, doc.binary_image());
+
+ on_progress();
+
+ on_new_progress_label("Filtering paragraphs");
+
+ parset = filter::paragraphs_bbox_overlap(parset);
+
doc.set_paragraphs(parset);
on_progress();
@@ -585,8 +518,16 @@ namespace scribo
// Extract other Elements
on_new_progress_label("Extracting Elements");
+
+ unsigned closing_size = std::min(0.01 * doc.image().domain().width(),
+ 0.01 * doc.image().domain().height());
+ if (!(closing_size % 2))
+ closing_size += 1;
+
+ std::cout << ">> CLosing size = " << closing_size <<
std::endl;
+
component_set<L>
- elements = scribo::primitive::extract::non_text_hdoc(doc, 31);
+ elements = scribo::primitive::extract::non_text_hdoc(doc, closing_size);
on_progress();
@@ -599,6 +540,15 @@ namespace scribo
on_progress();
+ on_new_progress_label("Cleanup miscellaneous false positive");
+
+ filter::separators_in_element(doc);
+ filter::separators_in_paragraph(doc);
+ filter::paragraphs_in_image(doc);
+ filter::images_in_paragraph(doc);
+
+ on_progress();
+
// Saving results
--
1.5.6.5