scribo/ChangeLog | 5 +
scribo/toolchain/text_in_doc.hh | 294 +++++++++++++++++++++++++++++++++++++++
2 files changed, 299 insertions(+), 0 deletions(-)
create mode 100644 scribo/toolchain/text_in_doc.hh
diff --git a/scribo/ChangeLog b/scribo/ChangeLog
index b682f14..f091afa 100644
--- a/scribo/ChangeLog
+++ b/scribo/ChangeLog
@@ -1,5 +1,10 @@
2010-05-25 Guillaume Lazzara <z(a)lrde.epita.fr>
+ * toolchain/text_in_doc.hh: Introduce a new routine for extracting
+ text in a document.
+2010-05-25 Guillaume Lazzara <z(a)lrde.epita.fr>
* core/def/lbl_type.hh: Introduce a global label type.
2010-05-25 Guillaume Lazzara <z(a)lrde.epita.fr>
diff --git a/scribo/toolchain/text_in_doc.hh b/scribo/toolchain/text_in_doc.hh
new file mode 100644
index 0000000..50db26a
--- /dev/null
+++ b/scribo/toolchain/text_in_doc.hh
@@ -0,0 +1,294 @@
+// Copyright (C) 2009, 2010 EPITA Research and Development Laboratory
+// (LRDE)
+// This file is part of Olena.
+// Olena is free software: you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free
+// Software Foundation, version 2 of the License.
+// Olena is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// General Public License for more details.
+// You should have received a copy of the GNU General Public License
+// along with Olena. If not, see <http://www.gnu.org/licenses/>.
+// As a special exception, you may use this file as part of a free
+// software project without restriction. Specifically, if other files
+// instantiate templates or use macros or inline functions from this
+// file, or you compile this file and link it with other files to produce
+// an executable, this file does not by itself cause the resulting
+// executable to be covered by the GNU General Public License. This
+// exception does not however invalidate any other reasons why the
+// executable file might be covered by the GNU General Public License.
+/// \file
+/// Extract text from a document.
+# include <fstream>
+# include <iostream>
+# include <mln/io/pbm/save.hh>
+# include <mln/io/ppm/save.hh>
+# include <scribo/core/def/lbl_type.hh>
+# include <scribo/primitive/extract/components.hh>
+# include <scribo/primitive/extract/vertical_separators.hh>
+# include <scribo/primitive/remove/separators.hh>
+# include <scribo/filter/object_links_bbox_h_ratio.hh>
+# include <scribo/filter/objects_small.hh>
+# include <scribo/primitive/group/from_single_link.hh>
+# include <scribo/primitive/link/merge_double_link.hh>
+# include <scribo/primitive/link/internal/dmax_width_and_height.hh>
+# include <scribo/primitive/link/with_single_left_link_dmax_ratio.hh>
+# include <scribo/primitive/link/with_single_right_link_dmax_ratio.hh>
+# include <scribo/preprocessing/denoise_fg.hh>
+# include <scribo/text/recognition.hh>
+# include <scribo/text/merging.hh>
+# include <scribo/make/debug_filename.hh>
+# include <scribo/debug/save_bboxes_image.hh>
+# include <scribo/debug/bboxes_enlarged_image.hh>
+# include <scribo/debug/mean_and_base_lines_image.hh>
+# include <scribo/debug/looks_like_a_text_line_image.hh>
+namespace scribo
+{
+
+  namespace toolchain
+  {
+
+    /// \brief Extract the text lines of a document image.
+    ///
+    /// The routine removes vertical separators, optionally denoises
+    /// the result, extracts and filters connected components, links
+    /// them left/right, groups the validated links into lines, merges
+    /// the lines and finally runs text recognition on them.
+    ///
+    /// \param[in] input   The document image.  Intermediate results are
+    ///                    stored in image2d<bool>, so this is presumably
+    ///                    a binary 2D image -- TODO confirm.
+    /// \param[in] denoise If true, preprocessing::denoise_fg() is applied
+    ///                    once the separators have been removed.
+    /// \param[in] debug   If true, progress messages are written to
+    ///                    std::cout and intermediate results are saved
+    ///                    under names built by
+    ///                    scribo::make::debug_filename().
+    ///
+    /// \return The line set obtained after merging and recognition.
+    //
+    template <typename I>
+    line_set<mln_ch_value(I, def::lbl_type)>
+    text_in_doc(const Image<I>& input, bool denoise, bool debug = false);
+
+
+# ifndef MLN_INCLUDE_ONLY
+
+    // The default value of \p debug is specified on the declaration
+    // above only: repeating it on the definition is ill-formed.
+    template <typename I>
+    line_set<mln_ch_value(I, def::lbl_type)>
+    text_in_doc(const Image<I>& input, bool denoise, bool debug)
+    {
+      // NOTE(review): the label type is hard-coded here whereas the
+      // return type is expressed with def::lbl_type -- confirm that
+      // both designate the same image type.
+      typedef value::label<30> V;
+      typedef image2d<V> L;
+
+      // Add whitespace separators.
+      //       win::rectangle2d win = win::rectangle2d(151, 41);
+      //       image2d<bool> whitespaces = morpho::closing::structural(input, win);
+      //       logical::not_inplace(whitespaces);
+
+      // Remove separators
+      if (debug)
+        std::cout << "Find vertical separators..." << std::endl;
+      image2d<bool>
+        separators = primitive::extract::vertical_separators(input, 81);
+
+      if (debug)
+        std::cout << "Remove separators..." << std::endl;
+      image2d<bool>
+        input_cleaned = primitive::remove::separators(input, separators);
+
+      //       whitespaces += separators;
+
+      if (debug)
+      {
+        mln::io::pbm::save(separators,
+                           scribo::make::debug_filename("vseparators.pbm"));
+        //      mln::io::pbm::save(whitespaces, "separators.pbm");
+
+        // NOTE(review): the second argument of this call was missing
+        // from the patch; the file name below is a reconstruction --
+        // confirm against the upstream source.
+        mln::io::pbm::save(input_cleaned,
+                           scribo::make::debug_filename("input_wo_vseparators.pbm"));
+      }
+
+      // Denoise
+      if (denoise)
+      {
+        if (debug)
+          std::cout << "Denoise..." << std::endl;
+
+        input_cleaned = preprocessing::denoise_fg(input_cleaned, c8(), 3);
+
+        if (debug)
+          mln::io::pbm::save(input_cleaned,
+                             scribo::make::debug_filename("denoised.pbm"));
+      }
+
+      /// Finding components.
+      if (debug)
+        std::cout << "Finding components..." << std::endl;
+
+      V ncomponents;
+      component_set<L>
+        components = scribo::primitive::extract::components(input_cleaned,
+                                                            c8(),
+                                                            ncomponents);
+
+      /// Set separator components.
+      components.add_separators(separators);
+      //       components.add_separators(whitespaces);
+
+      components = scribo::filter::components_small(components, 3);
+
+      /// Linking potential objects
+      if (debug)
+        std::cout << "Linking objects..." << std::endl;
+
+      object_links<L> left_link
+        = primitive::link::with_single_left_link_dmax_ratio(
+            components,
+            primitive::link::internal::dmax_width_and_height(1),
+            anchor::MassCenter);
+      object_links<L> right_link
+        = primitive::link::with_single_right_link_dmax_ratio(
+            components,
+            primitive::link::internal::dmax_width_and_height(1),
+            anchor::MassCenter);
+
+      // Validating left and right links.
+      object_links<L>
+        merged_links = primitive::link::merge_double_link(left_link,
+                                                          right_link);
+
+      // Remove links if bboxes have too different sizes.
+      object_links<L> hratio_filtered_links
+        = filter::object_links_bbox_h_ratio(merged_links, 2.5f);
+
+// #ifndef NOUT
+//       if (argc == 4)
+//       {
+//         image2d<value::rgb8>
+//           hratio_decision_image = scribo::debug::decision_image(input,
+//                                                                 merged_links,
+//                                                                 hratio_filtered_links);
+//         io::ppm::save(hratio_decision_image,
+//                       scribo::make::debug_filename("hratio_links_decision_image.ppm"));
+//       }
+// #endif
+
+      object_groups<L>
+        groups = primitive::group::from_single_link(hratio_filtered_links);
+
+      // Construct a line set.
+      line_set<L>
+        lines = scribo::make::line_set(groups);
+
+      //===== DEBUG =====
+      if (debug)
+      {
+        // Bboxes image.
+        scribo::debug::save_bboxes_image(input, lines,
+                                         scribo::make::debug_filename("step1_bboxes.ppm"));
+
+        // Bboxes enlarged
+        mln::io::ppm::save(scribo::debug::bboxes_enlarged_image(input, lines),
+                           scribo::make::debug_filename("step1_bboxes_enlarged.ppm"));
+
+        // Looks like a text line
+        mln::io::ppm::save(scribo::debug::looks_like_a_text_line_image(input, lines),
+                           scribo::make::debug_filename("step1_looks_like_a_text_line.ppm"));
+
+        // mean and base lines.
+        mln::io::ppm::save(scribo::debug::mean_and_base_lines_image(input, lines),
+                           scribo::make::debug_filename("step1_x_height.ppm"));
+      }
+      //===== END OF DEBUG =====
+
+      if (debug)
+        std::cout << "Merging lines..." << std::endl;
+      lines = scribo::text::merging(lines);
+
+      //===== DEBUG =====
+      if (debug)
+      {
+        // mean and base lines.
+        mln::io::ppm::save(scribo::debug::mean_and_base_lines_image(input, lines),
+                           scribo::make::debug_filename("step2_x_height.ppm"));
+
+        // Looks like a text line
+        mln::io::ppm::save(scribo::debug::looks_like_a_text_line_image(input, lines),
+                           scribo::make::debug_filename("step2_looks_like_a_text_line.ppm"));
+
+        // Bboxes image.
+        scribo::debug::save_bboxes_image(input, lines,
+                                         scribo::make::debug_filename("step2_bboxes.ppm"));
+      }
+
+      if (debug)
+      {
+        // Dump the attributes of every line that is neither merged,
+        // ignored nor pathological, one line per record.
+        //
+        // NOTE(review): the stream declarator was missing from the
+        // patch; the file name below is a reconstruction -- confirm
+        // against the upstream source.
+        std::ofstream
+          file(scribo::make::debug_filename("step2_x_height.txt").c_str());
+
+        for_all_lines(l, lines)
+          if (lines(l).tag() != line::Merged
+              && lines(l).tag() != line::Ignored
+              && lines(l).tag() != line::Pathological)
+          {
+            file << lines(l).bbox().pmin().row() << " "
+                 << lines(l).bbox().pmin().col() << " "
+                 << lines(l).bbox().pmax().row() << " "
+                 << lines(l).bbox().pmax().col() << " "
+                 << lines(l).card() << " "
+                 << lines(l).baseline() << " "
+                 << lines(l).x_height() << " "
+                 << lines(l).meanline() << " "
+                 << lines(l).d_height() << " "
+                 << lines(l).a_height() << " "
+                 << lines(l).char_space() << " "
+                 << lines(l).char_width() << std::endl;
+          }
+
+        file.close();
+      }
+      //===== END OF DEBUG =====
+
+      // "fra" is presumably the recognition language -- TODO confirm
+      // against scribo::text::recognition().
+      scribo::text::recognition(lines, "fra");
+
+      return lines;
+    }
+
+# endif // ! MLN_INCLUDE_ONLY
+
+  } // end of namespace scribo::toolchain
+
+} // end of namespace scribo