last-svn-commit-738-g332c502 Add support for paragraphs.

* scribo/core/document.hh: Add support for paragraphs. * scribo/core/macros.hh: Add new macro for_all_paragraph_lines. * scribo/core/line_links.hh, * scribo/core/paragraph_info.hh, * scribo/core/paragraph_set.hh: New. New structures. * scribo/filter/line_links_x_height.hh, * scribo/text/link_lines.hh: New. New routine. --- scribo/ChangeLog | 15 ++ scribo/scribo/core/document.hh | 20 ++ scribo/scribo/core/line_links.hh | 263 ++++++++++++++++++++ scribo/scribo/core/macros.hh | 3 + scribo/scribo/core/paragraph_info.hh | 120 +++++++++ scribo/scribo/core/paragraph_set.hh | 163 ++++++++++++ .../crop.hh => filter/line_links_x_height.hh} | 69 +++--- scribo/scribo/text/link_lines.hh | 165 ++++++++++++ 8 files changed, 784 insertions(+), 34 deletions(-) create mode 100644 scribo/scribo/core/line_links.hh create mode 100644 scribo/scribo/core/paragraph_info.hh create mode 100644 scribo/scribo/core/paragraph_set.hh copy scribo/scribo/{preprocessing/crop.hh => filter/line_links_x_height.hh} (53%) create mode 100644 scribo/scribo/text/link_lines.hh diff --git a/scribo/ChangeLog b/scribo/ChangeLog index 2415ee4..564ed4d 100644 --- a/scribo/ChangeLog +++ b/scribo/ChangeLog @@ -1,5 +1,20 @@ 2011-01-25 Guillaume Lazzara <z@lrde.epita.fr> + Add support for paragraphs. + + * scribo/core/document.hh: Add support for paragraphs. + + * scribo/core/macros.hh: Add new macro for_all_paragraph_lines. + + * scribo/core/line_links.hh, + * scribo/core/paragraph_info.hh, + * scribo/core/paragraph_set.hh: New. New structures. + + * scribo/filter/line_links_x_height.hh, + * scribo/text/link_lines.hh: New. New routine. + +2011-01-25 Guillaume Lazzara <z@lrde.epita.fr> + * tests/toolchain/nepomuk/text_extraction.cc: Fix test. 
2011-01-18 Guillaume Lazzara <z@lrde.epita.fr> diff --git a/scribo/scribo/core/document.hh b/scribo/scribo/core/document.hh index b6b79df..f4a78ff 100644 --- a/scribo/scribo/core/document.hh +++ b/scribo/scribo/core/document.hh @@ -36,6 +36,7 @@ # include <scribo/core/component_set.hh> # include <scribo/core/line_set.hh> +# include <scribo/core/paragraph_set.hh> namespace scribo { @@ -62,6 +63,9 @@ namespace scribo bool has_text() const; void set_text(const line_set<L>& line); + const paragraph_set<L>& paragraphs() const; + void set_paragraphs(const paragraph_set<L>& parset); + const component_set<L>& elements() const; bool has_elements() const; void set_elements(const component_set<L>& elements); @@ -74,6 +78,7 @@ namespace scribo mln::image2d<mln::value::rgb8> image_; line_set<L> lines_; + paragraph_set<L> parset_; component_set<L> elements_; }; @@ -167,6 +172,21 @@ namespace scribo lines_ = line; } + template <typename L> + const paragraph_set<L>& + document<L>::paragraphs() const + { + return parset_; + } + + + template <typename L> + void + document<L>::set_paragraphs(const paragraph_set<L>& parset) + { + parset_ = parset; + } + template <typename L> const component_set<L>& diff --git a/scribo/scribo/core/line_links.hh b/scribo/scribo/core/line_links.hh new file mode 100644 index 0000000..85c45e8 --- /dev/null +++ b/scribo/scribo/core/line_links.hh @@ -0,0 +1,263 @@ +// Copyright (C) 2011 EPITA Research and Development Laboratory (LRDE) +// +// This file is part of Olena. +// +// Olena is free software: you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free +// Software Foundation, version 2 of the License. +// +// Olena is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. 
+// +// You should have received a copy of the GNU General Public License +// along with Olena. If not, see <http://www.gnu.org/licenses/>. +// +// As a special exception, you may use this file as part of a free +// software project without restriction. Specifically, if other files +// instantiate templates or use macros or inline functions from this +// file, or you compile this file and link it with other files to produce +// an executable, this file does not by itself cause the resulting +// executable to be covered by the GNU General Public License. This +// exception does not however invalidate any other reasons why the +// executable file might be covered by the GNU General Public License. + +#ifndef SCRIBO_CORE_LINE_LINKS_HH +# define SCRIBO_CORE_LINE_LINKS_HH + +/// \file +/// +/// \brief Line links representation. + + +# include <mln/util/array.hh> +# include <mln/util/tracked_ptr.hh> + +# include <scribo/core/line_set.hh> + + +namespace scribo +{ + + using namespace mln; + + // Forward declaration. + template <typename L> class line_links; + + + namespace internal + { + /// Data structure for \c scribo::line_links<L>. + template <typename L> + struct line_links_data + { + line_links_data(); + line_links_data(const line_set<L>& lines, unsigned size); + line_links_data(const line_set<L>& lines, + unsigned size, line_id_t value); + + mln::util::array<line_id_t> line_to_link_; + line_set<L> lines_; + }; + + } // end of namespace scribo::internal + + + + + /// \brief Line links representation. 
+ // + template <typename L> + class line_links + { + typedef internal::line_links_data<L> data_t; + + public: + line_links(); + line_links(const line_set<L>& lines); + line_links(const line_set<L>& lines, line_id_t value); + + const line_set<L>& lines() const; + + bool is_valid() const; + + unsigned nelements() const; + + line_id_t& operator()(line_id_t comp_id); + const line_id_t& operator()(line_id_t comp_id) const; + + const mln::util::array<line_id_t>& line_to_link() const; + + void init(); + + line_links<L> duplicate() const; + + private: + mln::util::tracked_ptr<data_t> data_; + }; + + + template <typename L> + std::ostream& + operator<<(std::ostream& ostr, const line_links<L>& links); + + +# ifndef MLN_INCLUDE_ONLY + + + namespace internal + { + + + /// Data structure for \c scribo::line_links<I>. + template <typename L> + line_links_data<L>::line_links_data() + { + } + + + template <typename L> + line_links_data<L>::line_links_data(const line_set<L>& lines, + unsigned size) + : line_to_link_(size), lines_(lines) + { + }; + + + template <typename L> + line_links_data<L>::line_links_data(const line_set<L>& lines, + unsigned size, line_id_t value) + : line_to_link_(size, value), lines_(lines) + { + }; + + + } // end of namespace scribo::internal + + + + template <typename L> + line_links<L>::line_links() + { + data_ = new data_t(); + } + + + template <typename L> + line_links<L>::line_links(const line_set<L>& lines) + { + data_ = new data_t(lines, value::next(lines.nelements())); + } + + + template <typename L> + line_links<L>::line_links(const line_set<L>& lines, + line_id_t value) + { + data_ = new data_t(lines, value::next(lines.nelements()), + value); + } + + + template <typename L> + const line_set<L>& + line_links<L>::lines() const + { + return data_->lines_; + } + + + template <typename L> + bool + line_links<L>::is_valid() const + { + return data_->lines_.is_valid() + && data_->lines_.nelements() == (this->nelements() - 1); + } + + + template 
<typename L> + unsigned + line_links<L>::nelements() const + { + return data_->line_to_link_.nelements(); + } + + + template <typename L> + line_id_t& + line_links<L>::operator()(line_id_t comp_id) + { + return data_->line_to_link_(comp_id); + } + + + template <typename L> + const line_id_t& + line_links<L>::operator()(line_id_t comp_id) const + { + return data_->line_to_link_(comp_id); + } + + + template <typename L> + const mln::util::array<line_id_t>& + line_links<L>::line_to_link() const + { + return data_->line_to_link_; + } + + + template <typename L> + void + line_links<L>::init() + { + for (unsigned i = 0; i < nelements(); ++i) + if (! data_->lines_(i).is_valid() + || data_->lines_(i).is_hidden() + || data_->lines_(i).type() != line::Text) + { + data_->line_to_link_(i) = 0; + } + else + { + data_->line_to_link_(i) = i; + } + } + + template <typename L> + inline + line_links<L> + line_links<L>::duplicate() const + { + line_links<L> output; + output.data_ = new data_t(); + + *(output.data_.ptr_) = *(data_.ptr_); + return output; + } + + + template <typename L> + std::ostream& + operator<<(std::ostream& ostr, const line_links<L>& links) + { + ostr << "line_links["; + + for_all_links(l, links) + ostr << l << "->" << links.line_to_link()[l] << ", "; + + ostr << "]"; + + return ostr; + } + + +# endif // ! MLN_INCLUDE_ONLY + + +} // end of namespace scribo + + +#endif // ! 
SCRIBO_CORE_LINE_LINKS_HH diff --git a/scribo/scribo/core/macros.hh b/scribo/scribo/core/macros.hh index f7de5b6..1060358 100644 --- a/scribo/scribo/core/macros.hh +++ b/scribo/scribo/core/macros.hh @@ -50,6 +50,9 @@ # define for_all_lines(E, S) \ for_all_comps(E, S) +# define for_all_paragraphs(E, S) \ + for_all_comps(E, S) + # define for_all_line_comps(E, S) \ for_all_elements(E, S) diff --git a/scribo/scribo/core/paragraph_info.hh b/scribo/scribo/core/paragraph_info.hh new file mode 100644 index 0000000..a8c623a --- /dev/null +++ b/scribo/scribo/core/paragraph_info.hh @@ -0,0 +1,120 @@ +// Copyright (C) 2011 EPITA Research and Development Laboratory (LRDE) +// +// This file is part of Olena. +// +// Olena is free software: you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free +// Software Foundation, version 2 of the License. +// +// Olena is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Olena. If not, see <http://www.gnu.org/licenses/>. +// +// As a special exception, you may use this file as part of a free +// software project without restriction. Specifically, if other files +// instantiate templates or use macros or inline functions from this +// file, or you compile this file and link it with other files to produce +// an executable, this file does not by itself cause the resulting +// executable to be covered by the GNU General Public License. This +// exception does not however invalidate any other reasons why the +// executable file might be covered by the GNU General Public License. 
+ +#ifndef SCRIBO_CORE_PARAGRAPH_INFO_HH +# define SCRIBO_CORE_PARAGRAPH_INFO_HH + +# include <scribo/core/line_info.hh> +# include <scribo/core/line_links.hh> +# include <mln/util/array.hh> +# include <mln/accu/shape/bbox.hh> + +namespace scribo +{ + + /*! \brief Paragraph structure information. + + */ + template <typename L> + class paragraph_info + { + public: + paragraph_info(); + paragraph_info(const line_links<L>& llinks); + + void add_line(const line_info<L>& line); + + const mln::box2d& bbox() const; + + const line_info<L>& line(line_id_t id) const; + + const mln::util::array<line_id_t>& line_ids() const; + + bool is_valid() const; + + private: + mln::util::array<line_id_t> line_ids_; + mln::accu::shape::bbox<mln_site(L)> bbox_; + line_links<L> llinks_; + }; + + +# ifndef MLN_INCLUDE_ONLY + + + template <typename L> + paragraph_info<L>::paragraph_info() + { + } + + template <typename L> + paragraph_info<L>::paragraph_info(const line_links<L>& llinks) + : llinks_(llinks) + { + } + + template <typename L> + void + paragraph_info<L>::add_line(const line_info<L>& line) + { + line_ids_.append(line.id()); + bbox_.take(line.bbox()); + } + + template <typename L> + const mln::box2d& + paragraph_info<L>::bbox() const + { + return bbox_.to_result(); + } + + template <typename L> + const line_info<L>& + paragraph_info<L>::line(line_id_t id) const + { + mln_precondition(is_valid()); + return llinks_.lines()(id); + } + + template <typename L> + const mln::util::array<line_id_t>& + paragraph_info<L>::line_ids() const + { + return line_ids_; + } + + template <typename L> + bool + paragraph_info<L>::is_valid() const + { + return llinks_.is_valid(); + } + + +# endif // ! MLN_INCLUDE_ONLY + +} // end of namespace scribo + +#endif // ! 
SCRIBO_CORE_PARAGRAPH_INFO_HH diff --git a/scribo/scribo/core/paragraph_set.hh b/scribo/scribo/core/paragraph_set.hh new file mode 100644 index 0000000..afb59c5 --- /dev/null +++ b/scribo/scribo/core/paragraph_set.hh @@ -0,0 +1,163 @@ +// Copyright (C) 2011 EPITA Research and Development Laboratory (LRDE) +// +// This file is part of Olena. +// +// Olena is free software: you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free +// Software Foundation, version 2 of the License. +// +// Olena is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Olena. If not, see <http://www.gnu.org/licenses/>. +// +// As a special exception, you may use this file as part of a free +// software project without restriction. Specifically, if other files +// instantiate templates or use macros or inline functions from this +// file, or you compile this file and link it with other files to produce +// an executable, this file does not by itself cause the resulting +// executable to be covered by the GNU General Public License. This +// exception does not however invalidate any other reasons why the +// executable file might be covered by the GNU General Public License. + +#ifndef SCRIBO_CORE_PARAGRAPH_SET_HH +# define SCRIBO_CORE_PARAGRAPH_SET_HH + +# include <mln/util/array.hh> +# include <mln/make/relabelfun.hh> +# include <mln/value/int_u16.hh> +# include <scribo/core/line_links.hh> +# include <scribo/core/line_set.hh> +# include <scribo/core/paragraph_info.hh> + +namespace scribo +{ + + /*! \brief Paragraph container. + + Paragraph ids start from 1. 
+ + */ + template <typename L> + class paragraph_set + { + public: + paragraph_set(); + paragraph_set(const line_links<L>& llinks, unsigned npars); + + unsigned nelements() const; + + paragraph_info<L>& operator()(unsigned i); + const paragraph_info<L>& operator()(unsigned i) const; + + + private: + mln::util::array<paragraph_info<L> > pars_; + }; + + + + namespace make + { + + /// \brief Construct a paragraph set from line links information. + template <typename L> + scribo::paragraph_set<L> + paragraph(const line_links<L>& llinks); + + } // end of namespace scribo::make + + +# ifndef MLN_INCLUDE_ONLY + + template <typename L> + paragraph_set<L>::paragraph_set() + { + } + + template <typename L> + paragraph_set<L>::paragraph_set(const line_links<L>& llinks, unsigned npars) + : pars_(npars + 1, paragraph_info<L>(llinks)) + { + } + + template <typename L> + unsigned + paragraph_set<L>::nelements() const + { + return pars_.nelements() - 1; + } + + template <typename L> + paragraph_info<L>& + paragraph_set<L>::operator()(unsigned i) + { + return pars_[i]; + } + + template <typename L> + const paragraph_info<L>& + paragraph_set<L>::operator()(unsigned i) const + { + return pars_[i]; + } + + + + namespace make + { + + namespace internal + { + + template <typename L> + inline + unsigned + find_root(line_links<L>& parent, unsigned x) + { + if (parent(x) == x) + return x; + else + return parent(x) = find_root(parent, parent(x)); + } + + } // end of namespace scribo::make::internal + + + template <typename L> + scribo::paragraph_set<L> + paragraph(const line_links<L>& llinks) + { + line_links<L> links = llinks.duplicate(); + + for (unsigned i = 1; i < links.nelements(); ++i) + links(i) = internal::find_root(links, i); + + unsigned npars; + mln::fun::i2v::array<unsigned> + par_ids = mln::make::relabelfun(links.line_to_link(), + links.nelements() - 1, npars); + paragraph_set<L> parset(links, npars); + + const scribo::line_set<L>& lines = links.lines(); + for_all_links(l, 
links) + if (links(l)) + { + value::int_u16 par_id = par_ids(l); + parset(par_id).add_line(lines(l)); + } + + return parset; + } + + } // end of namespace scribo::make + + +# endif // ! MLN_INCLUDE_ONLY + +} // end of namespace scribo + +#endif // ! SCRIBO_CORE_PARAGRAPH_SET_HH diff --git a/scribo/scribo/preprocessing/crop.hh b/scribo/scribo/filter/line_links_x_height.hh similarity index 53% copy from scribo/scribo/preprocessing/crop.hh copy to scribo/scribo/filter/line_links_x_height.hh index c289f86..47f214e 100644 --- a/scribo/scribo/preprocessing/crop.hh +++ b/scribo/scribo/filter/line_links_x_height.hh @@ -1,4 +1,4 @@ -// Copyright (C) 2010 EPITA Research and Development Laboratory (LRDE) +// Copyright (C) 2011 EPITA Research and Development Laboratory (LRDE) // // This file is part of Olena. // @@ -23,62 +23,63 @@ // exception does not however invalidate any other reasons why the // executable file might be covered by the GNU General Public License. -#ifndef SCRIBO_PREPROCESSING_CROP_HH -# define SCRIBO_PREPROCESSING_CROP_HH - -# include <mln/core/concept/image.hh> -# include <mln/data/paste.hh> +#ifndef SCRIBO_TEXT_LINE_LINKS_X_HEIGHT_HH +# define SCRIBO_TEXT_LINE_LINKS_X_HEIGHT_HH /// \file /// -/// \brief Crop an image preserving the localization. +/// \brief Filter line links according to character x height. + +# include <scribo/core/macros.hh> +# include <scribo/core/line_set.hh> namespace scribo { - namespace preprocessing + namespace filter { - using namespace mln; - - - /*! \brief crop an image preserving the localization. - - \param[in] input An image. - \param[in] domain A region of interest. + template <typename L> + line_links<L> + line_links_x_height(const line_links<L>& links); - \return An image defined on the domain \p domain with the - corresponding data copied from \p input. 
- */ - template <typename I> - mln_concrete(I) - crop(const Image<I>& input, const mln_box(I)& domain); +# ifndef MLN_INCLUDE_ONLY + template <typename L> + line_links<L> + line_links_x_height(const line_links<L>& links) + { + line_links<L> output = links.duplicate(); -# ifndef MLN_INCLUDE_ONLY + float min_x, max_x;//, min_a, max_a; + const line_set<L>& lines = links.lines(); + for_all_links(l, links) + if (links(l)) + { + max_x = lines(l).x_height(); + // max_a = lines(l).a_height; + min_x = lines(output(l)).x_height(); + // min_a = lines(output(l)).a_height; + if (lines(l).x_height() < lines(output(l)).x_height()) + std::swap(max_x, min_x); - template <typename I> - mln_concrete(I) - crop(const Image<I>& input, const mln_box(I)& domain) - { - trace::entering("scribo::preprocessing::crop"); - mln_assertion(exact(input).is_valid()); + // if (lines(l).a_height < lines(output(l)).a_height) + // std::swap(max_a, min_a); - mln_concrete(I) output(domain); - data::paste(input | domain, output); + if (min_x / max_x < 0.72f)// || min_a / max_a < 0.80f) + output(l) = l; + } - trace::exiting("scribo::preprocessing::crop"); return output; } - # endif // ! MLN_INCLUDE_ONLY - } // end of namespace scribo::preprocessing + } // end of namespace scribo::filter } // end of namespace scribo -#endif // ! SCRIBO_PREPROCESSING_CROP_HH +#endif // ! SCRIBO_TEXT_LINE_LINKS_X_HEIGHT_HH diff --git a/scribo/scribo/text/link_lines.hh b/scribo/scribo/text/link_lines.hh new file mode 100644 index 0000000..6c985f5 --- /dev/null +++ b/scribo/scribo/text/link_lines.hh @@ -0,0 +1,165 @@ +// Copyright (C) 2011 EPITA Research and Development Laboratory (LRDE) +// +// This file is part of Olena. +// +// Olena is free software: you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free +// Software Foundation, version 2 of the License. 
+// +// Olena is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Olena. If not, see <http://www.gnu.org/licenses/>. +// +// As a special exception, you may use this file as part of a free +// software project without restriction. Specifically, if other files +// instantiate templates or use macros or inline functions from this +// file, or you compile this file and link it with other files to produce +// an executable, this file does not by itself cause the resulting +// executable to be covered by the GNU General Public License. This +// exception does not however invalidate any other reasons why the +// executable file might be covered by the GNU General Public License. + +#ifndef SCRIBO_TEXT_LINK_LINES_HH +# define SCRIBO_TEXT_LINK_LINES_HH + +/// \file +/// +/// \brief Vertically link text lines to prepare paragraph +/// construction. + +#include <mln/value/int_u16.hh> +#include <mln/data/fill.hh> +#include <mln/geom/rotate.hh> +#include <mln/geom/ncols.hh> +#include <mln/draw/box_plain.hh> +#include <mln/util/array.hh> + +#include <scribo/core/line_links.hh> + + +namespace scribo +{ + + namespace text + { + + using namespace mln; + + + template <typename L> + line_links<L> + link_lines(const line_set<L>& lines); + + +# ifndef MLN_INCLUDE_ONLY + + + template <typename L> + line_links<L> + link_lines(const line_set<L>& lines) + { + typedef value::int_u16 V; + + const mln_concrete(L)& lbl = lines.components().labeled_image(); + + // Rotate the domain in order to browse the image efficiently. + mln_ch_value(L,V) blocks(geom::rotate(lbl.domain(), + -90, lbl.domain().pcenter())); + mln::data::fill(blocks, literal::zero); + + // Construct a list of rotated bboxes for each line. 
+ util::array<mln_box(L)> rbbox(1); + rbbox.reserve(lines.nelements()); + + for_all_lines(l, lines) + { + if (! lines(l).is_valid() || lines(l).is_hidden() || lines(l).type() != line::Text) + { + rbbox.resize(rbbox.nelements() + 1); + continue; + } + + mln_box(L) b = mln::geom::rotate(lines(l).bbox(), -90, lbl.domain().pcenter()); + rbbox.append(b); + mln::draw::box_plain(blocks, b, l); + } + + // Looking for neighbor lines. + line_links<L> links(lines); + links.init(); + + for_all_lines(l, lines) + { + if (! lines(l).is_valid() || lines(l).is_hidden() || lines(l).type() != line::Text) + continue; + + int dmax = 1.5 * lines(l).x_height(); // FIXME: better ratio? + mln_site(L) c = rbbox(l).pcenter(); + + int + midcol = (rbbox(l).pmax().col() + - rbbox(l).pmin().col()) / 2; + + int + nleftima = std::abs(c.col() - blocks.domain().pmin().col()), // abs, useful? + nleft = std::min(nleftima, midcol + dmax); + + // Left + { + const V + *p = &blocks(c), + *pstop = p - nleft; + + for (; p != pstop; --p) + { + if (*p != literal::zero // Not the background + && *p != l // Not the current component + && links(*p) != l) // No loops + { + links(l) = *p; + break; + } + } + } + + // Right + { + int + nrightima = geom::ncols(blocks) - c.col() + blocks.domain().pmin().col(), + nright = std::min(nrightima, midcol + dmax); + + const V + *p = &blocks(c), + *pstop = p + nright - 1; + + for (; p != pstop; ++p) + { + if (*p != literal::zero // Not the background + && *p != l // Not the current component + && links(l) != *p // No loops + && links(*p) == *p) // Not already connected + { + links(*p) = l; + break; + } + } + } + } + + return links; + } + + +# endif // ! MLN_INCLUDE_ONLY + + + } // end of namespace scribo::text + +} // end of namespace scribo + + +#endif // ! SCRIBO_TEXT_LINK_LINES_HH -- 1.5.6.5
participants (1)
-
Guillaume Lazzara