
* core/component_info.hh, * core/component_set.hh: Add new methods and an output operator. * core/line_info.hh: Add new methods and improve stats computation. * core/line_set.hh: Add new methods. * core/macros.hh: Add more specific macros. * core/tag/component.hh, * core/tag/line.hh: Add output operators. --- scribo/ChangeLog | 16 ++ scribo/core/component_info.hh | 43 ++++ scribo/core/component_set.hh | 152 +++++++++++- scribo/core/line_info.hh | 517 ++++++++++++++++++++++++++++++++++------- scribo/core/line_set.hh | 213 ++++++++--------- scribo/core/macros.hh | 12 +- scribo/core/tag/component.hh | 63 +++++- scribo/core/tag/line.hh | 29 ++- 8 files changed, 823 insertions(+), 222 deletions(-) diff --git a/scribo/ChangeLog b/scribo/ChangeLog index 2c2f0c2..a24406b 100644 --- a/scribo/ChangeLog +++ b/scribo/ChangeLog @@ -1,3 +1,19 @@ +2010-03-11 Guillaume Lazzara <z@lrde.epita.fr> + + Improve core classes in Scribo. + + * core/component_info.hh, + * core/component_set.hh: Add new methods and an output operator. + + * core/line_info.hh: Add new methods and improve stats computation. + + * core/line_set.hh: Add new methods. + + * core/macros.hh: Add more specific macros. + + * core/tag/component.hh, + * core/tag/line.hh: Add output operators. + 2010-03-09 Guillaume Lazzara <z@lrde.epita.fr> Share internal data in groups and links structures. diff --git a/scribo/core/component_info.hh b/scribo/core/component_info.hh index 1e4aaf5..4ee438c 100644 --- a/scribo/core/component_info.hh +++ b/scribo/core/component_info.hh @@ -58,10 +58,17 @@ namespace scribo component_id_t id() const; const mln::box2d& bbox() const; const mln::point2d& mass_center() const; + + // The number of pixels in this component. unsigned card() const; + component::Tag tag() const; void update_tag(component::Tag tag); + component::Type type() const; + void update_type(component::Type type); + + // The line it is attached to. 0 means an invalid line. 
line_id_t line_id() const; bool is_valid() const; @@ -71,12 +78,19 @@ namespace scribo mln::box2d bbox_; mln::point2d mass_center_; unsigned card_; + component::Tag tag_; + component::Type type_; line_id_t line_id_; }; + + std::ostream& + operator<<(std::ostream& ostr, const component_info& info); + + # ifndef MLN_INCLUDE_ONLY @@ -138,6 +152,21 @@ namespace scribo tag_ = tag; } + + component::Type + component_info::type() const + { + return type_; + } + + + void + component_info::update_type(component::Type type) + { + type_ = type; + } + + component_info::line_id_t component_info::line_id() const { @@ -151,6 +180,20 @@ namespace scribo } + std::ostream& + operator<<(std::ostream& ostr, const component_info& info) + { + return ostr << "component_info(" + << "id=" << info.id() + << ", bbox=" << info.bbox() + << ", mass_center=" << info.mass_center() + << ", card=" << info.card() + << ", tag=" << info.tag() + << ", line_id=" << info.line_id() + << ")" << std::endl; + } + + # endif // ! MLN_INCLUDE_ONLY diff --git a/scribo/core/component_set.hh b/scribo/core/component_set.hh index 2f9d10e..14cdc4c 100644 --- a/scribo/core/component_set.hh +++ b/scribo/core/component_set.hh @@ -1,4 +1,5 @@ -// Copyright (C) 2009 EPITA Research and Development Laboratory (LRDE) +// Copyright (C) 2009, 2010 EPITA Research and Development Laboratory +// (LRDE) // // This file is part of Olena. // @@ -30,6 +31,11 @@ /// /// \brief Definition of a component set. 
+# include <mln/core/concept/site_set.hh> +# include <mln/core/concept/function.hh> + +# include <mln/data/fill.hh> + # include <mln/util/array.hh> # include <mln/accu/pair.hh> @@ -40,6 +46,10 @@ # include <mln/convert/from_to.hh> +# include <mln/core/image/dmorph/image_if.hh> +# include <mln/pw/all.hh> + +# include <mln/core/routine/duplicate.hh> # include <scribo/core/macros.hh> # include <scribo/core/component_info.hh> @@ -86,6 +96,7 @@ namespace scribo mln_value(L) ncomps_; mln::util::array<scribo::component_info> infos_; + mln_ch_value(L, bool) separators_; }; } // end of namespace scribo::internal @@ -131,13 +142,6 @@ namespace scribo /// Return component information for a given component id \p id. component_info& info(const mln_value(L)& id); -// /// Return component information for a given component id \p id. -// component_info& operator()(const mln_value(L)& id); - -// /// Return component information for a given component id \p id. -// const component_info& operator()(const mln_value(L)& id) const; - - /// Return component information for a given component id \p id. component_info& operator()(const component_id_t& id); @@ -155,12 +159,38 @@ namespace scribo /// Return the underlying labeled image const L& labeled_image() const; + /// Is this component set valid? + bool is_valid() const; + + + /// Separators components related routines. + /// @{ + + /// Return true if an image of separator exists. + bool has_separators() const; + + /// Add separators in the underlying labeled image. + void add_separators(const mln_ch_value(L, bool)& ima); + + /// Return the Boolean image of separators. + const mln_ch_value(L, bool)& separators() const; + + /// Remove any existing separators. + void clear_separators(); + + /// @} + + + /// Internal methods /// @{ /// Return all the component infos. const mln::util::array<scribo::component_info>& infos_() const; + /// Unique set Id. 
+ unsigned id_() const; + /// @} private: @@ -171,6 +201,18 @@ namespace scribo }; + template <typename L> + bool + operator==(const component_set<L>& lhs, const component_set<L>& rhs); + + + template <typename L> + std::ostream& + operator<<(std::ostream& ostr, const component_set<L>& info); + + + + # ifndef MLN_INCLUDE_ONLY @@ -195,6 +237,9 @@ namespace scribo const mln_value(L)& ncomps) : ima_(ima), ncomps_(ncomps) { + initialize(separators_, ima); // FIXME: do we really want that? + mln::data::fill(separators_, false); + typedef mln::accu::shape::bbox<mln_site(L)> bbox_accu_t; typedef mln::accu::center<mln_site(L)> center_accu_t; typedef mln::accu::pair<bbox_accu_t, center_accu_t> pair_accu_t; @@ -214,6 +259,9 @@ namespace scribo const mln::util::array<pair_accu_t>& attribs) : ima_(ima), ncomps_(ncomps) { + initialize(separators_, ima); // FIXME: do we really want that? + mln::data::fill(separators_, false); + fill_infos(attribs); } @@ -224,6 +272,9 @@ namespace scribo const mln::util::array<pair_data_t>& attribs) : ima_(ima), ncomps_(ncomps) { + initialize(separators_, ima); // FIXME: do we really want that? + mln::data::fill(separators_, false); + fill_infos(attribs); } @@ -234,6 +285,8 @@ namespace scribo const mln::util::array<scribo::component_info>& infos) : ima_(ima), ncomps_(ncomps), infos_(infos) { + initialize(separators_, ima); // FIXME: do we really want that? + mln::data::fill(separators_, false); } @@ -247,7 +300,7 @@ namespace scribo infos_.reserve(static_cast<unsigned>(ncomps_) + 1); infos_.append(component_info()); // Component 0, i.e. the background. - for_all_components(i, attribs) + for_all_comp_data(i, attribs) { component_info info(i, attribs[i].first(), attribs[i].second(), attribs[i].second_accu().nsites()); @@ -265,7 +318,7 @@ namespace scribo infos_.reserve(static_cast<unsigned>(ncomps_) + 1); infos_.append(component_info()); // Component 0, i.e. the background. 
- for_all_components(i, attribs) + for_all_comp_data(i, attribs) { component_info info(i, attribs[i].first, attribs[i].second.first, attribs[i].second.second); @@ -379,7 +432,7 @@ namespace scribo { const F& f = exact(f_); - for_all_components(i, data_->infos_) + for_all_comp_data(i, data_->infos_) if (!f(i)) data_->infos_[i].update_tag(tag); } @@ -405,6 +458,61 @@ namespace scribo return this->data_->ima_; } + + template <typename L> + inline + bool + component_set<L>::is_valid() const + { + return this->data_->ima_.is_valid(); + } + + + template <typename L> + inline + unsigned + component_set<L>::id_() const + { + return (unsigned)data_.ptr_; + } + + + template <typename L> + inline + bool + component_set<L>::has_separators() const + { + return this->data_->separators_.is_valid(); + } + + + template <typename L> + inline + void + component_set<L>::add_separators(const mln_ch_value(L, bool)& ima) + { + this->data_->separators_ = ima; + } + + + template <typename L> + inline + const mln_ch_value(L, bool)& + component_set<L>::separators() const + { + return this->data_->separators_; + } + + + template <typename L> + inline + void + component_set<L>::clear_separators() + { + this->data_->separators_.destroy(); + } + + template <typename L> inline const mln::util::array<scribo::component_info>& @@ -419,12 +527,32 @@ namespace scribo component_set<L>::init_(const component_set<L>& set) { data_ = new internal::component_set_data<L>(); - data_->ima_ = set.labeled_image(); + data_->ima_ = mln::duplicate(set.labeled_image()); data_->ncomps_ = set.nelements(); data_->infos_ = set.infos_(); + data_->separators_ = set.separators(); } + template <typename L> + bool + operator==(const component_set<L>& lhs, const component_set<L>& rhs) + { + return lhs.id_() == rhs.id_(); + } + + template <typename L> + std::ostream& + operator<<(std::ostream& ostr, const component_set<L>& info) + { + ostr << "component_set[" << std::endl; + for_all_comps(i, info) + ostr << info(i); + ostr << 
"]" << std::endl; + + return ostr; + } + # endif // ! MLN_INCLUDE_ONLY } // end of namespace scribo diff --git a/scribo/core/line_info.hh b/scribo/core/line_info.hh index 42438b0..0445f85 100644 --- a/scribo/core/line_info.hh +++ b/scribo/core/line_info.hh @@ -1,4 +1,5 @@ -// Copyright (C) 2009 EPITA Research and Development Laboratory (LRDE) +// Copyright (C) 2009, 2010 EPITA Research and Development Laboratory +// (LRDE) // // This file is part of Olena. // @@ -29,6 +30,9 @@ /// \file /// /// \brief Line information data structure. +/// +/// \fixme The meanline should not be stored! The user can deduce it +/// from the x_height and the baseline. # include <mln/core/alias/box2d.hh> @@ -40,13 +44,15 @@ # include <scribo/core/tag/component.hh> # include <scribo/core/tag/line.hh> -// # include <scribo/filter/object_links_bottom_aligned.hh> -// # include <scribo/filter/object_links_top_aligned.hh> - +# include <scribo/core/line_set.hh> +# include <scribo/core/component_set.hh> namespace scribo { + // Forward declarations. + template <typename L> class line_set; + typedef mln::util::object_id<scribo::LineId, unsigned> line_id_t; template <typename L> @@ -56,18 +62,26 @@ namespace scribo typedef mln::util::object_id<scribo::LineId, unsigned> line_id_t; public: + + /// Constructors + /// @{ + line_info(); - line_info(const line_id_t& id, - const mln::box2d& bbox, - const mln::util::array<component_id_t>& comps, - unsigned absolute_median, - unsigned absolute_baseline, - unsigned char_space, - unsigned char_width); - line_info(const object_links<L>& links, + + line_info(const line_set<L>& holder, const line_id_t& id, const mln::util::array<component_id_t>& comps); + /// The line id of the target instance is preserved if it is valid. + line_info(const line_info<L>& other); + /// @} + + /// The line id of the target instance is preserved if it is valid. 
+ line_info<L>& operator=(const line_info<L>& other); + + /// If the line info is valid, the line id never changes for a + /// given instance. + // line_id_t id() const; line::Tag tag() const; @@ -75,12 +89,21 @@ namespace scribo const mln::box2d& bbox() const; + /// Extended bounding box. + /// The width is extended with char_width() + char_space() on each side. + /// The height is adjusted to max(a_height, - d_height) on each side. + // + const mln::box2d& ebbox() const; + const mln::util::array<component_id_t>& components() const; unsigned card() const; - unsigned baseline() const; - unsigned median() const; - int x_height() const; + int baseline() const; + int meanline() const; + int ascent() const; + int descent() const; + + unsigned x_height() const; int d_height() const; int a_height() const; @@ -90,7 +113,10 @@ namespace scribo unsigned word_space() const; line::ReadingDirection reading_direction() const; + line::Type type() const; + void update_type(line::Type type); + bool reverse_video() const; float orientation() const; @@ -101,6 +127,15 @@ namespace scribo bool is_valid() const; + /// Hidden status. + /// + /// When a line is hidden, it should not be used in routines + /// computing data over lines. + /// + /// @{ + bool hidden() const; + void set_hidden(bool b); + /// @} /// Merge related routines. /// @{ @@ -109,15 +144,18 @@ namespace scribo /// /// After this merge, the line is tagged with /// line::Needs_Precise_Stats_Update. + /// + /// The \p other line is tagged with line::Merged and if \p hide + /// is set to 'True', it is set as hidden as well. // - void fast_merge(line_info<L>& other); + void fast_merge(line_info<L>& other, bool hide = true); /// This merge updates the component list and recompute from /// scratch statistics, bounding box and other line attributes. /// /// After this merge, the line is tagged with line::None. 
// - void precise_merge(line_info<L>& other); + void precise_merge(line_info<L>& other, bool hide = true); /// @} @@ -126,21 +164,49 @@ namespace scribo void force_stats_update(); - private: + /// Returns the line set holding this element. + const line_set<L>& holder() const; + + /// Returns the delta used to compute the extended bbox. + int delta_of_line() const; + + private: // Members + void copy_data(const line_info<L>& other); + + /// Enlarge the width of a given bbox \p b with a \p delta. + mln::box2d enlarge(const mln::box2d& b, int delta) const; + + /// Update bbox and ebbox_ attributes. + void update_bbox_and_ebox(line_info<L>& other); + + /// Update the extended bbox. + void update_ebbox(); + + mln::box2d merged_ebbox(const scribo::line_info<L>& info_l, + const scribo::line_info<L>& info); + + void update_components_type(component::Type type); + + private: // Attributes + // WARNING: NEVER FORGET TO UPDATE COPY CONSTRUCTOR REDEFINITION!!!! + line_id_t id_; + bool hidden_; line::Tag tag_; mln::box2d bbox_; + mln::box2d ebbox_; mln::util::array<component_id_t> components_; - // Value relative to the line bbox. - unsigned baseline_; - unsigned median_; + // Values relative to the line bbox. + int baseline_; + int meanline_; // Values relative to the baseline. - int x_height_; + unsigned x_height_; int d_height_; int a_height_; + // WARNING: NEVER FORGET TO UPDATE COPY CONSTRUCTOR REDEFINITION!!!! // Character related stats. unsigned char_space_; @@ -166,8 +232,10 @@ namespace scribo bool indented_; - // Related object links information. - const object_links<L>* links_; + // Line set holding this element. + line_set<L> holder_; + + // WARNING: NEVER FORGET TO UPDATE COPY CONSTRUCTOR REDEFINITION!!!! 
}; @@ -181,66 +249,111 @@ namespace scribo template <typename L> line_info<L>::line_info() - : id_(0) + : id_(0), hidden_(false) { } + template <typename L> + inline + void + line_info<L>::copy_data(const line_info<L>& other) + { + // Id MUST NOT change except if this instance has no id. + if (! is_valid()) + { + id_ = other.id(); + hidden_ = other.hidden_; + } + + tag_ = other.tag(); + bbox_ = other.bbox(); + ebbox_ = other.ebbox(); + components_ = other.components(); + + baseline_ = other.baseline(); + meanline_ = other.meanline(); + + x_height_ = other.x_height(); + d_height_ = other.d_height(); + a_height_ = other.a_height(); + + char_space_ = other.char_space(); + char_width_ = other.char_width(); + + word_space_ = other.word_space(); + + reading_direction_ = other.reading_direction(); + + type_ = other.type(); + + reverse_video_ = other.reverse_video(); + + orientation_ = other.orientation(); + + reading_orientation_ = other.reading_orientation(); + + indented_ = other.indented(); + + holder_ = other.holder(); + } + + template <typename L> + inline + line_info<L>::line_info(const line_info<L>& other) + : id_(0), hidden_(false) + { + copy_data(other); + } + /*! Exemple: - --------------------- - | | | |x| | | | | | | ----> a_height = 4 - --------------------- - | | | |x| | | | | | | - --------------------- - | |x|x|x| | |x|x|x| | ----> Median = 2 ^ - --------------------- | - | |x| |x| | |x| |x| | | x_height = 3 - --------------------- | - | |x|x|x| | |x|x|x| | ----> Baseline = 4 v - --------------------- - | | | | | | |x| | | | - --------------------- - | | | | | | |x| | | | ----> d_height = -2 - --------------------- - - All the metrics are computed relatively to the Baseline. - - The baseline is defined as an absolute row index. 
+ \verbatim + + 0 1 2 3 4 5 6 7 8 9 + --------------------- + 0 | | | |x| | | | | | | ----> a_height = 4 + --------------------- + 1 | | | |x| | | | | | | + --------------------- + 2 | |x|x|x| | |x|x|x| | ----> Meanline = 2 ^ + --------------------- | + 3 | |x| |x| | |x| |x| | | x_height = 3 + --------------------- | + 4 | |x|x|x| | |x|x|x| | ----> Baseline = 4 v + --------------------- + 5 | | | | | | |x| | | | + --------------------- + 6 | | | | | | |x| | | | ----> d_height = -2 + --------------------- + + \endverbatim + + The baseline, the meanline, the ascent and the descent are defined + as an absolute row index. + + All other metrics, such as x_height, are computed relatively to + the Baseline. */ template <typename L> - line_info<L>::line_info(const object_links<L>& links, + line_info<L>::line_info(const line_set<L>& holder, const line_id_t& id, const mln::util::array<component_id_t>& comps) - : id_(id), tag_(line::None), components_(comps), links_(&links) + : id_(id), hidden_(false), tag_(line::None), components_(comps), + type_(line::Undefined), holder_(holder) { - force_stats_update(); - - -// typedef mln_site(L) P; -// const component_set<L>& comp_set = links_->component_set_(); -// mln::accu::shape::bbox<P> bbox; -// for_all_elements(i, components_) -// { -// unsigned c = components_(i); -// const box2d& bb = comp_set(c).bbox(); -// // Bounding box. -// bbox.take(bb); -// } -// bbox_ = bbox.to_result(); - - + force_stats_update(); - // FIXME: set valid information for these attributes. + // FIXME: set valid information for these attributes in + // force_stats_update. 
word_space_ = 0; reading_direction_ = line::LeftToRight; - type_ = line::Paragraph; reverse_video_ = false; orientation_ = 0.; @@ -251,12 +364,22 @@ namespace scribo template <typename L> + inline + line_info<L>& + line_info<L>::operator=(const line_info<L>& other) + { + copy_data(other); + return *this; + } + + template <typename L> typename line_info<L>::line_id_t line_info<L>::id() const { return id_; } + template <typename L> line::Tag line_info<L>::tag() const @@ -280,6 +403,13 @@ namespace scribo return bbox_; } + template <typename L> + const mln::box2d& + line_info<L>::ebbox() const + { + return ebbox_; + } + template <typename L> const mln::util::array<typename line_info<L>::component_id_t>& @@ -297,7 +427,7 @@ namespace scribo template <typename L> - unsigned + int line_info<L>::baseline() const { return baseline_; @@ -305,14 +435,31 @@ namespace scribo template <typename L> - unsigned - line_info<L>::median() const + int + line_info<L>::meanline() const { - return median_; + return meanline_; } + template <typename L> int + line_info<L>::ascent() const + { + return baseline_ - a_height() + 1; + } + + + template <typename L> + int + line_info<L>::descent() const + { + return baseline_ - d_height() + 1; + } + + + template <typename L> + unsigned line_info<L>::x_height() const { return x_height_; @@ -375,6 +522,32 @@ namespace scribo template <typename L> + void + line_info<L>::update_components_type(component::Type type) + { + for_all_elements(i, components_) + { + unsigned c = components_[i]; + holder_.components_()(c).update_type(type); + } + } + + + template <typename L> + void + line_info<L>::update_type(line::Type type) + { + type_ = type; + + // Some line types may involve updating components type as well. 
+ if (type == line::Punctuation) + update_components_type(component::Punctuation); + else if (type == line::Text) + update_components_type(component::Character); + } + + + template <typename L> bool line_info<L>::reverse_video() const { @@ -415,22 +588,170 @@ namespace scribo template <typename L> + bool + line_info<L>::hidden() const + { + return hidden_; + } + + + template <typename L> void - line_info<L>::fast_merge(line_info<L>& other) + line_info<L>::set_hidden(bool b) + { + hidden_ = b; + } + + + template <typename L> + inline + int + line_info<L>::delta_of_line() const + { + return char_width() + 2 * char_space(); + // FIXME: choose between: + // not enough: char_width + char_space + // too much: 2 * char_width + // looks good: char_width + 2 * char_space + } + + + template <typename L> + mln::box2d + line_info<L>::enlarge(const mln::box2d& b, int delta) const + { + mln::box2d b_(mln::point2d(b.pmin().row(), b.pmin().col() - delta), + mln::point2d(b.pmax().row(), b.pmax().col() + delta)); + return b_; + } + + + template <typename L> + void + line_info<L>::update_ebbox() + { + int A = a_height_ - x_height_; + int D = - d_height_; + if (A <= 2 && D > 2) + A = D; + if (D <= 2 && A > 2) + D = A; + + int delta = delta_of_line(); + + ebbox_ = mln::make::box2d(meanline_ - A, bbox().pmin().col() - delta, + baseline_ + D, bbox().pmax().col() + delta); + + ebbox_.crop_wrt(holder_.components().labeled_image().domain()); + } + + + template <typename L> + mln::box2d + line_info<L>::merged_ebbox(const scribo::line_info<L>& info_l, + const scribo::line_info<L>& info) + { + // line data + int + baseline_l = info_l.baseline(), + d_height = info_l.d_height(); + unsigned + a_height = info_l.a_height(), + x_height = info_l.x_height(); + int A_l = a_height - x_height; + int D_l = - d_height; + if (A_l <= 2 && D_l > 2) + A_l = D_l; + if (D_l <= 2 && A_l > 2) + D_l = A_l; + unsigned delta_l = info_l.delta_of_line(); + int meanline_l = info_l.meanline(); + + // non-line data + 
unsigned delta_ = info.delta_of_line(); + + mln::box2d b = mln::make::box2d(// pmin + meanline_l - A_l, + std::min(info_l.bbox().pmin().col(), info.bbox().pmin().col()) - std::max(delta_l, delta_), + // pmax + baseline_l + D_l, + std::max(info_l.bbox().pmax().col(), info.bbox().pmax().col()) + std::max(delta_l, delta_)); + + return b; + } + + + template <typename L> + void + line_info<L>::update_bbox_and_ebox(line_info<L>& other) + { + // Merging ebboxes depending on the type of the line. + + if (type() == line::Text) // /this/ IS a text line + { + if (other.type() == line::Text) // /other/ IS a text line. + { + // Adjusting ebboxes with the highest delta and merging ebboxes. + int d_delta = other.delta_of_line() - this->delta_of_line(); + if (d_delta < 0) // other.delta_of_line() < this->delta_of_line() + ebbox_.merge(enlarge(other.ebbox(), - d_delta)); + else + { + mln::box2d b = ebbox_; + ebbox_ = other.bbox(); + ebbox_.merge(enlarge(b, d_delta)); + } + + ebbox_.crop_wrt(holder_.components().labeled_image().domain()); + } + else // /other/ IS NOT a text line. + { + ebbox_.merge(other.ebbox()); + ebbox_.merge(merged_ebbox(*this, other)); + } + } + else // /this/ is NOT a text line + { + if (other.type() != line::Text) + { + std::cerr << "error in 'line_info::update_bbox_and_ebox':" + << "Merging two non text lines." << std::endl; + std::abort(); + } + + update_type(line::Text); + ebbox_.merge(other.ebbox()); + ebbox_.merge(merged_ebbox(other, *this)); + } + + // Merging bboxes. + bbox_.merge(other.bbox()); + + // Make sure the ebbox is included in the image domain. 
+ ebbox_.crop_wrt(holder_.components().labeled_image().domain()); + } + + + template <typename L> + void + line_info<L>::fast_merge(line_info<L>& other, bool hide) { tag_ = line::Needs_Precise_Stats_Update; other.update_tag(line::Merged); + other.set_hidden(hide); + + // Update bbox and ebbox + update_bbox_and_ebox(other); - bbox_.merge(other.bbox()); components_.append(other.components()); } template <typename L> void - line_info<L>::precise_merge(line_info<L>& other) + line_info<L>::precise_merge(line_info<L>& other, bool hide) { - fast_merge(other); + fast_merge(other, hide); force_stats_update(); } @@ -439,7 +760,7 @@ namespace scribo line_info<L>::force_stats_update() { typedef mln_site(L) P; - const component_set<L>& comp_set = links_->component_set_(); + const component_set<L>& comp_set = holder_.components(); // FIXME: int_u<12> may not be enought but we can't use unsigned // or any other larger types since there is no median @@ -449,7 +770,7 @@ namespace scribo typedef mln::value::int_u<12> median_data_t; typedef mln::accu::stat::median_h<median_data_t> median_t; median_t - absolute_median, + absolute_meanline, absolute_baseline, char_space, char_width; @@ -460,11 +781,22 @@ namespace scribo { unsigned c = components_(i); - const box2d& bb = comp_set(c).bbox(); + const mln::box2d& bb = comp_set(c).bbox(); + + // Bounding box. + bbox.take(bb); + + // Ignore punctuation for stats computation but not for bbox + // computation. + if (holder_.components()(c).type() == component::Punctuation) + continue; + + // Space between characters. int space = bb.pmin().col() - - comp_set((*links_)[c]).bbox().pmax().col(); + - comp_set(holder_.links()(c)).bbox().pmax().col(); + // -- Ignore overlapped characters. 
if (space > 0) char_space.take(space); @@ -474,16 +806,13 @@ namespace scribo if (bb.width() <= 1000) char_width.take(bb.width()); - // Median (compute an absolute value, from the top left + // Meanline (compute an absolute value, from the top left // corner of the image). - absolute_median.take(bb.pmin().row()); + absolute_meanline.take(bb.pmin().row()); // Baseline (compute an absolute value, from the top left // corner of the image). absolute_baseline.take(bb.pmax().row()); - - // Bounding box. - bbox.take(bb); } // Finalization @@ -492,7 +821,7 @@ namespace scribo bbox_ = bbox.to_result(); // Char space - if (card() == 1) + if (char_space.card() < 2) char_space_ = 0; else char_space_ = char_space.to_result(); @@ -506,21 +835,31 @@ namespace scribo baseline_ = absolute_baseline.to_result(); - median_ = absolute_baseline - absolute_median; - x_height_ = absolute_baseline - absolute_median + 1; - d_height_ = absolute_baseline - bbox.to_result().pmax().row(); + meanline_ = absolute_meanline.to_result(); + x_height_ = absolute_baseline - absolute_meanline + 1; + d_height_ = absolute_baseline - bbox.to_result().pmax().row() + 1; a_height_ = absolute_baseline - bbox.to_result().pmin().row() + 1; //FIXME // //word_space_ = ...; //reading_direction_ = ...; - //type_ = ...; //reverse_video_ = ...; //orientation_ = ...; //reading_orientation_ = ...; //indented_ = ...; + + update_ebbox(); } + + } + + + template <typename L> + const line_set<L>& + line_info<L>::holder() const + { + return holder_; } @@ -531,10 +870,14 @@ namespace scribo return ostr << "line_info(" << "id=" << info.id() << ", tag=" << info.tag() + << ", type=" << info.type() << ", bbox=" << info.bbox() + << ", ebbox=" << info.ebbox() << ", components=" << info.components() << ", baseline=" << info.baseline() - << ", median=" << info.median() + << ", meanline=" << info.meanline() + << ", ascent=" << info.ascent() + << ", descent=" << info.descent() << ", x_height=" << info.x_height() << ", d_height=" << 
info.d_height() << ", a_height=" << info.a_height() diff --git a/scribo/core/line_set.hh b/scribo/core/line_set.hh index 2d846f0..80a79b3 100644 --- a/scribo/core/line_set.hh +++ b/scribo/core/line_set.hh @@ -1,4 +1,5 @@ -// Copyright (C) 2009 EPITA Research and Development Laboratory (LRDE) +// Copyright (C) 2009, 2010 EPITA Research and Development Laboratory +// (LRDE) // // This file is part of Olena. // @@ -46,7 +47,9 @@ # include <scribo/core/macros.hh> # include <scribo/core/line_info.hh> -# include <scribo/core/line_stats_extra.hh> + +# include <scribo/core/object_links.hh> +# include <scribo/core/object_groups.hh> namespace scribo @@ -63,17 +66,24 @@ namespace scribo struct line_set_data { line_set_data(); + line_set_data(const object_groups<L>& comp_set); line_set_data(const mln::util::array<scribo::line_info<L> >& infos, - const component_set<L>& comp_set); + const object_groups<L>& comp_set); mln::util::array<scribo::line_info<L> > infos_; - component_set<L> comp_set_; + component_set<L> components_; + object_links<L> links_; + object_groups<L> groups_; }; } // end of namespace scribo::internal + /*! \brief Lines container. + + Line ids start from 1. + */ template <typename L> class line_set { @@ -85,12 +95,11 @@ namespace scribo line_set(); /// Constructor from object groups. - line_set(const object_links<L>& links, const object_groups<L>& groups); + line_set(const object_groups<L>& groups); /// @} /// Compute line stats and fill the underlying information. - void compute_lines(const object_links<L>& links, - const object_groups<L>& groups); + void compute_lines(const object_groups<L>& groups); /// Return the line count. mln_value(L) nelements() const; @@ -115,7 +124,24 @@ namespace scribo line_set<L> duplicate() const; /// Return the underlying component set. - const component_set<L>& component_set_() const; + const component_set<L>& components() const; + + /// Return the underlying component set (non-const version). 
+ component_set<L>& components_(); + + /// Return the underlying component group. + const object_groups<L>& groups() const; + + /// Return the underlying links. + const object_links<L>& links() const; + + + /// Massive line computation + /// @{ + + void force_stats_update(); + + /// @} /// Internal methods /// @{ @@ -138,8 +164,7 @@ namespace scribo template <typename L> scribo::line_set<L> - line_set(const object_links<L>& links, - const object_groups<L>& groups); + line_set(const object_groups<L>& groups); } // End of namespace scribo::make @@ -165,9 +190,19 @@ namespace scribo template <typename L> inline + line_set_data<L>::line_set_data(const object_groups<L>& groups) + : components_(groups.components()), links_(groups.links()), + groups_(groups) + { + } + + + template <typename L> + inline line_set_data<L>::line_set_data(const mln::util::array<scribo::line_info<L> >& infos, - const component_set<L>& comp_set) - : infos_(infos), comp_set_(comp_set) + const object_groups<L>& groups) + : infos_(infos), components_(groups.components()), + links_(groups.links()), groups_(groups) { } @@ -182,28 +217,22 @@ namespace scribo { } + template <typename L> inline - line_set<L>::line_set(const object_links<L>& links, - const object_groups<L>& groups) + line_set<L>::line_set(const object_groups<L>& groups) { - compute_lines(links, groups); + compute_lines(groups); } - // FIXME: groups should have a reference to the links data and we - // should only required groups as argument. 
template <typename L> -// util::array<line_stats_extra> void - line_set<L>::compute_lines(const object_links<L>& links, - const object_groups<L>& groups) + line_set<L>::compute_lines(const object_groups<L>& groups) { - data_ = new internal::line_set_data<L>(); + data_ = new internal::line_set_data<L>(groups); typedef mln_site(L) P; - data_->comp_set_ = groups.component_set_(); - const component_set<L>& comp_set = groups.component_set_(); mln_value(L) n_groups = groups.nelements() - 1; mln::fun::i2v::array<mln_value(L)> @@ -211,93 +240,32 @@ namespace scribo n_groups, n_groups); // FIXME: object_groups should store the relation 'group -> comp'. - // it would avoid the use of accumulator arrays. - - // FIXME: int_u<11> may not be enought but we can't use unsigned - // or any other larger types since there is no median - // implementation for high quantification types... - - // Init. -// typedef mln::value::int_u<12> median_data_t; -// typedef mln::accu::stat::median_h<median_data_t> median_t; -// util::array<median_t> -// absolute_median(static_cast<unsigned>(n_groups) + 1), -// absolute_baseline(static_cast<unsigned>(n_groups) + 1), -// char_space(static_cast<unsigned>(n_groups) + 1), -// char_width(static_cast<unsigned>(n_groups) + 1); - -// util::array<mln::accu::shape::bbox<P> > -// bbox(static_cast<unsigned>(n_groups) + 1); + mln::util::array< mln::util::array<component_id_t> > + group_to_comps(unsigned(n_groups) + 1); - util::array< util::array<component_id_t> > - comps(static_cast<unsigned>(n_groups) + 1); // 1st pass - Compute data. - for (unsigned i = 1; i < packed_groups.size(); ++i) - if (comp_set(i).is_valid()) + for_all_comps(i, data_->components_) + if (data_->components_(i).is_valid()) { unsigned group_id = packed_groups(i); if (group_id != 0) // Is this component part of a group? { -// const box2d& bb = comp_set(i).bbox(); - -// // Space between characters. 
-// int space = bb.pmin().col() -// - comp_set(links[i]).bbox().pmax().col(); -// // -- Ignore overlapped characters. -// if (space > 0) -// char_space(group_id).take(space); - -// // Character width -// // -- Ignore too large components. -// if (bb.width() <= 1000) -// char_width(group_id).take(bb.width()); - -// // Median (compute an absolute value, from the top left -// // corner of the image). -// absolute_median(group_id).take(bb.pmin().row()); - -// // Baseline (compute an absolute value, from the top left -// // corner of the image). -// absolute_baseline(group_id).take(bb.pmax().row()); - -// // Bounding box. -// bbox(group_id).take(bb); - // Component id. - comps(group_id).append(i); + group_to_comps(group_id).append(i); } } // 2nd pass - Store data. - data_->infos_.reserve(groups.nelements()); + data_->infos_.reserve(group_to_comps.size()); data_->infos_.append(line_info<L>()); // line with id 0 is invalid. -// util::array<line_stats_extra> stats_extra; -// stats_extra.reserve(static_cast<unsigned>(n_groups) + 1); -// stats_extra.append(line_stats_extra()); - - for (unsigned i = 1; i <= n_groups; ++i) + for_all_groups(i, group_to_comps) { - // Add line info. - line_info<L> info(links, i, comps(i)); + line_info<L> info(*this, i, group_to_comps(i)); data_->infos_.append(info); - - - // Prepare extra stats to be returned. 
-// line_stats_extra stats(absolute_median(i) * absolute_median(i).card(), -// absolute_median(i).card(), -// absolute_baseline(i) * absolute_baseline(i).card(), -// absolute_baseline(i).card(), -// char_space(i) * char_space(i).card(), -// char_space(i).card(), -// char_width(i) * char_width(i).card(), -// char_width(i).card()); -// stats_extra.append(stats); } - -// return stats_extra; } @@ -351,7 +319,7 @@ namespace scribo { const F& f = exact(f_); - for_all_elements(i, data_->infos_) + for_all_lines_info(i, data_->infos_) if (!f(i)) data_->infos_[i].update_tag(tag); } @@ -370,11 +338,44 @@ namespace scribo template <typename L> inline const component_set<L>& - line_set<L>::component_set_() const + line_set<L>::components() const + { + return data_->components_; + } + + template <typename L> + inline + component_set<L>& + line_set<L>::components_() { - return data_->comp_set_; + return data_->components_; } + template <typename L> + inline + const object_groups<L>& + line_set<L>::groups() const + { + return data_->groups_; + } + + template <typename L> + inline + const object_links<L>& + line_set<L>::links() const + { + return data_->links_; + } + + template <typename L> + inline + void + line_set<L>::force_stats_update() + { + for_all_lines_info(i, data_->infos_) + if (data_->infos_(i).tag() == line::Needs_Precise_Stats_Update) + data_->infos_(i).force_stats_update(); + } template <typename L> inline @@ -389,7 +390,7 @@ namespace scribo void line_set<L>::init_(const line_set<L>& set) { - data_ = new internal::line_set_data<L>(set.infos_(), set.component_set_()); + data_ = new internal::line_set_data<L>(set.infos_(), set.groups()); } @@ -400,27 +401,13 @@ namespace scribo template <typename L> scribo::line_set<L> - line_set(const object_links<L>& links, - const object_groups<L>& groups) + line_set(const object_groups<L>& groups) { - mln_precondition(exact(ima).is_valid()); - scribo::line_set<L> tmp(links, groups); + mln_precondition(groups.is_valid()); + 
scribo::line_set<L> tmp(groups); return tmp; } - -// template <typename L> -// scribo::line_set<L> -// line_set(const object_links<L>& links, -// const object_groups<L>& groups, -// util::array<line_stats_extra>& line_stats) -// { -// mln_precondition(exact(ima).is_valid()); -// scribo::line_set<L> tmp; -// line_stats = tmp.compute_lines(links, groups); -// return tmp; -// } - } // end of namespace scribo::make diff --git a/scribo/core/macros.hh b/scribo/core/macros.hh index bf2afde..f644db3 100644 --- a/scribo/core/macros.hh +++ b/scribo/core/macros.hh @@ -29,8 +29,8 @@ # define for_all_ncomponents(C, NCOMP) \ for (unsigned C = 1; C <= NCOMP; ++C) -# define for_all_components(C, S) \ - for (unsigned C = 1; C < S.nelements(); ++C) +// # define for_all_components(C, S) +// for (unsigned C = 1; C <= S.nelements(); ++C) # define for_all_elements(E, S) \ for (unsigned E = 0; E < S.nelements(); ++E) @@ -43,10 +43,16 @@ # define for_all_comp_data(E, S) \ for (unsigned E = 1; E < S.nelements(); ++E) +# define for_all_links(E, S) \ + for_all_comp_data(E, S) + +# define for_all_groups(E, S) \ + for_all_comp_data(E, S) + # define for_all_lines(E, S) \ for_all_comps(E, S) -# define for_all_groups(E, S) \ +# define for_all_lines_info(E, S) \ for_all_comp_data(E, S) #endif // ! 
SCRIBO_CORE_MACROS_HH diff --git a/scribo/core/tag/component.hh b/scribo/core/tag/component.hh index 7998f08..3c061b3 100644 --- a/scribo/core/tag/component.hh +++ b/scribo/core/tag/component.hh @@ -44,11 +44,72 @@ namespace scribo enum Tag { None = 0, - Separator, Ignored }; + enum Type + { + Undefined = 0, + Character, + Separator, + Noise, + Punctuation + }; + +# ifndef MLN_INCLUDE_ONLY + + + std::ostream& + operator<<(std::ostream& ostr, const Tag& tag) + { + std::string str; + switch(tag) + { + default: + case None: + str = "None"; + break; + case Ignored: + str = "Ignored"; + break; + } + + return ostr << str; + } + + + std::ostream& + operator<<(std::ostream& ostr, const Type& type) + { + std::string str; + switch(type) + { + default: + case Undefined: + str = "Undefined"; + break; + case Character: + str = "Character"; + break; + case Separator: + str = "Separator"; + break; + case Noise: + str = "Noise"; + break; + case Punctuation: + str = "Punctuation"; + break; + } + + return ostr << str; + } + + +# endif // ! MLN_INCLUDE_ONLY + + } // end of namespace scribo::component } // end of namespace scribo diff --git a/scribo/core/tag/line.hh b/scribo/core/tag/line.hh index 383e331..f571188 100644 --- a/scribo/core/tag/line.hh +++ b/scribo/core/tag/line.hh @@ -43,7 +43,6 @@ namespace scribo enum Tag { None = 0, - Separator, Ignored, Needs_Precise_Stats_Update, Merged, @@ -77,7 +76,12 @@ namespace scribo Header, Heading, PageNumber, - Paragraph + Paragraph, + + // These types are not supported by the XSD. 
+ Punctuation, + Text, + Undefined }; @@ -104,9 +108,6 @@ namespace scribo case None: str = "None"; break; - case Separator: - str = "Separator"; - break; case Ignored: str = "Ignored"; break; @@ -160,7 +161,6 @@ namespace scribo case Caption: str = "caption"; break; - default: case Credit: str = "credit"; break; @@ -169,17 +169,34 @@ namespace scribo break; case Floating: str = "floating"; + break; case Footer: str = "footer"; + break; case Header: str = "header"; + break; case Heading: str = "heading"; + break; case PageNumber: str = "page-number"; + break; case Paragraph: str = "paragraph"; break; + + // Values unsupported by the XSD + case Punctuation: + str = "punctuation"; + break; + case Text: + str = "text"; + break; + default: + case Undefined: + str = "undefined"; + break; } return ostr << str; -- 1.5.6.5