last-svn-commit-838-g8bae071 Paragraphs extraction

--- scribo/scribo/core/line_info.hh | 199 +++++- scribo/scribo/core/stats.hh | 320 +++++++++ scribo/scribo/text/look_like_text_lines.hh | 16 + scribo/scribo/text/merging.hh | 306 ++++++--- scribo/scribo/text/paragraphs.hh | 1027 ++++++++++++++++++++++++++++ 5 files changed, 1767 insertions(+), 101 deletions(-) create mode 100644 scribo/scribo/core/stats.hh create mode 100644 scribo/scribo/text/paragraphs.hh diff --git a/scribo/scribo/core/line_info.hh b/scribo/scribo/core/line_info.hh index 73d7856..b44b379 100644 --- a/scribo/scribo/core/line_info.hh +++ b/scribo/scribo/core/line_info.hh @@ -59,6 +59,9 @@ # include <scribo/core/concept/serializable.hh> +// DEBUG + +# include <scribo/core/stats.hh> namespace scribo { @@ -138,6 +141,9 @@ namespace scribo // Line set holding this element. line_set<L> holder_; + // DEBUG + stats< float > meanline_clusters_; + stats< float > baseline_clusters_; }; @@ -279,7 +285,8 @@ namespace scribo /// @} - + bool chars_same_width() const; + unsigned get_first_char_height() const; /// Force a new computation of statistics. void force_stats_update(); @@ -307,6 +314,9 @@ namespace scribo void update_components_type(component::Type type); + int compute_baseline(); + int compute_meanline(); + private: // Attributes line_id_t id_; mln::util::tracked_ptr<data_t> data_; @@ -817,7 +827,7 @@ namespace scribo int line_info<L>::delta_of_line() const { - return char_width() + 2 * char_space(); + return 2 * char_width() + 2 * char_space(); // FIXME: choose between: // not enough: char_width + char_space // too much: 2 * char_width @@ -967,6 +977,145 @@ namespace scribo } template <typename L> + bool + line_info<L>::chars_same_width() const + { + // Only for the case of two-character words + if (card() == 2) + { + const component_set<L>& comp_set = data_->holder_.components(); + + const unsigned c1 = data_->components_(0); + const unsigned c2 = data_->components_(1); + + if (data_->holder_.components()(c1).type() == component::Punctuation + || data_->holder_.components()(c2).type() == component::Punctuation) + return false; + + const mln::box2d& bb1 = comp_set(c1).bbox(); + const mln::box2d& bb2 = comp_set(c2).bbox(); + + const float w1 = bb1.width(); + const float h1 = bb1.height(); + const float w2 = bb2.width(); + const float h2 = bb2.height(); + + const float space = std::max(bb1.pmin().col(), bb2.pmin().col()) - + std::min(bb1.pmax().col(), bb2.pmax().col()); + + const int dy = bb1.pmax().row() - bb2.pmax().row(); + + // The two characters must be distinct + if (space < 0) + return false; + + if (// Approximately the same width + ((std::max(w1, w2) / std::min(w1, w2)) > 1.1f || + // One character must not be smaller than the space between + // the two characters + (w1 < space || w2 < space)) + // If the two characters have a different width they must also + // have a different height + && not (std::max(h1, h2) / std::min(h1, h2) <= 1.5f)) + return false; + + // Approximately aligned on baseline + if (std::abs(dy) > 10) + return false; + + return true; + } + + return false; + } + + template< typename L > + unsigned + line_info<L>::get_first_char_height() const + { + const component_set<L>& comp_set = data_->holder_.components(); + const unsigned c1 = data_->components_(0); + const mln::box2d& bb1 = comp_set(c1).bbox(); + + return bb1.height(); + } + + template <typename L> + int + line_info<L>::compute_baseline() + { + const unsigned nelements = data_->baseline_clusters_.nelements(); + + if (nelements == 2) + return data_->baseline_clusters_.mean(); + + mln::util::array< cluster_stats< float > >& clusters_b = data_->baseline_clusters_.clusters(); + + unsigned index = 0; + float min_base = 0.0f; + const unsigned clusters_b_nelements = clusters_b.nelements(); + + for (unsigned i = 0; i < clusters_b_nelements; ++i) + { + const unsigned clusters_b_i_nelements = clusters_b[i].nelements(); + + if (clusters_b_i_nelements >= min_base * 2.0f) + { + min_base = clusters_b_i_nelements; + index = i; + } + else if (clusters_b_i_nelements >= 0.5f * min_base) + { + if (clusters_b_i_nelements > 1 && + clusters_b[index].median() > clusters_b[i].median()) + { + if (clusters_b_i_nelements > min_base) + min_base = clusters_b_i_nelements; + index = i; + } + } + } + + if (clusters_b[index].nelements() <= 2 && nelements <= 5) + return data_->baseline_clusters_.mean(); + + return clusters_b[index].median(); + } + + template <typename L> + int + line_info<L>::compute_meanline() + { + mln::util::array< cluster_stats< float > >& clusters_m = data_->meanline_clusters_.clusters(); + + unsigned index = 0; + float max_mean = 0.0f; + const unsigned clusters_m_nelements = clusters_m.nelements(); + + for (unsigned i = 0; i < clusters_m_nelements; ++i) + { + const unsigned clusters_m_i_nelements = clusters_m[i].nelements(); + + if (clusters_m_i_nelements >= max_mean * 2.0f) + { + max_mean = clusters_m_i_nelements; + index = i; + } + else if (clusters_m_i_nelements >= 0.5f * max_mean) + { + if (clusters_m[index].median() < clusters_m[i].median()) + { + if (clusters_m_i_nelements > max_mean) + max_mean = clusters_m_i_nelements; + index = i; + } + } + } + + return clusters_m[index].median(); + } + + template <typename L> void line_info<L>::force_stats_update() { @@ -977,8 +1126,8 @@ namespace scribo typedef mln::value::int_u<12> median_data_t; typedef mln::accu::stat::median_h<median_data_t> median_t; median_t - meanline, - baseline, + // meanline, + // baseline, char_space, char_width; @@ -1002,6 +1151,10 @@ namespace scribo mln::def::coord ref_line = mln_max(mln::def::coord); + // DEBUG + data_->baseline_clusters_.reset(); + data_->meanline_clusters_.reset(); + // Find a reference line to compute baselines and other attributes. // Workaround to avoid overflow with int_u<12> in median accumulators. // @@ -1040,18 +1193,18 @@ namespace scribo // incremented. ++used_comps; - // Compute boldness - boldness.take(comp_set(c).features().boldness); - sum2_boldness += mln::math::sqr<float>(comp_set(c).features().boldness); + // // Compute boldness + // boldness.take(comp_set(c).features().boldness); + // sum2_boldness += mln::math::sqr<float>(comp_set(c).features().boldness); - // Compute color - color_red.take(comp_set(c).features().color.red()); - color_green.take(comp_set(c).features().color.green()); - color_blue.take(comp_set(c).features().color.blue()); + // // Compute color + // color_red.take(comp_set(c).features().color.red()); + // color_green.take(comp_set(c).features().color.green()); + // color_blue.take(comp_set(c).features().color.blue()); - sum2_red += mln::math::sqr<unsigned>(comp_set(c).features().color.red()); - sum2_green += mln::math::sqr<unsigned>(comp_set(c).features().color.green()); - sum2_blue += mln::math::sqr<unsigned>(comp_set(c).features().color.blue()); + // sum2_red += mln::math::sqr<unsigned>(comp_set(c).features().color.red()); + // sum2_green += mln::math::sqr<unsigned>(comp_set(c).features().color.green()); + // sum2_blue += mln::math::sqr<unsigned>(comp_set(c).features().color.blue()); // FIXME: we must guaranty here that the relationship is from @@ -1086,12 +1239,14 @@ namespace scribo // Meanline (compute an absolute value, from the top left corner // of the highest character bounding box, excluding // punctuation). - meanline.take(bb.pmin().row() - ref_line); + // meanline.take(bb.pmin().row() - ref_line); + data_->meanline_clusters_.take(bb.pmin().row()); // Baseline (compute an absolute value, from the top left corner // of the highest character bounding box, excluding // punctuation). - baseline.take(bb.pmax().row() - ref_line); + // baseline.take(bb.pmax().row() - ref_line); + data_->baseline_clusters_.take(bb.pmax().row()); } // Finalization @@ -1143,12 +1298,14 @@ namespace scribo else data_->char_width_ = char_width.to_result(); - mln::def::coord - absolute_baseline_r = baseline.to_result() + ref_line, - absolute_meanline_r = meanline.to_result() + ref_line; + // // mln::def::coord + // // absolute_baseline_r = baseline.to_result() + ref_line, + // // absolute_meanline_r = meanline.to_result() + ref_line; - data_->baseline_ = absolute_baseline_r; - data_->meanline_ = absolute_meanline_r; + // data_->baseline_ = absolute_baseline_r; + // data_->meanline_ = absolute_meanline_r; + data_->baseline_ = compute_baseline(); + data_->meanline_ = compute_meanline(); data_->x_height_ = data_->baseline_ - data_->meanline_ + 1; data_->d_height_ = data_->baseline_ - bbox.to_result().pmax().row(); data_->a_height_ = data_->baseline_ - bbox.to_result().pmin().row() + 1; diff --git a/scribo/scribo/core/stats.hh b/scribo/scribo/core/stats.hh new file mode 100644 index 0000000..570325c --- /dev/null +++ b/scribo/scribo/core/stats.hh @@ -0,0 +1,320 @@ +#ifndef STATS_HH_ +# define STATS_HH_ + +# include <vector> +# include <algorithm> + +# include <mln/util/array.hh> + +using namespace mln; + +//--------------------------------------------------------------------------- +// compare_values +//--------------------------------------------------------------------------- + +template< typename T > +struct compare_values +{ + bool operator() (const T& lhs, + const T& rhs) + { + return (lhs < rhs); + } +}; + + +//--------------------------------------------------------------------------- +// cluster_stats +//--------------------------------------------------------------------------- + +template< typename T > +class cluster_stats +{ +public: + cluster_stats() + : mean_needs_update_(false), median_needs_update_(false), + variance_needs_update_(false), std_needs_update_(false), size_(0) + { + } + + cluster_stats(const unsigned size) + : mean_needs_update_(false), median_needs_update_(false), + variance_needs_update_(false), std_needs_update_(false), size_(0) + { + data_.reserve(size); + } + + void reset() + { + std::vector< T >& data = data_.hook_std_vector_(); + data.clear(); + + size_ = 0; + needs_update(); + } + + void take(const T& value) + { + if (not size_) + { + min_ = value; + max_ = value; + } + else + { + if (value < min_) + min_ = value; + else if (value > max_) + max_ = value; + } + + ++size_; + data_.append(value); + needs_update(); + } + + T mean() + { + if (mean_needs_update_) + { + mean_ = 0; + + for (unsigned i = 0; i < size_; ++i) + mean_ += data_[i]; + + mean_ /= size_; + mean_needs_update_ = false; + } + + return mean_; + } + + T median() + { + if (median_needs_update_) + { + std::vector< T >& data = data_.hook_std_vector_(); + std::sort(data.begin(), data.end(), compare_values< T >()); + + median_ = data[(size_ - 1) >> 1]; + median_needs_update_ = false; + } + + return median_; + } + + T variance() + { + if (variance_needs_update_) + { + mean(); + variance_ = 0; + + for (unsigned i = 0; i < size_; ++i) + { + const T tmp = mean_ - data_[i]; + + variance_ += (tmp * tmp); + } + + variance_ /= size_; + std_ = sqrt(variance_); + + variance_needs_update_ = false; + std_needs_update_ = false; + } + + return variance_; + } + + T standard_deviation() + { + if (std_needs_update_) + variance(); + + return std_; + } + + T min() { return min_; } + T max() { return max_; } + + void sort() + { + std::vector< T >& data = data_.hook_std_vector_(); + std::sort(data.begin(), data.end(), compare_values< T >()); + } + + unsigned nelements() { return size_; } + + T operator[] (const unsigned index) + { + return data_[index]; + } + +private: + void needs_update() + { + mean_needs_update_ = true; + median_needs_update_ = true; + variance_needs_update_ = true; + std_needs_update_ = true; + } + +private: + bool mean_needs_update_ : 1; + bool median_needs_update_ : 1; + bool variance_needs_update_ : 1; + bool std_needs_update_ : 1; + T mean_; + T median_; + T min_; + T max_; + T variance_; + T std_; + unsigned size_; + util::array< T > data_; +}; + +//--------------------------------------------------------------------------- +// stats +//--------------------------------------------------------------------------- + +template< typename T > +class stats +{ +public: + stats() + : clusters_need_update_(true), data_sorted_(false) + { + } + + stats(const int size) + : clusters_need_update_(true), data_sorted_(false), data_(size) + { + } + + void reset() + { + data_.reset(); + std::vector< cluster_stats< T > >& clusters = clusters_.hook_std_vector_(); + clusters.clear(); + data_sorted_ = false; + clusters_need_update_ = true; + } + + void take(const T& value) + { + data_.take(value); + clusters_need_update_ = true; + } + + T mean() + { + return data_.mean(); + } + + T median() + { + return data_.median(); + } + + T variance() + { + return data_.variance(); + } + + T standard_deviation() + { + return data_.standard_deviation(); + } + + T min() { return data_.min(); } + T max() { return data_.max(); } + + unsigned nelements() { return data_.nelements(); } + + util::array< cluster_stats< T > >& clusters() + { + if (clusters_need_update_) + { + compute_clusters(); + clusters_need_update_ = false; + } + + return clusters_; + } + +private: + void compute_clusters() + { + std::vector< unsigned > clusters; + unsigned cluster_index = 1; + + clusters.reserve(data_.nelements()); + + if (not data_sorted_) + { + data_.sort(); + data_sorted_ = true; + } + + unsigned i = 0; + const unsigned nelements = data_.nelements(); + + clusters[0] = cluster_index; + const T std = data_.standard_deviation(); + + for (i = 1; i < nelements - 1; ++i) + { + const T left_distance = data_[i] - data_[i - 1]; + const T right_distance = data_[i + 1] - data_[i]; + + if (not ((left_distance <= 2 || left_distance < right_distance) + && left_distance <= std)) + ++cluster_index; + + clusters[i] = cluster_index; + } + + if (nelements > 1 + && data_[i] - data_[i - 1] > std) + ++cluster_index; + + clusters[i] = cluster_index; + + clusters_.clear(); + clusters_.reserve(cluster_index); + cluster_index = 1; + + i = 0; + while (i < nelements) + { + unsigned tmp = i; + + while (tmp < nelements && clusters[tmp] == clusters[i]) + ++tmp; + + cluster_stats< T > cluster(tmp - i); + + tmp = i; + while (tmp < nelements && clusters[tmp] == clusters[i]) + { + cluster.take(data_[tmp]); + ++tmp; + } + + clusters_.append(cluster); + + i = tmp; + ++cluster_index; + } + } + +private: + bool clusters_need_update_ : 1; + bool data_sorted_ : 1; + cluster_stats< T > data_; + util::array< cluster_stats< T > > clusters_; +}; + +#endif /* STATS_HH_ */ diff --git a/scribo/scribo/text/look_like_text_lines.hh b/scribo/scribo/text/look_like_text_lines.hh index 2ced7ce..80d9995 100644 --- a/scribo/scribo/text/look_like_text_lines.hh +++ b/scribo/scribo/text/look_like_text_lines.hh @@ -64,6 +64,22 @@ namespace scribo inline bool looks_like_a_text_line(const scribo::line_info<L>& l) { + // Special case for two-letter words + if (l.card() == 2) + { + const float ratio = (float) l.bbox().width() / l.bbox().height(); + + if (// Minimal width / height ratio + ratio > 0.4f && ratio < 2.0f + // Minimal height + && l.bbox().height() >= 15 + // Characters must have approximately the same width + && l.chars_same_width()) + { + return true; + } + } + return l.card() >= 3 // at least 3 components && l.bbox().height() > 10 // and minimal height diff --git a/scribo/scribo/text/merging.hh b/scribo/scribo/text/merging.hh index 26e2da7..8ee349b 100644 --- a/scribo/scribo/text/merging.hh +++ b/scribo/scribo/text/merging.hh @@ -188,21 +188,24 @@ namespace scribo swap_ordering(l1, l2); parent[l2] = l1; // The smallest label value is root. - if (lines(l2).card() > lines(l1).card()) + line_info<L>& l1_info = lines(l1); + line_info<L>& l2_info = lines(l2); + + if (l2_info.card() > l1_info.card()) { // we transfer data from the largest item to the root one. - scribo::line_info<L> tmp = lines(l1); - std::swap(lines(l1), lines(l2)); - lines(l1).fast_merge(tmp); + scribo::line_info<L> tmp = l1_info; + std::swap(l1_info, l2_info); + l1_info.fast_merge(tmp); // We must set manually the tag for lines(l2) since it is // not used directly in merge process so its tag cannot be // updated automatically. - lines(l2).update_tag(line::Merged); - lines(l2).set_hidden(true); + l2_info.update_tag(line::Merged); + l2_info.set_hidden(true); } else - lines(l1).fast_merge(lines(l2)); + l1_info.fast_merge(l2_info); // l1's tag is automatically set to line::Needs_Precise_Stats_Update // l2's tag is automatically set to line::Merged @@ -226,34 +229,57 @@ namespace scribo bool between_separators(const scribo::line_info<L>& l1, const scribo::line_info<L>& l2) { - // No separators found in image. + // No separators found in image. mln_precondition(l1.holder().components().has_separators()); - unsigned - col1 = l1.bbox().pcenter().col(), - col2 = l2.bbox().pcenter().col(); + const box2d& l1_bbox = l1.bbox(); + const box2d& l2_bbox = l2.bbox(); + + const unsigned + col1 = l1_bbox.pcenter().col(), + col2 = l2_bbox.pcenter().col(); const mln_ch_value(L, bool)& separators = l1.holder().components().separators(); + // Checking for separators starting from 1 / 4, 3/ 4 and the + // center of the box typedef const bool* sep_ptr_t; - sep_ptr_t sep_ptr, end; + sep_ptr_t sep_ptr, sep_ptr_top, sep_ptr_bottom, end; if (col1 < col2) { - sep_ptr = &separators(l1.bbox().pcenter()); + const unsigned quarter = + ((l1_bbox.pcenter().row() - l1_bbox.pmin().row()) >> 1); + + sep_ptr = &separators(l1_bbox.pcenter()); + sep_ptr_top = &separators(point2d(l1_bbox.pmin().row() + quarter, + l1_bbox.pcenter().col())); + sep_ptr_bottom = &separators(point2d(l1_bbox.pmax().row() - quarter, + l1_bbox.pcenter().col())); end = sep_ptr + col2 - col1; } else { - sep_ptr = &separators(l2.bbox().pcenter()); + const unsigned quarter = + ((l2_bbox.pcenter().row() - l2_bbox.pmin().row()) >> 1); + + sep_ptr = &separators(l2_bbox.pcenter()); + sep_ptr_top = &separators(point2d(l2_bbox.pmin().row() + quarter, + l2_bbox.pcenter().col())); + sep_ptr_bottom = &separators(point2d(l2_bbox.pmax().row() - quarter, + l2_bbox.pcenter().col())); end = sep_ptr + col1 - col2; } // If sep_ptr is true, then a separator is reached. - while (!*sep_ptr && sep_ptr != end) + while (!*sep_ptr && !*sep_ptr_top && !*sep_ptr_bottom && sep_ptr != end) + { ++sep_ptr; + ++sep_ptr_top; + ++sep_ptr_bottom; + } - return *sep_ptr; + return (*sep_ptr || *sep_ptr_top || *sep_ptr_bottom); } @@ -266,16 +292,78 @@ namespace scribo */ template <typename L> - bool lines_can_merge(const scribo::line_info<L>& l1, + bool lines_can_merge(scribo::line_info<L>& l1, const scribo::line_info<L>& l2) { // Parameters. - const float x_ratio_max = 1.7, baseline_delta_max = 3; + const float x_ratio_max = 1.7f; + const float baseline_delta_max = + 0.5f * std::min(l1.x_height(), l2.x_height()); + + const box2d& l1_bbox = l1.bbox(); + const box2d& l2_bbox = l2.bbox(); + + const point2d& l1_pmin = l1_bbox.pmin(); + const point2d& l2_pmin = l2_bbox.pmin(); + const point2d& l1_pmax = l1_bbox.pmax(); + const point2d& l2_pmax = l2_bbox.pmax(); + + const bool l1_has_separators = l1.holder().components().has_separators(); + const bool l1_l2_between_separators = (l1_has_separators) ? + between_separators(l1, l2) : false; + const float l_ted_cw = l2.char_width(); + + const float dx = std::max(l1_pmin.col(), l2_pmin.col()) + - std::min(l1_pmax.col(), l2_pmax.col()); + const float dy = std::max(l1_pmin.row(), l2_pmin.row()) + - std::min(l1_pmax.row(), l2_pmax.row()); + + // Particular case of " + { + if (// Must have 2 characters + (l1.card() == 2 + // The box height must be smaller than the touched line x height + && l1_bbox.height() < l2.x_height()) + // The line must be vertically and horizontally close to + // the touched line + && (dx < l_ted_cw && dy < 0) + // No separator between the two lines + && not (l1_l2_between_separators)) + { + // Line is then considered as punctuation + l1.update_type(line::Punctuation); + return true; + } + } + + // Particular case like merging between a line and [5] + { + const mln::def::coord + top_row_l2 = l2_pmin.row(), + top_row_l1 = l1_pmin.row(), + bot_row = l2_pmax.row(); + const float x1 = l1.x_height(), x2 = l2.x_height(); + const float x_ratio = std::max(x1, x2) / std::min(x1, x2); + + if (// No separator + !l1_l2_between_separators + // The x height ration must be lower than 2 + && (x_ratio < 2.0f) + // Baseline alignment + && (std::abs(bot_row - l1.baseline()) < baseline_delta_max) + // The top of the boxes must be aligned + && (std::abs(top_row_l2 - top_row_l1) < 5) + // Distance between the line and the touched line. + && dx < 5.0f * l_ted_cw) + { + return true; + } + } // Similarity of x_height. { - float x1 = l1.x_height(), x2 = l2.x_height(); - float x_ratio = std::max(x1, x2) / std::min(x1, x2); + const float x1 = l1.x_height(), x2 = l2.x_height(); + const float x_ratio = std::max(x1, x2) / std::min(x1, x2); if (x_ratio > x_ratio_max) return false; } @@ -287,22 +375,21 @@ namespace scribo } // left / right - unsigned - col1 = l1.bbox().pcenter().col(), - col2 = l2.bbox().pcenter().col(); + const unsigned + col1 = l1_bbox.pcenter().col(), + col2 = l2_bbox.pcenter().col(); if (col1 < col2) { - if ((col1 + l1.bbox().width() / 4) >= (col2 - l2.bbox().width() / 4)) + if ((col1 + l1_bbox.width() / 4) >= (col2 - l2_bbox.width() / 4)) return false; } else - if ((col2 + l2.bbox().width() / 4) >= (col1 - l1.bbox().width() / 4)) + if ((col2 + l2_bbox.width() / 4) >= (col1 - l1_bbox.width() / 4)) return false; - // Check that there is no separator in between. - if (l1.holder().components().has_separators()) - return ! between_separators(l1, l2); + if (l1_has_separators) + return ! l1_l2_between_separators; return true; } @@ -310,14 +397,14 @@ namespace scribo - template <typename L> - int horizontal_distance(const scribo::line_info<L>& l1, - const scribo::line_info<L>& l2) + inline + int horizontal_distance(const box2d& l1, + const box2d& l2) { - if (l1.bbox().pcenter().col() < l2.bbox().pcenter().col()) - return l2.bbox().pmin().col() - l1.bbox().pmax().col(); + if (l1.pcenter().col() < l2.pcenter().col()) + return l2.pmin().col() - l1.pmax().col(); else - return l1.bbox().pmin().col() - l2.bbox().pmax().col(); + return l1.pmin().col() - l2.pmax().col(); } @@ -339,7 +426,7 @@ namespace scribo */ template <typename L> - bool non_text_and_text_can_merge(const scribo::line_info<L>& l_cur, // current + bool non_text_and_text_can_merge(scribo::line_info<L>& l_cur, // current const scribo::line_info<L>& l_ted) // touched { if (l_cur.type() == line::Text || l_ted.type() != line::Text) @@ -353,22 +440,42 @@ namespace scribo && between_separators(l_cur, l_ted)) return false; + const box2d& l_cur_bbox = l_cur.bbox(); + const box2d& l_ted_bbox = l_ted.bbox(); + + const point2d& l_cur_pmin = l_cur_bbox.pmin(); + const point2d& l_ted_pmin = l_ted_bbox.pmin(); + const point2d& l_cur_pmax = l_cur_bbox.pmax(); + const point2d& l_ted_pmax = l_ted_bbox.pmax(); + + const float dx = std::max(l_cur_pmin.col(), l_ted_pmin.col()) + - std::min(l_cur_pmax.col(), l_ted_pmax.col()); + const float l_ted_cw = l_ted.char_width(); + const float l_ted_x_height = l_ted.x_height(); + + const unsigned l_cur_height = l_cur_bbox.height(); + const unsigned l_cur_width = l_cur_bbox.width(); // General case (for tiny components like --> ',:."; <--): - if (l_cur.bbox().height() < l_ted.x_height() - && float(l_cur.bbox().width()) / float(l_cur.card()) < l_ted.char_width()) + if (l_cur_height < l_ted_x_height + && l_cur_height > 0.05f * l_ted_x_height + && float(l_cur_width) / float(l_cur.card()) < l_ted.char_width() + && dx < l_ted_cw) + { + l_cur.update_type(line::Punctuation); return true; - + } // Special case for '---': if (// small height: - l_cur.bbox().height() < l_ted.x_height() + l_cur_height < l_ted_x_height // // not so long width: - && l_cur.bbox().width() < 5 * l_ted.char_width() + && l_cur_width > 0.8 * l_ted_cw + && l_cur_width < 5 * l_ted_cw // align with the 'x' center: && std::abs((l_ted.baseline() + l_ted.meanline()) / 2 - l_cur.bbox().pcenter().row()) < 7 // tiny spacing: - && unsigned(horizontal_distance(l_cur, l_ted)) < l_ted.char_width() + && unsigned(horizontal_distance(l_cur_bbox, l_ted_bbox)) < 2 * l_ted_cw ) { return true; @@ -378,10 +485,11 @@ namespace scribo // Special case // Looking for alignement. - mln::def::coord + const mln::def::coord top_row = l_cur.bbox().pmin().row(), bot_row = l_cur.bbox().pmax().row(); + const box2d& l_ted_ebbox = l_ted.ebbox(); // std::cout << "top_row = " << top_row << " - bot_row = " << bot_row << std::endl; // std::cout << std::abs(bot_row - l_ted.baseline()) @@ -402,10 +510,11 @@ namespace scribo // << std::endl; if ((std::abs(bot_row - l_ted.baseline()) < 5 - || std::abs(bot_row - l_ted.ebbox().pmax().row()) < 5) + || std::abs(bot_row - l_ted_ebbox.pmax().row()) < 5) && (std::abs(top_row - l_ted.meanline()) < 5 - || std::abs(top_row - l_ted.ebbox().pmin().row()) < 5)) + || std::abs(top_row - l_ted_ebbox.pmin().row()) < 5) + && dx < 5.0f * l_ted_cw) { return true; } @@ -536,35 +645,33 @@ namespace scribo if (parent[l] != l) // not a root, so has already merged, thus ignore it continue; - box2d b = lines(l).bbox(); + const box2d& b = lines(l).bbox(); - unsigned tl, tr, ml, mc, mr, bl, br; + // unsigned tl, tr, ml, mc, mr, bl, br; + + const box2d& b_ = lines(l).ebbox(); + + /* + tl tr + x---------------x + | | + | mc | + ml x x x mr + | | + | | + x---------------x + bl br + + */ - { - box2d b_ = lines(l).ebbox(); - - /* - tl tr - x---------------x - | | - | mc | - ml x x x mr - | | - | | - x---------------x - bl br - - */ - - - tl = billboard(b_.pmin()); - tr = billboard.at_(b_.pmin().row(), b_.pmax().col()); - ml = billboard.at_(b_.pcenter().row(), b_.pmin().col()); - mc = billboard.at_(b_.pcenter().row(), b_.pcenter().col()); - mr = billboard.at_(b_.pcenter().row(), b_.pmax().col()); - bl = billboard.at_(b_.pmax().row(), b_.pmin().col()); - br = billboard(b_.pmax()); - } + + const unsigned tl = billboard(b_.pmin()); + const unsigned tr = billboard.at_(b_.pmin().row(), b_.pmax().col()); + const unsigned ml = billboard.at_(b_.pcenter().row(), b_.pmin().col()); + const unsigned mc = billboard.at_(b_.pcenter().row(), b_.pcenter().col()); + const unsigned mr = billboard.at_(b_.pcenter().row(), b_.pmax().col()); + const unsigned bl = billboard.at_(b_.pmax().row(), b_.pmin().col()); + const unsigned br = billboard(b_.pmax()); typedef std::set<unsigned> set_t; std::set<unsigned> labels; @@ -596,11 +703,47 @@ namespace scribo { // Main case: it is an "included" box (falling in an already drawn box) - if (lines(l).type() == line::Text) // the current object IS a text line + const line_info<L>& l_info = lines(l); + const line_info<L>& mc_info = lines(mc); + + if (l_info.type() == line::Text) // the current object IS a text line { - if (lines(mc).type() == line::Text) // included in a text line => weird + if (mc_info.type() == line::Text) // included in a text line => weird { ++count_txtline_IN_txtline; + + // Particular case of " + // { + // if ((lines(l).card() == 2 && + // lines(l).bbox().height() < lines(mc).x_height()) && + // not (lines(l).holder().components().has_separators() + // && between_separators(lines(l), + // lines(mc)))) + + const box2d& l_bbox = l_info.bbox(); + const box2d& mc_bbox = mc_info.bbox(); + + const point2d& l_pmin = l_bbox.pmin(); + const point2d& mc_pmin = mc_bbox.pmin(); + const point2d& l_pmax = l_bbox.pmax(); + const point2d& mc_pmax = mc_bbox.pmax(); + + const float dx = std::max(l_pmin.col(), mc_pmin.col()) + - std::min(l_pmax.col(), mc_pmax.col()); + const float dy = std::max(l_pmin.row(), mc_pmin.row()) + - std::min(l_pmax.row(), mc_pmax.row()); + const float l_ted_cw = mc_info.char_width(); + + // We accept a line included into another only if it + // is horizontally close to the line's bbox and + // vertically aligned + // Obviously no separators between the two lines + if (dx < l_ted_cw && dy < 0 + && not (l_info.holder().components().has_separators() + && between_separators(l_info, mc_info))) + l = do_union(lines, l, mc, parent); + // } + // std::cout << "weird: inclusion of a txt_line in a txt_line!" << std::endl; /// Merge is perform if the current line is a @@ -649,15 +792,15 @@ namespace scribo // could be noise or garbage... So adding new // criterions could fix this issue. // - //if (!non_text_and_text_can_merge(lines(l), lines(mc))) - // continue; + if (!non_text_and_text_can_merge(lines(l), lines(mc))) + continue; // Avoid the case when a large title ebbox overlap // with a text column. In that case, the title may // merge with punctuation from the text. - if (lines(l).holder().components().has_separators() - && between_separators(lines(l), lines(mc))) - continue; + // if (lines(l).holder().components().has_separators() + // && between_separators(lines(l), lines(mc))) + // continue; // Mark current line as punctuation. lines(l).update_type(line::Punctuation); @@ -832,9 +975,12 @@ namespace scribo bool operator()(const scribo::line_id_t& l1, const scribo::line_id_t& l2) const { - if (lines_(l1).bbox().nsites() == lines_(l2).bbox().nsites()) + const unsigned l1_nsites = lines_(l1).bbox().nsites(); + const unsigned l2_nsites = lines_(l2).bbox().nsites(); + + if (l1_nsites == l2_nsites) return l1 < l2; - return lines_(l1).bbox().nsites() < lines_(l2).bbox().nsites(); + return l1_nsites < l2_nsites; } scribo::line_set<L> lines_; diff --git a/scribo/scribo/text/paragraphs.hh b/scribo/scribo/text/paragraphs.hh new file mode 100644 index 0000000..a3ff802 --- /dev/null +++ b/scribo/scribo/text/paragraphs.hh @@ -0,0 +1,1027 @@ +#include <mln/util/array.hh> +#include <mln/accu/shape/bbox.hh> +#include <mln/core/image/image2d.hh> +#include <mln/core/alias/neighb2d.hh> +#include <mln/draw/box.hh> +#include <mln/data/convert.hh> +#include <mln/value/int_u16.hh> +#include <mln/value/label_16.hh> +#include <mln/value/int_u8.hh> +#include <mln/value/rgb8.hh> +#include <mln/io/ppm/save.hh> +#include <mln/io/pgm/save.hh> +#include <mln/geom/rotate.hh> +#include <mln/literal/colors.hh> + +#include <scribo/core/macros.hh> +#include <scribo/core/line_set.hh> +#include <scribo/core/line_info.hh> + +using namespace mln; + +namespace scribo +{ + + namespace internal + { + +//------------------------------------- +// Extracting root of links +//------------------------------------- + template <typename T> + inline + unsigned + find_root(util::array<T>& parent, unsigned x) + { + unsigned tmp_x = x; + + while (parent(tmp_x) != tmp_x) + tmp_x = parent(tmp_x); + + while (parent(x) != x) + { + const unsigned tmp = parent(x); + x = parent(x); + parent(tmp) = tmp_x; + } + + return x; + } + } + + namespace filter + { + +//--------------------------------------------------------------------- +// This method aims to cut the links between lines that do not fit the +// different criteria +//--------------------------------------------------------------------- + + template <typename L> + inline + void paragraph_links(const util::array<value::int_u16>& left, + const util::array<value::int_u16>& right, + util::array<value::int_u16>& output, + const util::array< line_info<L> >& lines, + const image2d<bool>& input) + { + output = left; + + const unsigned nlines = lines.nelements(); + + // image2d<value::rgb8> links = data::convert(value::rgb8(), input); + // for (unsigned l = 0; l < nlines; ++l) + // { + // mln::draw::line(links, lines(l).bbox().pcenter(), lines(left(l)).bbox().pcenter(), literal::red); + // } + // mln::io::ppm::save(links, "out_links.ppm"); + + // For each line + for (unsigned l = 0; l < nlines; ++l) + { + // Neighbors + + const value::int_u16 left_nbh = output(l); + const value::int_u16 right_nbh = right(l); + const value::int_u16 lol_nbh = output(left_nbh); + + // Line features + const float x_height = lines(l).x_height(); + const float left_x_height = lines(left_nbh).x_height(); + const float right_x_height = lines(right_nbh).x_height(); + + const box2d& left_line_bbox = lines(left_nbh).bbox(); + const box2d& current_line_bbox = lines(l).bbox(); + const box2d& right_line_bbox = lines(right_nbh).bbox(); + const box2d& lol_line_bbox = lines(lol_nbh).bbox(); // lol : left neighbor of the left neighbor + + const int lline_col_min = left_line_bbox.pmin().col(); + const int cline_col_min = current_line_bbox.pmin().col(); + const int rline_col_min = right_line_bbox.pmin().col(); + const int lolline_col_min = lol_line_bbox.pmin().col(); + + const int lline_col_max = left_line_bbox.pmax().col(); + const int cline_col_max = current_line_bbox.pmax().col(); + const int rline_col_max = right_line_bbox.pmax().col(); + + const int lline_cw = lines(left_nbh).char_width(); + const int cline_cw = lines(l).char_width(); + const int rline_cw = lines(right_nbh).char_width(); + // Maximal x variation to consider two lines vertically aligned + const int delta_alignment = cline_cw; + + // Checks the baseline distances of the two neighbors + { + // Current line baseline + const int c_baseline = lines(l).baseline(); + + // Baseline distance with the left and right neighbors + const int lc_baseline = lines(left_nbh).baseline() - c_baseline; + const int rc_baseline = c_baseline -lines(right_nbh).baseline(); + + // Max baseline distance between the two neighbors + // const float delta_baseline_max = std::max(lc_baseline, rc_baseline); + // const float delta_baseline_min = std::min(lc_baseline, + // rc_baseline); + + // Only two lines, meaning the current line has only one neighbor + bool two_lines = false; + + // If the current line has no left neighbor + if (lc_baseline == 0) + { + // ror : right neighbor of the right neighbor + const value::int_u16 ror_nbh = right(right_nbh); + const box2d& ror_line_bbox = lines(ror_nbh).bbox(); + + // If the current line has a ror + if (ror_nbh != right_nbh + && output(ror_nbh) == right_nbh) + { + // Distance between the current line and the right neighbor + const float right_distance = + current_line_bbox.pcenter().row() - right_line_bbox.pcenter().row(); + // Distance between the right neighbor and the ror + const float ror_distance = + right_line_bbox.pcenter().row() - ror_line_bbox.pcenter().row(); + // ror x_height + const float ror_x_height = lines(ror_nbh).x_height(); + + // Conditions to cut the link between the current line + // and its right neighbor + if (right_distance > 1.4f * ror_distance + && std::max(ror_x_height, right_x_height) < + 1.2f * std::min(ror_x_height, right_x_height) + && output(right_nbh) == l) + { + output(right_nbh) = right_nbh; + continue; + } + } + // Otherwise we only have a group of two lines + else + { + // We determine the distance between the two lines + const float distance = lines(l).meanline() - lines(right_nbh).baseline(); + two_lines = true; + + // If the distance between the two lines is greater than + // the minimum x height of the two lines then we cut the + // link between them + if (distance > std::min(x_height, right_x_height) + && output(right_nbh) == l) + { + output(right_nbh) = right_nbh; + continue; + } + } + + // Lines features + const float min_x_height = std::min(x_height, right_x_height); + const float max_x_height = std::max(x_height, right_x_height); + const float min_char_width = std::min(rline_cw, cline_cw); + const float max_char_width = std::max(rline_cw, cline_cw); + + // Condition to cut the link between the current line and + // its right neighbor + if ((max_x_height > min_x_height * 1.2f) && + !(max_char_width <= 1.2f * min_char_width)) + { + if (output(right_nbh) == l) + { + output(right_nbh) = right_nbh; + continue; + } + } + + // If we only have two lines we stop the study + if (two_lines) + continue; + } + // If the current line has no right neighbor + else if (rc_baseline == 0) + { + // lol : left neighbor of the left neighbor + + // If the left neighbor of the current line has a left neighbor + if (lol_nbh != left_nbh) + { + // Distance between the current line and its left neighbor + const float left_distance = + left_line_bbox.pcenter().row() - current_line_bbox.pcenter().row(); + // Distance between the left neighbor and the left + // neighbor of its left neighbor + const float lol_distance = + lol_line_bbox.pcenter().row() - left_line_bbox.pcenter().row(); + // lol x height + const float lol_x_height = lines(lol_nbh).x_height(); + + // Conditions to cut the link between the current line + // and its left neighbor + if (left_distance > 1.4f * lol_distance + && std::max(lol_x_height, left_x_height) < + 1.2f * std::min(lol_x_height, left_x_height)) + { + output(l) = l; + continue; + } + } + // Otherwise we only have a group of two lines + else + { + // Distance between the current line and it left neighbor + const float distance = lines(left_nbh).meanline() - + lines(l).baseline(); + + two_lines = true; + + // If the distance is greater than the min x height + // between the two lines + if (distance > std::min(x_height, left_x_height)) + { + output(l) = l; + continue; + } + } + + // Lines features + const float min_x_height = std::min(x_height, left_x_height); + const float max_x_height = std::max(x_height, left_x_height); + const float min_char_width = std::min(lline_cw, cline_cw); + const float max_char_width = std::max(lline_cw, cline_cw); + + // Condition to cut the link between the current line and + // its left neighbor + if ((max_x_height > min_x_height * 1.2f) && + !(max_char_width <= 1.2f * min_char_width)) + { + output(l) = l; + continue; + } + + // If we only have two lines we stop the study + if (two_lines) + continue; + } + // The current line has at least one left and one right neighbor + else // if (delta_baseline_max >= delta_baseline_min) + { + // Distance between the left and the current line + const float left_distance = + lines(left_nbh).meanline() - lines(l).baseline(); + // Distance between the right and the current line + const float right_distance = + lines(l).meanline() - lines(right_nbh).baseline(); + + // If the left line is too far compared to the right one + // we cut the link with it + if (left_distance > 1.2f * right_distance + && std::max(x_height, left_x_height) > 1.2f * std::min(x_height, left_x_height)) + { + output(l) = l; + continue; + } + // If the right line is too far compared to the left one + // we cut the link with it + else if (right_distance > 1.2f * left_distance + && std::max(x_height, right_x_height) > 1.2f * std::min(x_height, right_x_height) + && output(right_nbh) == l) + { + output(right_nbh) = right_nbh; + continue; + } + + // If the distance between the baseline of the left + // neighbor and the baseline of the current line is + // greater than the one between the current line baseline + // and the right line baseline we have to study the texte + // features of the right and left lines + if (lc_baseline > rc_baseline) + { + const float cw_max = std::max(lline_cw, cline_cw); + const float cw_min = std::min(lline_cw, cline_cw); + const float min_x_height = std::min(x_height, left_x_height); + const float max_x_height = std::max(x_height, left_x_height); + + if ((max_x_height > min_x_height * 1.2f) && + !(cw_max <= 1.2f * cw_min)) + { + output(l) = l; + continue; + } + + { + const float min_x_height = std::min(x_height, right_x_height); + const float max_x_height = std::max(x_height, right_x_height); + const float cw_max = std::max(rline_cw, cline_cw); + const float cw_min = std::min(rline_cw, cline_cw); + + if ((max_x_height > min_x_height * 1.2f) + && !(cw_max <= 1.2f * cw_min) + && output(right_nbh) == l) + { + output(right_nbh) = right_nbh; + continue; + } + } + } + else + { + const float cw_max = std::max(rline_cw, cline_cw); + const float cw_min = std::min(rline_cw, cline_cw); + const float min_x_height = std::min(x_height, right_x_height); + const float max_x_height = std::max(x_height, right_x_height); + + if ((max_x_height > min_x_height * 1.2f) + && !(cw_max <= 1.2f * cw_min) + && output(right_nbh) == l) + { + output(right_nbh) = right_nbh; + continue; + } + + { + const float min_x_height = std::min(x_height, left_x_height); + const float max_x_height = std::max(x_height, left_x_height); + const float cw_max = std::max(lline_cw, cline_cw); + const float cw_min = std::min(lline_cw, cline_cw); + + if ((max_x_height > min_x_height * 1.2f) + && !(cw_max <= 1.2f * cw_min)) + { + output(l) = l; + continue; + } + } + } + } + } + + // If we arrive here, it means than the lines in the + // neighborhood of the current line are quite similar. We can + // then begin to study the indentations in order to determine + // the beginning of new paragraphs + +//----------------------------------------------------------------------------- +// ___________________________ +// |___________________________| +// ________________________ +// |________________________| +// ___________________________ +// |___________________________| +// ___________________________ +// |___________________________| +// +// Simple case : paragraphs are justified on the left. We try to find any +// indentation like above. +// +//----------------------------------------------------------------------------- + + { + // Check if the current line neighbors are aligned + bool left_right_aligned = false; + bool left_lol_aligned = false; + const int dx_lr = std::abs(lline_col_min - rline_col_min); + const int dx_llol = std::abs(lline_col_min - lolline_col_min); + + if (dx_lr < delta_alignment) + left_right_aligned = true; + + if (dx_llol < delta_alignment) + left_lol_aligned = true; + + if (left_right_aligned && left_lol_aligned) + { + const int left_right_col_min = std::min(lline_col_min, rline_col_min); + const int dx_lrc = std::abs(left_right_col_min - cline_col_min); + const float l_char_width = 1.5f * lines(l).char_width(); + + if (dx_lrc > l_char_width && + dx_lrc < 3.0f * l_char_width && + cline_col_min > rline_col_min && + cline_col_min > lline_col_min) + { + output(right_nbh) = right_nbh; + continue; + } + } + } + +//----------------------------------------------------------------------------- +// ___________________________ +// |___________________________| +// ___________________ +// |___________________| End of the paragraph - Current line +// ________________________ +// |________________________| Beginning of a new one +// ___________________________ +// |___________________________| Left of left of current line +// +// End of paragraph case : we try to find an end to the current paragraph +// +//----------------------------------------------------------------------------- + + { + // Check if the current line neighbors are aligned + bool left_right_max_aligned = false; + bool left_current_min_aligned = false; + bool lol_current_min_aligned = false; + const bool lol_is_left = output(left_nbh) == left_nbh; + const int dx_lr_max = std::abs(lline_col_max - rline_col_max); + const int dx_lc_min = std::abs(lline_col_min - cline_col_min); + const int dx_lolc_min = std::abs(lolline_col_min - cline_col_min); + + if (dx_lr_max < delta_alignment) + left_right_max_aligned = true; + + if (dx_lc_min < delta_alignment) + left_current_min_aligned = true; + + if (dx_lolc_min < delta_alignment) + lol_current_min_aligned = true; + + if (!left_current_min_aligned && left_right_max_aligned && + (lol_current_min_aligned || lol_is_left)) + { + const int dx_lrc = std::abs(lline_col_max - cline_col_max); + const int l_char_width = lines(l).char_width(); + + if (dx_lrc > l_char_width && + cline_col_max < lline_col_max && + cline_col_min < lline_col_min && + (lline_col_min > lolline_col_min || lol_is_left)) + { + output(l) = l; + continue; + } + } + } + + +//----------------------------------------------------------------------------- +// ___________________________ +// |___________________________| +// ___________________________ +// |___________________________| +// ________________________ +// |________________________| +// +// Simple case : paragraphs are justified on the left. We try to find any +// indentation like above at the end of a column. +// +//----------------------------------------------------------------------------- + + if (left_nbh == l) + { + const value::int_u16 ror_nbh = right(right_nbh); + const box2d& ror_line_bbox = lines(ror_nbh).bbox(); + const int rorline_col_min = ror_line_bbox.pmin().col(); + + bool right_ror_min_aligned = false; + const int dx_rror_min = std::abs(rline_col_min - rorline_col_min); + + if (dx_rror_min < delta_alignment) + right_ror_min_aligned = true; + + if (right_ror_min_aligned) + { + const int right_ror_col_min = std::min(rline_col_min, rorline_col_min); + const int dx_rrorc = std::abs(right_ror_col_min - cline_col_min); + const float l_char_width = 1.5f * lines(l).char_width(); + + if (dx_rrorc > l_char_width && + dx_rrorc < 3.0f * l_char_width && + cline_col_min > rline_col_min && + cline_col_max >= rline_col_max) + { + output(right_nbh) = right_nbh; + continue; + } + } + } + } + + + // Only debug + + { + image2d<value::rgb8> debug = data::convert(value::rgb8(), input); + + for (unsigned i = 0; i < output.nelements(); ++i) + output(i) = internal::find_root(output, i); + + mln::util::array<accu::shape::bbox<point2d> > nbbox(output.nelements()); + for (unsigned i = 0; i < nlines; ++i) + { + // if (lines(i).is_textline()) + // { + // mln::draw::box(debug, lines(i).bbox(), literal::red); + nbbox(output(i)).take(lines(i).bbox()); + // } + } + + for (unsigned i = 0; i < nbbox.nelements(); ++i) + if (nbbox(i).is_valid()) + { + box2d b = nbbox(i).to_result(); + mln::draw::box(debug, b, literal::orange); + b.enlarge(1); + mln::draw::box(debug, b, literal::orange); + b.enlarge(1); + mln::draw::box(debug, b, literal::orange); + } + + mln::io::ppm::save(debug, "out_paragraph.ppm"); + } + + } + } + +//------------------------------------------------------------- +// Preparation of the lines before linking them. +// For each line we draw the top and the bottom of it. +// Assuming than i is the number of the line. Then the top of the line +// will be affected with the value 2 * i in the block image and the +// bottom with 2 * i + 1. +// +//------------------------------------------------------------- + + template <typename L> + inline + void prepare_lines(const box2d& domain, + const util::array< line_info<L> >& lines, + image2d<value::int_u16>& blocks, + util::array<box2d>& rbbox) + { + std::map< int, std::vector< const box2d* > > drawn_lines; + const unsigned nlines = lines.nelements(); + + // For each line + for (unsigned l = 0; l < nlines; ++l) + { + // Rotation of the bounding box + box2d b = geom::rotate(lines(l).bbox(), -90, domain.pcenter()); + rbbox.append(b); + + const unsigned index = l + 1; + const unsigned even_index = 2 * index; + const unsigned odd_index = even_index + 1; + + // Top of the line + { + bool not_finished = true; + int col_offset = 0; + + while (not_finished) + { + // Looking for a column in the image to draw the top of the + // line + + const int col = b.pmax().col() + col_offset; + std::map< int, std::vector< const box2d* > >::iterator it + = drawn_lines.find(col); + + if (it != drawn_lines.end()) + { + const std::vector< const box2d* >& lines = (*it).second; + const unsigned nb_lines = lines.size(); + unsigned i = 0; + + for (i = 0; i < nb_lines; ++i) + { + const box2d* box = lines[i]; + const int min_row = std::max(b.pmin().row(), box->pmin().row()); + const int max_row = std::min(b.pmax().row(), box->pmax().row()); + + if (min_row - max_row <= 0) + break; + } + + if (i == nb_lines) + { + mln::draw::line(blocks, point2d(b.pmin().row(), col), + point2d(b.pmax().row(), col), even_index); + not_finished = false; + drawn_lines[col].push_back(&(rbbox[l])); + } + else + ++col_offset; + } + else + { + mln::draw::line(blocks, point2d(b.pmin().row(), col), + point2d(b.pmax().row(), col), even_index); + not_finished = false; + drawn_lines[col].push_back(&(rbbox[l])); + } + } + } + + // Bottom of the line + { + bool not_finished = true; + int col_offset = 0; + + while (not_finished) + { + // Looking for a column in the image to draw the bottom of + // the line + + const int col = b.pmin().col() - col_offset; + std::map< int, std::vector< const box2d* > >::iterator it + = drawn_lines.find(col); + + if (it != drawn_lines.end()) + { + const std::vector< const box2d* >& lines = (*it).second; + const unsigned nb_lines = lines.size(); + unsigned i = 0; + + for (i = 0; i < nb_lines; ++i) + { + const box2d* box = lines[i]; + const int min_row = std::max(b.pmin().row(), box->pmin().row()); + const int max_row = std::min(b.pmax().row(), box->pmax().row()); + + if (min_row - max_row <= 0) + break; + } + + if (i == nb_lines) + { + mln::draw::line(blocks, point2d(b.pmin().row(), col), + point2d(b.pmax().row(), col), odd_index); + not_finished = false; + drawn_lines[col].push_back(&(rbbox[l])); + } + else + ++col_offset; + } + else + { + mln::draw::line(blocks, point2d(b.pmin().row(), col), + point2d(b.pmax().row(), col), odd_index); + not_finished = false; + drawn_lines[col].push_back(&(rbbox[l])); + } + } + } + } + } + + template <typename L> + inline + void + process_left_link(image2d<value::int_u16>& blocks, + const util::array<box2d>& rbbox, + const util::array< line_info<L> >& lines, + util::array<value::int_u16>& left) + { + typedef value::int_u16 V; + + // At the beginning each line is its own neighbor + for (unsigned i = 0; i < left.nelements(); ++i) + left(i) = i; + + const unsigned nlines = lines.nelements(); + + // For each line + for (unsigned i = 0; i < nlines; ++i) + { + // Max distance for the line search + int dmax = 1.5f * lines(i).x_height(); + + // Starting points in the current line box + point2d c = rbbox(i).pcenter(); + point2d q(rbbox(i).pmin().row() + ((c.row() - rbbox(i).pmin().row()) / 4), c.col()); + + int + midcol = (rbbox(i).pmax().col() + - rbbox(i).pmin().col()) / 2; + + // Left + { + // marge gauche + int + nleftima = c.col() - blocks.domain().pmin().col(), + // Distance gauche + nleft = std::min(nleftima, midcol + dmax); + + V + // Starting points in the box + *p = &blocks(c), + *p2 = &blocks(q), + // End of search + *pstop = p - nleft - 1, + // Line neighbor + *nbh_p = 0; + + // While we haven't found a neighbor or reached the limit + for (; p != pstop; --p, --p2) + { + if (*p2 != literal::zero // Not the background + && ((*p2 % 2) == 0) // Looking for the bottom of a line + && left((*p2 >> 1) - 1) != i) // No loops + { + // Neightbor found, we stop the research + nbh_p = p2; + break; + } + + if (*p != literal::zero // Not the background + && ((*p % 2) == 0) // Looking for the bottom of a line + && left((*p >> 1) - 1) != i) // No loops + { + // Neightbor found, we stop the research + nbh_p = p; + break; + } + } + + // If a neighbor was found, then we have found the top of the + // line. We are then looking for the bottom of the encountered + // line. If during the search process we find a complete line + // included in the touched line, this line is considered as + // the neighbor under certain conditions (see below) + + //--------------------------------------------------------------- + // _________________________ | + // |_________________________| => Current line | Search direction + // v + // => First encountered top line + // __________________________________________________ 2Q + // | Q | + // | _________________________ |2P + // | |_____________P___________| => Second top |2P + 1 + // | line | + // |__________________________________________________|2Q + 1 + // + // + //--------------------------------------------------------------- + + if (nbh_p) + { + std::vector<V> lines_nbh; + const V end_p = *nbh_p + 1; + const V* nbh_p_copy = nbh_p; + + for (; *nbh_p != end_p; --nbh_p) + { + if ((*nbh_p) != literal::zero) // Not the background + { + if ((*nbh_p) % 2 == 0)// We have found the top of + // another line + lines_nbh.push_back(*nbh_p); + else + { + // We have found the bottom of a line. We are looking if + // we have already encountered the top of this + // line. If so, we link the current line with this one + // under certain conditions: + + if (std::find(lines_nbh.begin(), lines_nbh.end(), + (*nbh_p) - 1) != lines_nbh.end()) + { + // If we can link the complete line with the current line + if (// It must be in the search range + nbh_p > pstop + // Avoid loops + && left(((*nbh_p - 1) >> 1) - 1) != i) + left(i) = ((*nbh_p - 1) >> 1) - 1; + + // We have found a complete line so we stop the search + break; + } + } + } + } + + + // If we haven't found any included line in the first + // neighbor, then the line is considered as the neighbor of + // the current line + if (*nbh_p == end_p) + left(i) = (*nbh_p_copy >> 1) - 1; + } + } + } + } + + + // We assume that the lines have been rotated + template <typename L> + inline + void + process_right_link(image2d<value::int_u16>& blocks, + const util::array<box2d>& rbbox, + const util::array< line_info<L> >& lines, + util::array<value::int_u16>& right) + { + typedef value::int_u16 V; + + // At the beginning each line is its own neighbor + for (unsigned i = 0; i < right.nelements(); ++i) + right(i) = i; + + const unsigned nlines = lines.nelements(); + + // For each line + for (unsigned i = 0; i < nlines; ++i) + { + // Max distance for the line search + int dmax = 1.5f * lines(i).x_height(); + + // Starting points in the current line box + point2d c = rbbox(i).pcenter(); + point2d q(rbbox(i).pmax().row() - ((rbbox(i).pmax().row() - c.row()) / 4), c.col()); + + int + midcol = (rbbox(i).pmax().col() + - rbbox(i).pmin().col()) / 2; + + // Right + { + int + nrightima = geom::ncols(blocks) - c.col() + blocks.domain().pmin().col(), + nright = std::min(nrightima, midcol + dmax); + + V + // Starting points in the box + *p = &blocks(c), + *p2 = &blocks(q), + // End of search + *pstop = p + nright - 1, + // Line neighbor + *nbh_p = 0; + + // While we haven't found a neighbor or reached the limit + for (; p != pstop; ++p, ++p2) + { + if (*p2 != literal::zero // Not the background + && ((*p2 % 2) == 1) // Looking for the bottom of a line + && right(((*p2 - 1) >> 1) - 1) != i) // No loops + { + // Neightbor found, we stop the research + nbh_p = p2; + break; + } + + if (*p != literal::zero // Not the background + && ((*p % 2) == 1) // Looking for the bottom of a line + && right(((*p - 1) >> 1) - 1) != i) // No loops + { + // Neightbor found, we stop the research + nbh_p = p; + break; + } + } + + // If a neighbor was found, then we have found the bottom of the + // line. We are then looking for the top of the encountered + // line. If during the search process we find a complete line + // included in the touched line, this line is considered as + // the neighbor under certain conditions (see below) + + //--------------------------------------------------------------- + // + // + // __________________________________________________ 2Q + // | Q | + // | _________________________ |2P + // | |_____________P___________| => Second bottom |2P + 1 + // | line | + // |__________________________________________________|2Q + 1 + // => First encountered bottom line + // _________________________ ^ + // |_________________________| => Current line | Search direction + // | + //--------------------------------------------------------------- + + if (nbh_p) + { + std::vector<V> lines_nbh; + const V end_p = *nbh_p - 1; + const V* nbh_p_copy = nbh_p; + + for (; *nbh_p != end_p; ++nbh_p) + { + if (*nbh_p != literal::zero) // Not the background + { + if (*nbh_p % 2 == 1) // We have found the bottom of + // another line + lines_nbh.push_back(*nbh_p); + else + { + // We have found the top of a line. We are looking if + //we have already encountered the bottom of this + // line. If so, we link the current line with this one + // under certain conditions: + + if (std::find(lines_nbh.begin(), lines_nbh.end(), + *nbh_p + 1) != lines_nbh.end()) + { + // If we can link the complete line with the current line + if (// It must be in the search range + nbh_p < pstop + // Avoid loops + && right((*nbh_p >> 1) - 1) != i) + right(i) = (*nbh_p >> 1) - 1; + + // We have found a complete line, so we stop the search + break; + } + } + } + } + + // If we haven't found any included line in the first + // neighbor, then the line is considered as the neighbor of + // the current line + + if (*nbh_p == end_p) + right(i) = ((*nbh_p_copy - 1) >> 1) - 1; + } + } + } + } + +//----------------------------------------------------------------------- +// Finalizing the links by merging information extracted from the left +// and right links +//----------------------------------------------------------------------- + + template< typename L > + inline + void finalize_links(util::array<value::int_u16>& left, + util::array<value::int_u16>& right, + const util::array< line_info<L> >& lines) + { + const unsigned nlines = lines.nelements(); + + for (unsigned i = 0; i < nlines; ++i) + { + const unsigned left_value = left(i); + const unsigned right_value = right(i); + + // If the right neighbor of my left neighbor is itself then its + // right neighbor is me + { + value::int_u16& v = right(left_value); + + if (v == left_value) + v = i; + } + + // If the left neighbor of my right neighbor is itself then its + // left neighbor is me + { + value::int_u16& v = left(right_value); + + if (v == right_value) + v = i; + } + } + } + + template <typename L> + inline + void extract_paragraphs(line_set<L>& lines, + const image2d<bool>& input) + { + typedef value::int_u16 V; + + image2d<V> blocks(geom::rotate(input.domain(), -90, input.domain().pcenter())); + data::fill(blocks, 0); + + util::array< line_info<L> > lines_info; + + for_all_lines(l, lines) + { + if (lines(l).is_textline()) + lines_info.append(lines(l)); + } + + const unsigned nlines = lines_info.nelements(); + util::array<box2d> rbbox; + util::array<V> left(nlines); + util::array<V> right(nlines); + util::array<V> output; + + rbbox.reserve(nlines); + output.reserve(nlines); + + std::cout << "Preparing lines" << std::endl; + prepare_lines(input.domain(), lines_info, blocks, rbbox); +// io::pgm::save(blocks, "blocks.pgm"); + std::cout << "Linking left" << std::endl; + process_left_link(blocks, rbbox, lines_info, left); + std::cout << "Linking right" << std::endl; + process_right_link(blocks, rbbox, lines_info, right); + std::cout << "Finalizing links" << std::endl; + finalize_links(left, right, lines_info); + // std::cout << "Finalizing merging" << std::endl; + // finalize_line_merging(left, right, lines); + std::cout << "Extracting paragraphs" << std::endl; + filter::paragraph_links(left, right, output, lines_info, input); + } +} -- 1.5.6.5
participants (1)
-
Julien Marquegnies