last-svn-commit-881-g973e5ac Improve paragraph grouping for historical documents.

* scribo/core/line_info.hh, * scribo/text/merging.hh, * scribo/text/paragraphs.hh: Here. --- scribo/ChangeLog | 8 + scribo/scribo/core/line_info.hh | 18 ++- scribo/scribo/text/merging.hh | 16 +- scribo/scribo/text/paragraphs.hh | 388 ++++++++++++++++++++++++++++++++++---- 4 files changed, 378 insertions(+), 52 deletions(-) diff --git a/scribo/ChangeLog b/scribo/ChangeLog index a28bdf7..5e29749 100644 --- a/scribo/ChangeLog +++ b/scribo/ChangeLog @@ -1,3 +1,11 @@ +2011-05-18 Julien Marquegnies <marquegnies@lrde.epita.fr> + + Improve paragraph grouping for historical documents. + + * scribo/core/line_info.hh, + * scribo/text/merging.hh, + * scribo/text/paragraphs.hh: Here. + 2011-05-17 Guillaume Lazzara <z@lrde.epita.fr> Fix use of skeleton_constrained. diff --git a/scribo/scribo/core/line_info.hh b/scribo/scribo/core/line_info.hh index e91e1c3..9017174 100644 --- a/scribo/scribo/core/line_info.hh +++ b/scribo/scribo/core/line_info.hh @@ -1000,21 +1000,22 @@ namespace scribo const int dy = bb1.pmax().row() - bb2.pmax().row(); // The two characters must be distinct - if (space < 0) - return false; + // if (space < 0) + // return false; if (// Approximately the same width - ((std::max(w1, w2) / std::min(w1, w2)) > 1.1f || + ((std::max(w1, w2) / std::min(w1, w2)) > 1.3 || // One character must not be smaller than the space between // the two characters (w1 < space || w2 < space)) // If the two characters have a different width they must also // have a different height - && not (std::max(h1, h2) / std::min(h1, h2) <= 1.5f)) + && not (std::max(h1, h2) / std::min(h1, h2) <= 1.7f)) return false; // Approximately aligned on baseline - if (std::abs(dy) > 10) + if (std::abs(dy) > 10 && + not (std::max(h1, h2) / std::min(h1, h2) <= 1.7f)) return false; return true; @@ -1049,6 +1050,10 @@ namespace scribo float min_base = 0.0f; const unsigned clusters_b_nelements = clusters_b.nelements(); + if (clusters_b_nelements >= 3) + return data_->baseline_clusters_.mean(); + + for 
(unsigned i = 0; i < clusters_b_nelements; ++i) { const unsigned clusters_b_i_nelements = clusters_b[i].nelements(); @@ -1086,6 +1091,9 @@ namespace scribo float max_mean = 0.0f; const unsigned clusters_m_nelements = clusters_m.nelements(); + if (clusters_m_nelements >= 3) + return data_->meanline_clusters_.mean(); + for (unsigned i = 0; i < clusters_m_nelements; ++i) { const unsigned clusters_m_i_nelements = clusters_m[i].nelements(); diff --git a/scribo/scribo/text/merging.hh b/scribo/scribo/text/merging.hh index f691188..2763882 100644 --- a/scribo/scribo/text/merging.hh +++ b/scribo/scribo/text/merging.hh @@ -461,8 +461,8 @@ namespace scribo if (l_cur_height < l_ted_x_height && l_cur_height > 0.05f * l_ted_x_height && float(l_cur_width) / float(l_cur.card()) < l_ted.char_width() - && dx < l_ted_cw - && l_cur_pmin.row() < l_ted_pmax.row()) + && dx < 2 * l_ted_cw + && l_cur_pmin.row() < l_ted.baseline()) { l_cur.update_type(line::Punctuation); return true; @@ -741,11 +741,12 @@ namespace scribo // vertically aligned // Obviously no separators between the two lines if ((l_info.card() <= 5 || - (std::abs(l_info.baseline() - mc_info.baseline()) < 5 - && std::abs(l_info.meanline() - mc_info.meanline()) < 5)) - && dx < l_ted_cw && dy < 0 - && not (l_info.holder().components().has_separators() - && between_separators(l_info, mc_info))) + (std::abs(l_info.baseline() - mc_info.baseline()) + < 5 && std::abs(l_info.meanline() - + mc_info.meanline()) < 5)) + && dx < l_ted_cw && dy < 0 + && not (l_info.holder().components().has_separators() + && between_separators(l_info, mc_info))) l = do_union(lines, l, mc, parent); // } @@ -1047,7 +1048,6 @@ namespace scribo // ts = t.stop(); // std::cout << "time " << ts << std::endl; - lines.force_stats_update(); return lines; diff --git a/scribo/scribo/text/paragraphs.hh b/scribo/scribo/text/paragraphs.hh index 6c9285b..9a59e02 100644 --- a/scribo/scribo/text/paragraphs.hh +++ b/scribo/scribo/text/paragraphs.hh @@ -30,30 +30,140 @@ 
namespace scribo //------------------------------------- // Extracting root of links //------------------------------------- - template <typename T> - inline - unsigned - find_root(util::array<T>& parent, unsigned x) - { - unsigned tmp_x = x; + template <typename T> + inline + unsigned + find_root(util::array<T>& parent, unsigned x) + { + unsigned tmp_x = x; - while (parent(tmp_x) != tmp_x) - tmp_x = parent(tmp_x); + while (parent(tmp_x) != tmp_x) + tmp_x = parent(tmp_x); - while (parent(x) != x) - { - const unsigned tmp = parent(x); - x = parent(x); - parent(tmp) = tmp_x; - } + while (parent(x) != x) + { + const unsigned tmp = parent(x); + x = parent(x); + parent(tmp) = tmp_x; + } + + return x; + } - return x; + template <typename T> + inline + void + set_root(util::array<T>& parent, unsigned x, const unsigned root) + { + while (parent(x) != x && parent(x) != root) + { + const unsigned tmp = parent(x); + x = parent(x); + parent(tmp) = root; } + + parent(x) = root; + } + } namespace filter { + template <typename L> + inline + bool + between_horizontal_separator(const scribo::line_info<L>& l1, + const scribo::line_info<L>& l2) + { + // No separators found in image. 
+ mln_precondition(l1.holder().components().has_separators()); + + const box2d& l1_bbox = l1.bbox(); + const box2d& l2_bbox = l2.bbox(); + + unsigned + row1 = l1_bbox.pcenter().row(), + row2 = l2_bbox.pcenter().row(); + const mln_ch_value(L, bool)& + separators = l1.holder().components().separators(); + + unsigned row; + unsigned col_ptr; + unsigned left_col_ptr; + unsigned right_col_ptr; + unsigned end; + + if (row1 < row2) + { + row1 = l1_bbox.pmax().row(); + row2 = l2_bbox.pmin().row(); + + const unsigned quarter = + ((l1_bbox.pcenter().col() - l1_bbox.pmin().col()) >> 2); + + row = l1_bbox.pcenter().row(); + col_ptr = l1_bbox.pcenter().col(); + left_col_ptr = l1_bbox.pmin().col() + quarter; + right_col_ptr = l1_bbox.pmax().col() - quarter; + end = row2; + } + else + { + row2 = l2_bbox.pmax().row(); + row1 = l1_bbox.pmin().row(); + + const unsigned quarter = + ((l2_bbox.pcenter().col() - l2_bbox.pmin().col()) >> 2); + + row = l2_bbox.pcenter().row(); + col_ptr = l2_bbox.pcenter().col(); + left_col_ptr = l2_bbox.pmin().col() + quarter; + right_col_ptr = l2_bbox.pmax().col() - quarter; + end = row1; + } + + // If sep_ptr is true, then a separator is reached. 
+ while (row < end) + { + ++row; + if (separators.at_(row, col_ptr) + || separators.at_(row, left_col_ptr) + || separators.at_(row, right_col_ptr)) + return true; + } + + return false; + } + + + template <typename L> + bool may_have_another_left_link(const line_links<L>& right, + const line_id_t& index, + const line_id_t& current_line, + const line_set<L>& lines) + { + const line_info<L>& l = lines(current_line); + const point2d& pmin = l.bbox().pmin(); + const unsigned x1 = l.x_height(); + + for_all_links(i, right) + if (i != index && right(i) == index) + { + const line_info<L>& l_info = lines(i); + const unsigned x2 = l_info.x_height(); + + const float delta_max = 0.5f * std::min(x1, x2); + + if (l_info.bbox().pmin().col() < pmin.col() + && std::abs(l.baseline() - l_info.baseline()) < delta_max + ) + return true; + } + + return false; + } + //--------------------------------------------------------------------- // This method aims to cut the links between lines that do not fit the // different criteria @@ -62,7 +172,7 @@ namespace scribo template <typename L> inline void paragraph_links(const line_links<L>& left, - const line_links<L>& right, + line_links<L>& right, line_links<L>& output, const line_set<L>& lines) { @@ -83,9 +193,27 @@ namespace scribo { // Neighbors - const line_id_t left_nbh = output(l); - const line_id_t right_nbh = right(l); - const line_id_t lol_nbh = output(left_nbh); + line_id_t left_nbh = output(l); + line_id_t right_nbh = right(l); + line_id_t lol_nbh = output(left_nbh); + + const line_info<L>& left_line = lines(left_nbh); + const line_info<L>& current_line = lines(l); + const line_info<L>& right_line = lines(right_nbh); + + if (right_line.holder().components().has_separators() && + between_horizontal_separator(right_line, current_line)) + { + output(right_nbh) = right_nbh; + right_nbh = l; + } + if (current_line.holder().components().has_separators() && + between_horizontal_separator(current_line, left_line)) + { + output(l) = l; + 
left_nbh = l; + lol_nbh = l; + } // Line features const float x_height = lines(l).x_height(); @@ -112,6 +240,7 @@ namespace scribo // Maximal x variation to consider two lines vertically aligned const int delta_alignment = cline_cw; + // Checks the baseline distances of the two neighbors { // Current line baseline @@ -151,7 +280,7 @@ namespace scribo // and its right neighbor if (right_distance > 1.4f * ror_distance && std::max(ror_x_height, right_x_height) < - 1.2f * std::min(ror_x_height, right_x_height) + 1.4f * std::min(ror_x_height, right_x_height) && output(right_nbh) == l) { output(right_nbh) = right_nbh; @@ -184,7 +313,7 @@ namespace scribo // Condition to cut the link between the current line and // its right neighbor - if ((max_x_height > min_x_height * 1.2f) && + if ((max_x_height > min_x_height * 1.4f) && !(max_char_width <= 1.2f * min_char_width)) { if (output(right_nbh) == l) @@ -220,7 +349,7 @@ namespace scribo // and its left neighbor if (left_distance > 1.4f * lol_distance && std::max(lol_x_height, left_x_height) < - 1.2f * std::min(lol_x_height, left_x_height)) + 1.4f * std::min(lol_x_height, left_x_height)) { output(l) = l; continue; @@ -252,7 +381,7 @@ namespace scribo // Condition to cut the link between the current line and // its left neighbor - if ((max_x_height > min_x_height * 1.2f) && + if ((max_x_height > min_x_height * 1.4f) && !(max_char_width <= 1.2f * min_char_width)) { output(l) = l; @@ -264,18 +393,18 @@ namespace scribo continue; } // The current line has at least one left and one right neighbor - else // if (delta_baseline_max >= delta_baseline_min) + else // if (delta_baseline_max >= 1.1 * delta_baseline_min) { // Distance between the left and the current line - const float left_distance = - lines(left_nbh).meanline() - lines(l).baseline(); + const float + left_distance = left_line_bbox.pcenter().row() - current_line_bbox.pcenter().row(); // Distance between the right and the current line - const float right_distance = - 
lines(l).meanline() - lines(right_nbh).baseline(); + const float + right_distance = current_line_bbox.pcenter().row() - right_line_bbox.pcenter().row();; // If the left line is too far compared to the right one // we cut the link with it - if (left_distance > 1.2f * right_distance + if (left_distance > 1.5f * right_distance && std::max(x_height, left_x_height) > 1.2f * std::min(x_height, left_x_height)) { output(l) = l; @@ -283,8 +412,8 @@ namespace scribo } // If the right line is too far compared to the left one // we cut the link with it - else if (right_distance > 1.2f * left_distance - && std::max(x_height, right_x_height) > 1.2f * std::min(x_height, right_x_height) + else if (right_distance > 1.5f * left_distance + && std::max(x_height, right_x_height) >= 1.2f * std::min(x_height, right_x_height) && output(right_nbh) == l) { output(right_nbh) = right_nbh; @@ -303,7 +432,7 @@ namespace scribo const float min_x_height = std::min(x_height, left_x_height); const float max_x_height = std::max(x_height, left_x_height); - if ((max_x_height > min_x_height * 1.2f) && + if ((max_x_height > min_x_height * 1.4f) && !(cw_max <= 1.2f * cw_min)) { output(l) = l; @@ -316,7 +445,7 @@ namespace scribo const float cw_max = std::max(rline_cw, cline_cw); const float cw_min = std::min(rline_cw, cline_cw); - if ((max_x_height > min_x_height * 1.2f) + if ((max_x_height > min_x_height * 1.4f) && !(cw_max <= 1.2f * cw_min) && output(right_nbh) == l) { @@ -332,7 +461,7 @@ namespace scribo const float min_x_height = std::min(x_height, right_x_height); const float max_x_height = std::max(x_height, right_x_height); - if ((max_x_height > min_x_height * 1.2f) + if ((max_x_height > min_x_height * 1.4f) && !(cw_max <= 1.2f * cw_min) && output(right_nbh) == l) { @@ -346,7 +475,7 @@ namespace scribo const float cw_max = std::max(lline_cw, cline_cw); const float cw_min = std::min(lline_cw, cline_cw); - if ((max_x_height > min_x_height * 1.2f) + if ((max_x_height > min_x_height * 1.4f) && 
!(cw_max <= 1.2f * cw_min)) { output(l) = l; @@ -445,8 +574,11 @@ namespace scribo { const int dx_lrc = std::abs(lline_col_max - cline_col_max); const int l_char_width = lines(l).char_width(); + const int dx_indent = std::abs(std::max(lline_col_min, + rline_col_min) - cline_col_min); if (dx_lrc > l_char_width && + dx_indent < 4 * delta_alignment && cline_col_max < lline_col_max && cline_col_min < lline_col_min && (lline_col_min > lolline_col_min || lol_is_left)) @@ -457,6 +589,172 @@ namespace scribo } } +//----------------------------------------------------------------------------- +// ___________________________ +// |___________________________| +// ________________________ +// |________________________| +// ___________________________ +// |___________________________| +// ___________________________ +// |___________________________| +// +// Simple case : paragraphs are justified on the left. We try to find any +// indentation like above. +// +//----------------------------------------------------------------------------- + + { + // Check if the current line neighbors are aligned + bool left_right_aligned = false; + bool left_lol_aligned = false; + const int dx_lr = std::abs(lline_col_min - rline_col_min); + const int dx_llol = std::abs(lline_col_min - lolline_col_min); + + if (dx_lr < delta_alignment) + left_right_aligned = true; + + if (dx_llol < delta_alignment) + left_lol_aligned = true; + + if (left_right_aligned && left_lol_aligned) + { + const int left_right_col_min = std::min(lline_col_min, rline_col_min); + const int dx_lrc = std::abs(left_right_col_min - cline_col_min); + const float l_char_width = 1.5f * lines(l).char_width(); + + if (dx_lrc > l_char_width && + !may_have_another_left_link(right, right_nbh, l, lines) && + dx_lrc < 10.0f * l_char_width && + cline_col_min > rline_col_min && + cline_col_min > lline_col_min) + { + const line_id_t out_right_nbh = output(right_nbh); + + if (out_right_nbh != l) + right(l) = l; + else + output(right_nbh) = 
right_nbh; + continue; + } + } + } + +//----------------------------------------------------------------------------- +// ___________________________ +// |___________________________| +// ___________________________ +// |___________________________| +// ________________________ +// |________________________| +// ___________________________ +// |___________________________| +// +// Simple case : paragraphs are justified on the left. We try to find any +// indentation like above. +// +//----------------------------------------------------------------------------- + + { + const line_id_t ror_nbh = right(right_nbh); + const box2d& ror_line_bbox = lines(ror_nbh).bbox(); + const int rorline_col_min = ror_line_bbox.pmin().col(); + + bool right_ror_min_aligned = false; + bool left_right_aligned = false; + const int dx_lr = std::abs(lline_col_min - rline_col_min); + const int dx_rror_min = std::abs(rline_col_min - rorline_col_min); + + if (dx_rror_min < delta_alignment) + right_ror_min_aligned = true; + + if (dx_lr < delta_alignment) + left_right_aligned = true; + + if (right_ror_min_aligned && left_right_aligned && + ror_nbh != right_nbh) + { + const int left_right_col_min = std::min(lline_col_min, rline_col_min); + const int dx_lrc = std::abs(left_right_col_min - cline_col_min); + const float l_char_width = 1.5f * lines(l).char_width(); + + if (dx_lrc > l_char_width && + !may_have_another_left_link(right, right_nbh, l, lines) && + dx_lrc < 10.0f * l_char_width && + cline_col_min > rline_col_min && + cline_col_min > lline_col_min) + { + const line_id_t out_right_nbh = output(right_nbh); + + if (out_right_nbh != l) + right(l) = l; + else + output(right_nbh) = right_nbh; + continue; + } + } + } + +//----------------------------------------------------------------------------- +// ___________________________ +// |___________________________| +// ___________ +// |___________| +// ________________________ +// |________________________| +// ___________________________ +// 
|___________________________| +// +// Simple case : paragraphs are justified on the left. We try to find any +// indentation like above. +// +//----------------------------------------------------------------------------- + + { + const line_id_t ror_nbh = right(right_nbh); + const box2d& ror_line_bbox = lines(ror_nbh).bbox(); + const int rorline_col_min = ror_line_bbox.pmin().col(); + + bool left_ror_aligned = false; + const int dx_lror = std::abs(lline_col_min - rorline_col_min); + + if (dx_lror < delta_alignment) + left_ror_aligned = true; + + if (left_ror_aligned) + { + const int left_ror_col_min = std::min(lline_col_min, rorline_col_min); + const int dx_lrorc = std::abs(left_ror_col_min - cline_col_min); + const float l_char_width = 1.5f * lines(l).char_width(); + const int dx_lrorr = std::abs(left_ror_col_min - rline_col_min); + const int dx_crmax = std::abs(rline_col_max - cline_col_max); + + if (dx_lrorc > l_char_width && + dx_lrorr > 5 * l_char_width && + dx_lrorr > dx_lrorc && + dx_crmax > 5 * l_char_width && + !may_have_another_left_link(right, right_nbh, l, lines) && + dx_lrorc < 10.0f * l_char_width && + cline_col_min > rorline_col_min && + cline_col_min > lline_col_min) + { + right(right_nbh) = right_nbh; + continue; + } + } + } + + +// Strange case + { + if (rline_col_min > current_line_bbox.pcenter().col() + && !may_have_another_left_link(right, right_nbh, l, lines) + && cline_col_max < rline_col_max + && output(right_nbh) == l) + { + output(right_nbh) = right_nbh; + } + } //----------------------------------------------------------------------------- // ___________________________ @@ -490,7 +788,7 @@ namespace scribo const float l_char_width = 1.5f * lines(l).char_width(); if (dx_rrorc > l_char_width && - dx_rrorc < 3.0f * l_char_width && + dx_rrorc < 10.0f * l_char_width && cline_col_min > rline_col_min && cline_col_max >= rline_col_max) { @@ -501,14 +799,26 @@ namespace scribo } } - // Only debug // { // image2d<value::rgb8> debug = 
data::convert(value::rgb8(), input); - // for (unsigned i = 0; i < output.nelements(); ++i) - // output(i) = scribo::make::internal::find_root(output, i); + // const util::array<value::int_u16> backup = output; + // for (unsigned i = 0; i < output.nelements(); ++i) + // { + // const value::int_u16 current_neighbor = backup(i); + // output(i) = internal::find_root(output, i); + // const value::int_u16 root_index = output(i); + + // for (unsigned j = 0; j < right.nelements(); ++j) + // { + // if (i != j && + // current_neighbor != i && + // right(j) == i) + // internal::set_root(output, j, root_index); + // } + // } // mln::util::array<accu::shape::bbox<point2d> > nbbox(output.nelements()); // for_all_lines(l, lines) -- 1.5.6.5
Participants (1):
- Julien Marquegnies