last-svn-commit-924-g5454dc1 Make different algorithms for doc and hdoc use cases.

* scribo/core/paragraph_set.hh: Add an overload of make::paragraph(). * scribo/text/paragraphs.hh: Move... * scribo/text/extract_paragraphs.hh, * scribo/text/extract_paragraphs_hdoc.hh: ... here. * scribo/text/merging.hh: Remove holder() calls. * scribo/text/merging_hdoc.hh: New. * scribo/toolchain/internal/content_in_doc_functor.hh: Fix compilation. * scribo/toolchain/internal/content_in_hdoc_functor.hh: Make use of hdoc algorithms. --- scribo/ChangeLog | 21 + scribo/scribo/core/paragraph_set.hh | 29 + scribo/scribo/text/extract_paragraphs.hh | 1091 ++++++++++++++++ scribo/scribo/text/extract_paragraphs_hdoc.hh | 1327 ++++++++++++++++++++ scribo/scribo/text/merging.hh | 81 +- scribo/scribo/text/{merging.hh => merging_hdoc.hh} | 10 +- scribo/scribo/text/paragraphs.hh | 1270 ------------------- .../toolchain/internal/content_in_doc_functor.hh | 4 +- .../toolchain/internal/content_in_hdoc_functor.hh | 9 +- 9 files changed, 2510 insertions(+), 1332 deletions(-) create mode 100644 scribo/scribo/text/extract_paragraphs.hh create mode 100644 scribo/scribo/text/extract_paragraphs_hdoc.hh copy scribo/scribo/text/{merging.hh => merging_hdoc.hh} (99%) delete mode 100644 scribo/scribo/text/paragraphs.hh diff --git a/scribo/ChangeLog b/scribo/ChangeLog index 394c247..d1678c5 100644 --- a/scribo/ChangeLog +++ b/scribo/ChangeLog @@ -1,3 +1,24 @@ +2011-06-20 Guillaume Lazzara <z@lrde.epita.fr> + + Make different algorithms for doc and hdoc use cases. + + * scribo/core/paragraph_set.hh: Add an overload of + make::paragraph(). + + * scribo/text/paragraphs.hh: Move... + + * scribo/text/extract_paragraphs.hh, + * scribo/text/extract_paragraphs_hdoc.hh: ... here. + + * scribo/text/merging.hh: Remove holder() calls. + + * scribo/text/merging_hdoc.hh: New. + + * scribo/toolchain/internal/content_in_doc_functor.hh: Fix compilation. + + * scribo/toolchain/internal/content_in_hdoc_functor.hh: Make use + of hdoc algorithms. 
+ 2011-06-07 Guillaume Lazzara <z@lrde.epita.fr> Add a new toolchain for ICDAR contests. diff --git a/scribo/scribo/core/paragraph_set.hh b/scribo/scribo/core/paragraph_set.hh index 9214f20..c21359d 100644 --- a/scribo/scribo/core/paragraph_set.hh +++ b/scribo/scribo/core/paragraph_set.hh @@ -377,6 +377,35 @@ namespace scribo } + template <typename L> + scribo::paragraph_set<L> + paragraph(const line_links<L>& llinks) + { + line_links<L> links = llinks.duplicate(); + + for_all_links(l, links) + links(l) = internal::find_root(links, l); + + unsigned npars; + mln::fun::i2v::array<unsigned> + par_ids = mln::make::relabelfun(links.line_to_link(), + links.nelements() - 1, npars); + paragraph_set<L> parset(links, npars); + + const scribo::line_set<L>& lines = links.lines(); + for_all_links(l, links) + if (links(l)) + { + value::int_u16 par_id = par_ids(l); + parset(par_id).add_line(lines(l)); + } + + for_all_paragraphs(p, parset) + parset(p).force_stats_update(); + + return parset; + } + // FIXME: move that code into paragraph_set constructor? template <typename L> scribo::paragraph_set<L> diff --git a/scribo/scribo/text/extract_paragraphs.hh b/scribo/scribo/text/extract_paragraphs.hh new file mode 100644 index 0000000..e8e9ac6 --- /dev/null +++ b/scribo/scribo/text/extract_paragraphs.hh @@ -0,0 +1,1091 @@ +// Copyright (C) 2010, 2011 EPITA Research and Development Laboratory +// (LRDE) +// +// This file is part of Olena. +// +// Olena is free software: you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free +// Software Foundation, version 2 of the License. +// +// Olena is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Olena. 
If not, see <http://www.gnu.org/licenses/>. +// +// As a special exception, you may use this file as part of a free +// software project without restriction. Specifically, if other files +// instantiate templates or use macros or inline functions from this +// file, or you compile this file and link it with other files to produce +// an executable, this file does not by itself cause the resulting +// executable to be covered by the GNU General Public License. This +// exception does not however invalidate any other reasons why the +// executable file might be covered by the GNU General Public License. + +/// \file +/// +/// + + +#ifndef SCRIBO_TEXT_EXTRACT_PARAGRAPHS_HH +# define SCRIBO_TEXT_EXTRACT_PARAGRAPHS_HH + +#include <mln/util/array.hh> +#include <mln/accu/shape/bbox.hh> +#include <mln/core/image/image2d.hh> +#include <mln/core/alias/neighb2d.hh> +#include <mln/draw/box.hh> +#include <mln/data/convert.hh> +#include <mln/value/int_u16.hh> +#include <mln/value/label_16.hh> +#include <mln/value/int_u8.hh> +#include <mln/value/rgb8.hh> +#include <mln/io/ppm/save.hh> +#include <mln/io/pgm/save.hh> +#include <mln/geom/rotate.hh> +#include <mln/literal/colors.hh> + +#include <scribo/core/macros.hh> +#include <scribo/core/line_set.hh> +#include <scribo/core/line_links.hh> +#include <scribo/core/line_info.hh> +#include <scribo/core/paragraph_set.hh> + +using namespace mln; + +namespace scribo +{ + + namespace text + { + + +# ifndef MLN_INCLUDE_ONLY + + namespace internal + { + +//------------------------------------- +// Extracting root of links +//------------------------------------- + template <typename T> + inline + unsigned + find_root(mln::util::array<T>& parent, unsigned x) + { + unsigned tmp_x = x; + + while (parent(tmp_x) != tmp_x) + tmp_x = parent(tmp_x); + + while (parent(x) != x) + { + const unsigned tmp = parent(x); + x = parent(x); + parent(tmp) = tmp_x; + } + + return x; + } + + +//--------------------------------------------------------------------- 
+// This method aims to cut the links between lines that do not fit the +// different criteria +//--------------------------------------------------------------------- + + template <typename L> + inline + void paragraph_links(const line_links<L>& left, + const line_links<L>& right, + line_links<L>& output, + const line_set<L>& lines) + { + output = left.duplicate(); + + // const unsigned nlines = lines.nelements(); + + // image2d<value::rgb8> links = data::convert(value::rgb8(), input); + // for (unsigned l = 0; l < nlines; ++l) + // { + // mln::draw::line(links, lines(l).bbox().pcenter(), lines(left(l)).bbox().pcenter(), literal::red); + // } + // mln::io::ppm::save(links, "out_links.ppm"); + + // For each line + for_all_lines(l, lines) + if (lines(l).is_textline()) + { + // Neighbors + + const line_id_t left_nbh = output(l); + const line_id_t right_nbh = right(l); + const line_id_t lol_nbh = output(left_nbh); + + // Line features + const float x_height = lines(l).x_height(); + const float left_x_height = lines(left_nbh).x_height(); + const float right_x_height = lines(right_nbh).x_height(); + + const box2d& left_line_bbox = lines(left_nbh).bbox(); + const box2d& current_line_bbox = lines(l).bbox(); + const box2d& right_line_bbox = lines(right_nbh).bbox(); + const box2d& lol_line_bbox = lines(lol_nbh).bbox(); // lol : left neighbor of the left neighbor + + const int lline_col_min = left_line_bbox.pmin().col(); + const int cline_col_min = current_line_bbox.pmin().col(); + const int rline_col_min = right_line_bbox.pmin().col(); + const int lolline_col_min = lol_line_bbox.pmin().col(); + + const int lline_col_max = left_line_bbox.pmax().col(); + const int cline_col_max = current_line_bbox.pmax().col(); + const int rline_col_max = right_line_bbox.pmax().col(); + + const int lline_cw = lines(left_nbh).char_width(); + const int cline_cw = lines(l).char_width(); + const int rline_cw = lines(right_nbh).char_width(); + // Maximal x variation to consider two lines 
vertically aligned + const int delta_alignment = cline_cw; + + // Checks the baseline distances of the two neighbors + { + // Current line baseline + const int c_baseline = lines(l).baseline(); + + // Baseline distance with the left and right neighbors + const int lc_baseline = lines(left_nbh).baseline() - c_baseline; + const int rc_baseline = c_baseline -lines(right_nbh).baseline(); + + // Max baseline distance between the two neighbors + // const float delta_baseline_max = std::max(lc_baseline, rc_baseline); + // const float delta_baseline_min = std::min(lc_baseline, + // rc_baseline); + + // Only two lines, meaning the current line has only one neighbor + bool two_lines = false; + + // If the current line has no left neighbor + if (lc_baseline == 0) + { + // ror : right neighbor of the right neighbor + const line_id_t ror_nbh = right(right_nbh); + //const box2d& ror_line_bbox = lines(ror_nbh).bbox(); + + // If the current line has a ror + if (ror_nbh != right_nbh + && output(ror_nbh) == right_nbh) + { + // Distance between the current line and the right neighbor + const float right_distance = lines(l).meanline() - lines(right_nbh).baseline(); + // Distance between the right neighbor and the ror + const float ror_distance = lines(right_nbh).meanline() - lines(ror_nbh).baseline(); + // ror x_height + const float ror_x_height = lines(ror_nbh).x_height(); + + // Conditions to cut the link between the current line + // and its right neighbor + if (right_distance > 1.4f * ror_distance + && std::max(ror_x_height, right_x_height) < + 1.2f * std::min(ror_x_height, right_x_height) + && output(right_nbh) == l) + { + output(right_nbh) = right_nbh; + continue; + } + } + // Otherwise we only have a group of two lines + else + { + // We determine the distance between the two lines + const float distance = lines(l).meanline() - lines(right_nbh).baseline(); + two_lines = true; + + // If the distance between the two lines is greater than + // the minimum x height of the two lines 
then we cut the + // link between them + if (distance > 2.0f * std::min(x_height, right_x_height) + && output(right_nbh) == l) + { + output(right_nbh) = right_nbh; + continue; + } + } + + // Lines features + const float min_x_height = std::min(x_height, right_x_height); + const float max_x_height = std::max(x_height, right_x_height); + const float min_char_width = std::min(rline_cw, cline_cw); + const float max_char_width = std::max(rline_cw, cline_cw); + + // Condition to cut the link between the current line and + // its right neighbor + if ((max_x_height > min_x_height * 1.2f) && + !(max_char_width <= 1.2f * min_char_width)) + { + if (output(right_nbh) == l) + { + output(right_nbh) = right_nbh; + continue; + } + } + + // If we only have two lines we stop the study + if (two_lines) + continue; + } + // If the current line has no right neighbor + else if (rc_baseline == 0) + { + // lol : left neighbor of the left neighbor + + // If the left neighbor of the current line has a left neighbor + if (lol_nbh != left_nbh) + { + // Distance between the current line and its left neighbor + const float left_distance = lines(left_nbh).meanline() - + lines(l).baseline(); + // Distance between the left neighbor and the left + // neighbor of its left neighbor + const float lol_distance = lines(lol_nbh).meanline() - + lines(left_nbh).baseline(); + // lol x height + const float lol_x_height = lines(lol_nbh).x_height(); + + // Conditions to cut the link between the current line + // and its left neighbor + if (left_distance > 1.4f * lol_distance + && std::max(lol_x_height, left_x_height) < + 1.2f * std::min(lol_x_height, left_x_height)) + { + output(l) = l; + continue; + } + } + // Otherwise we only have a group of two lines + else + { + // Distance between the current line and it left neighbor + const float distance = lines(left_nbh).meanline() - + lines(l).baseline(); + + two_lines = true; + + // If the distance is greater than the min x height + // between the two lines + if 
(distance > 2.0f * std::min(x_height, left_x_height)) + { + output(l) = l; + continue; + } + } + + // Lines features + const float min_x_height = std::min(x_height, left_x_height); + const float max_x_height = std::max(x_height, left_x_height); + const float min_char_width = std::min(lline_cw, cline_cw); + const float max_char_width = std::max(lline_cw, cline_cw); + + // Condition to cut the link between the current line and + // its left neighbor + if ((max_x_height > min_x_height * 1.2f) && + !(max_char_width <= 1.2f * min_char_width)) + { + output(l) = l; + continue; + } + + // If we only have two lines we stop the study + if (two_lines) + continue; + } + // The current line has at least one left and one right neighbor + else // if (delta_baseline_max >= delta_baseline_min) + { + // Distance between the left and the current line + const float left_distance = + lines(left_nbh).meanline() - lines(l).baseline(); + // Distance between the right and the current line + const float right_distance = + lines(l).meanline() - lines(right_nbh).baseline(); + + // If the left line is too far compared to the right one + // we cut the link with it + if (left_distance > 1.2f * right_distance + && std::max(x_height, left_x_height) > 1.2f * std::min(x_height, left_x_height)) + { + output(l) = l; + continue; + } + // If the right line is too far compared to the left one + // we cut the link with it + else if (right_distance > 1.2f * left_distance + && std::max(x_height, right_x_height) > 1.2f * std::min(x_height, right_x_height) + && output(right_nbh) == l) + { + output(right_nbh) = right_nbh; + continue; + } + + // If the distance between the baseline of the left + // neighbor and the baseline of the current line is + // greater than the one between the current line baseline + // and the right line baseline we have to study the texte + // features of the right and left lines + if (lc_baseline > rc_baseline) + { + const float cw_max = std::max(lline_cw, cline_cw); + const float 
cw_min = std::min(lline_cw, cline_cw); + const float min_x_height = std::min(x_height, left_x_height); + const float max_x_height = std::max(x_height, left_x_height); + + if ((max_x_height > min_x_height * 1.2f) && + !(cw_max <= 1.2f * cw_min)) + { + output(l) = l; + continue; + } + + { + const float min_x_height = std::min(x_height, right_x_height); + const float max_x_height = std::max(x_height, right_x_height); + const float cw_max = std::max(rline_cw, cline_cw); + const float cw_min = std::min(rline_cw, cline_cw); + + if ((max_x_height > min_x_height * 1.2f) + && !(cw_max <= 1.2f * cw_min) + && output(right_nbh) == l) + { + output(right_nbh) = right_nbh; + continue; + } + } + } + else + { + const float cw_max = std::max(rline_cw, cline_cw); + const float cw_min = std::min(rline_cw, cline_cw); + const float min_x_height = std::min(x_height, right_x_height); + const float max_x_height = std::max(x_height, right_x_height); + + if ((max_x_height > min_x_height * 1.2f) + && !(cw_max <= 1.2f * cw_min) + && output(right_nbh) == l) + { + output(right_nbh) = right_nbh; + continue; + } + + { + const float min_x_height = std::min(x_height, left_x_height); + const float max_x_height = std::max(x_height, left_x_height); + const float cw_max = std::max(lline_cw, cline_cw); + const float cw_min = std::min(lline_cw, cline_cw); + + if ((max_x_height > min_x_height * 1.2f) + && !(cw_max <= 1.2f * cw_min)) + { + output(l) = l; + continue; + } + } + } + } + } + + // If we arrive here, it means than the lines in the + // neighborhood of the current line are quite similar. 
We can + // then begin to study the indentations in order to determine + // the beginning of new paragraphs + +//----------------------------------------------------------------------------- +// ___________________________ +// |___________________________| +// ________________________ +// |________________________| +// ___________________________ +// |___________________________| +// ___________________________ +// |___________________________| +// +// Simple case : paragraphs are justified on the left. We try to find any +// indentation like above. +// +//----------------------------------------------------------------------------- + + { + // Check if the current line neighbors are aligned + bool left_right_aligned = false; + bool left_lol_aligned = false; + const int dx_lr = std::abs(lline_col_min - rline_col_min); + const int dx_llol = std::abs(lline_col_min - lolline_col_min); + + if (dx_lr < delta_alignment) + left_right_aligned = true; + + if (dx_llol < delta_alignment) + left_lol_aligned = true; + + if (left_right_aligned && left_lol_aligned) + { + const int left_right_col_min = std::min(lline_col_min, rline_col_min); + const int dx_lrc = std::abs(left_right_col_min - cline_col_min); + const float l_char_width = 1.5f * lines(l).char_width(); + + if (dx_lrc > l_char_width && + dx_lrc < 3.0f * l_char_width && + cline_col_min > rline_col_min && + cline_col_min > lline_col_min) + { + output(right_nbh) = right_nbh; + continue; + } + } + } + +//----------------------------------------------------------------------------- +// ___________________________ +// |___________________________| +// ___________________ +// |___________________| End of the paragraph - Current line +// ________________________ +// |________________________| Beginning of a new one +// ___________________________ +// |___________________________| Left of left of current line +// +// End of paragraph case : we try to find an end to the current paragraph +// 
+//----------------------------------------------------------------------------- + + { + // Check if the current line neighbors are aligned + bool left_right_max_aligned = false; + bool left_current_min_aligned = false; + bool lol_current_min_aligned = false; + const bool lol_is_left = output(left_nbh) == left_nbh; + const int dx_lr_max = std::abs(lline_col_max - rline_col_max); + const int dx_lc_min = std::abs(lline_col_min - cline_col_min); + const int dx_lolc_min = std::abs(lolline_col_min - cline_col_min); + + if (dx_lr_max < delta_alignment) + left_right_max_aligned = true; + + if (dx_lc_min < delta_alignment) + left_current_min_aligned = true; + + if (dx_lolc_min < delta_alignment) + lol_current_min_aligned = true; + + if (!left_current_min_aligned && left_right_max_aligned && + (lol_current_min_aligned || lol_is_left)) + { + const int dx_lrc = std::abs(lline_col_max - cline_col_max); + const int l_char_width = lines(l).char_width(); + + if (dx_lrc > l_char_width && + cline_col_max < lline_col_max && + cline_col_min < lline_col_min && + (lline_col_min > lolline_col_min || lol_is_left)) + { + output(l) = l; + continue; + } + } + } + + +//----------------------------------------------------------------------------- +// ___________________________ +// |___________________________| +// ___________________________ +// |___________________________| +// ________________________ +// |________________________| +// +// Simple case : paragraphs are justified on the left. We try to find any +// indentation like above at the end of a column. 
+// +//----------------------------------------------------------------------------- + + if (left_nbh == l) + { + const line_id_t ror_nbh = right(right_nbh); + const box2d& ror_line_bbox = lines(ror_nbh).bbox(); + const int rorline_col_min = ror_line_bbox.pmin().col(); + + bool right_ror_min_aligned = false; + const int dx_rror_min = std::abs(rline_col_min - rorline_col_min); + + if (dx_rror_min < delta_alignment) + right_ror_min_aligned = true; + + if (right_ror_min_aligned) + { + const int right_ror_col_min = std::min(rline_col_min, rorline_col_min); + const int dx_rrorc = std::abs(right_ror_col_min - cline_col_min); + const float l_char_width = 1.5f * lines(l).char_width(); + + if (dx_rrorc > l_char_width && + dx_rrorc < 3.0f * l_char_width && + cline_col_min > rline_col_min && + cline_col_max >= rline_col_max) + { + output(right_nbh) = right_nbh; + continue; + } + } + } + } + + + // Only debug + + // { + // image2d<value::rgb8> debug = data::convert(value::rgb8(), input); + + // for (unsigned i = 0; i < output.nelements(); ++i) + // output(i) = scribo::make::internal::find_root(output, i); + + // mln::util::array<accu::shape::bbox<point2d> > nbbox(output.nelements()); + // for_all_lines(l, lines) + // if (lines(l).is_textline()) + // { + // // if (lines(i).is_textline()) + // // { + // // mln::draw::box(debug, lines(i).bbox(), literal::red); + // nbbox(output(l)).take(lines(l).bbox()); + // // } + // } + + // for (unsigned i = 0; i < nbbox.nelements(); ++i) + // if (nbbox(i).is_valid()) + // { + // box2d b = nbbox(i).to_result(); + // mln::draw::box(debug, b, literal::orange); + // b.enlarge(1); + // mln::draw::box(debug, b, literal::orange); + // b.enlarge(1); + // mln::draw::box(debug, b, literal::orange); + // } + + // mln::io::ppm::save(debug, "out_paragraph.ppm"); + // } + + } + +//------------------------------------------------------------- +// Preparation of the lines before linking them. +// For each line we draw the top and the bottom of it. 
+// Assuming than i is the number of the line. Then the top of the line +// will be affected with the value 2 * i in the block image and the +// bottom with 2 * i + 1. +// +//------------------------------------------------------------- + + template <typename L> + inline + void prepare_lines(const box2d& domain, + const line_set<L>& lines, + L& blocks, + mln::util::array<box2d>& rbbox) + { + std::map< int, std::vector< const box2d* > > drawn_lines; + // const unsigned nlines = lines.nelements(); + + // For each line + //for (unsigned l = 0; l < nlines; ++l) + for_all_lines(l, lines) + if (lines(l).is_textline()) + { + // Rotation of the bounding box + box2d b = geom::rotate(lines(l).bbox(), -90, domain.pcenter()); +// rbbox.append(b); + rbbox(l) = b; + + const unsigned index = l + 1; + const unsigned even_index = 2 * index; + const unsigned odd_index = even_index + 1; + + // Top of the line + { + bool not_finished = true; + int col_offset = 0; + + while (not_finished) + { + // Looking for a column in the image to draw the top of the + // line + + const int col = b.pmax().col() + col_offset; + std::map< int, std::vector< const box2d* > >::iterator it + = drawn_lines.find(col); + + if (it != drawn_lines.end()) + { + const std::vector< const box2d* >& lines = (*it).second; + const unsigned nb_lines = lines.size(); + unsigned i = 0; + + for (i = 0; i < nb_lines; ++i) + { + const box2d* box = lines[i]; + const int min_row = std::max(b.pmin().row(), box->pmin().row()); + const int max_row = std::min(b.pmax().row(), box->pmax().row()); + + if (min_row - max_row <= 0) + break; + } + + if (i == nb_lines) + { + mln::draw::line(blocks, point2d(b.pmin().row(), col), + point2d(b.pmax().row(), col), even_index); + not_finished = false; + drawn_lines[col].push_back(&(rbbox[l])); + } + else + ++col_offset; + } + else + { + mln::draw::line(blocks, point2d(b.pmin().row(), col), + point2d(b.pmax().row(), col), even_index); + not_finished = false; + 
drawn_lines[col].push_back(&(rbbox[l])); + } + } + } + + // Bottom of the line + { + bool not_finished = true; + int col_offset = 0; + + while (not_finished) + { + // Looking for a column in the image to draw the bottom of + // the line + + const int col = b.pmin().col() - col_offset; + std::map< int, std::vector< const box2d* > >::iterator it + = drawn_lines.find(col); + + if (it != drawn_lines.end()) + { + const std::vector< const box2d* >& lines = (*it).second; + const unsigned nb_lines = lines.size(); + unsigned i = 0; + + for (i = 0; i < nb_lines; ++i) + { + const box2d* box = lines[i]; + const int min_row = std::max(b.pmin().row(), box->pmin().row()); + const int max_row = std::min(b.pmax().row(), box->pmax().row()); + + if (min_row - max_row <= 0) + break; + } + + if (i == nb_lines) + { + mln::draw::line(blocks, point2d(b.pmin().row(), col), + point2d(b.pmax().row(), col), odd_index); + not_finished = false; + drawn_lines[col].push_back(&(rbbox[l])); + } + else + ++col_offset; + } + else + { + mln::draw::line(blocks, point2d(b.pmin().row(), col), + point2d(b.pmax().row(), col), odd_index); + not_finished = false; + drawn_lines[col].push_back(&(rbbox[l])); + } + } + } + } + } + + template <typename L> + inline + void + process_left_link(L& blocks, + const mln::util::array<box2d>& rbbox, + const line_set<L>& lines, + line_links<L>& left) + { + typedef scribo::def::lbl_type V; + + // At the beginning each line is its own neighbor + for_all_lines(l, lines) + if (lines(l).is_textline()) + left(l) = l; + else + left(l) = 0; + + // const unsigned nlines = lines.nelements(); + + // For each line + for_all_lines(l, lines) + if (lines(l).is_textline()) + { + // Max distance for the line search + int dmax = 1.5f * lines(l).x_height(); + + // Starting points in the current line box + point2d c = rbbox(l).pcenter(); + point2d q(rbbox(l).pmin().row() + ((c.row() - rbbox(l).pmin().row()) / 4), c.col()); + + int + midcol = (rbbox(l).pmax().col() + - rbbox(l).pmin().col()) / 
2; + + // Left + { + // Left margin + int + nleftima = c.col() - blocks.domain().pmin().col(), + // Left distance + nleft = std::min(nleftima, midcol + dmax); + + V + // Starting points in the box + *p = &blocks(c), + *p2 = &blocks(q), + // End of search + *pstop = p - nleft - 1, + // Line neighbor + *nbh_p = 0; + + // While we haven't found a neighbor or reached the limit + for (; p != pstop; --p, --p2) + { + if (*p2 != literal::zero // Not the background + && ((*p2 % 2) == 0) // Looking for the top of a line + && left((*p2 >> 1) - 1) != l) // No loops + { + // Neighbor found, we stop the search + nbh_p = p2; + break; + } + + if (*p != literal::zero // Not the background + && ((*p % 2) == 0) // Looking for the top of a line + && left((*p >> 1) - 1) != l) // No loops + { + // Neighbor found, we stop the search + nbh_p = p; + break; + } + } + + // If a neighbor was found, then we have found the top of the + // line. We are then looking for the bottom of the encountered + // line.
If during the search process we find a complete line + // included in the touched line, this line is considered as + // the neighbor under certain conditions (see below) + + //--------------------------------------------------------------- + // _________________________ | + // |_________________________| => Current line | Search direction + // v + // => First encountered top line + // __________________________________________________ 2Q + // | Q | + // | _________________________ |2P + // | |_____________P___________| => Second top |2P + 1 + // | line | + // |__________________________________________________|2Q + 1 + // + // + //--------------------------------------------------------------- + + if (nbh_p) + { + std::vector<V> lines_nbh; + const V end_p = *nbh_p + 1; + const V* nbh_p_copy = nbh_p; + + for (; *nbh_p != end_p; --nbh_p) + { + if ((*nbh_p) != literal::zero) // Not the background + { + if ((*nbh_p) % 2 == 0)// We have found the top of + // another line + lines_nbh.push_back(*nbh_p); + else + { + // We have found the bottom of a line. We are looking if + // we have already encountered the top of this + // line. 
If so, we link the current line with this one + // under certain conditions: + + if (std::find(lines_nbh.begin(), lines_nbh.end(), + (*nbh_p) - 1) != lines_nbh.end()) + { + // If we can link the complete line with the current line + if (// It must be in the search range + nbh_p > pstop + // Avoid loops + && left(((*nbh_p - 1) >> 1) - 1) != l) + left(l) = ((*nbh_p - 1) >> 1) - 1; + + // We have found a complete line so we stop the search + break; + } + } + } + } + + + // If we haven't found any included line in the first + // neighbor, then the line is considered as the neighbor of + // the current line + if (*nbh_p == end_p) + left(l) = (*nbh_p_copy >> 1) - 1; + } + } + } + } + + + // We assume that the lines have been rotated + template <typename L> + inline + void + process_right_link(L& blocks, + const mln::util::array<box2d>& rbbox, + const line_set<L>& lines, + line_links<L>& right) + { + typedef scribo::def::lbl_type V; + + // At the beginning each line is its own neighbor + for_all_lines(l, lines) + if (lines(l).is_textline()) + right(l) = l; + else + right(l) = 0; + + // const unsigned nlines = lines.nelements(); + + // For each line + for_all_lines(l, lines) + if (lines(l).is_textline()) + { + // Max distance for the line search + int dmax = 1.5f * lines(l).x_height(); + + // Starting points in the current line box + point2d c = rbbox(l).pcenter(); + point2d q(rbbox(l).pmax().row() - ((rbbox(l).pmax().row() - c.row()) / 4), c.col()); + + int + midcol = (rbbox(l).pmax().col() + - rbbox(l).pmin().col()) / 2; + + // Right + { + int + nrightima = geom::ncols(blocks) - c.col() + blocks.domain().pmin().col(), + nright = std::min(nrightima, midcol + dmax); + + V + // Starting points in the box + *p = &blocks(c), + *p2 = &blocks(q), + // End of search + *pstop = p + nright - 1, + // Line neighbor + *nbh_p = 0; + + // While we haven't found a neighbor or reached the limit + for (; p != pstop; ++p, ++p2) + { + if (*p2 != literal::zero // Not the background + && 
((*p2 % 2) == 1) // Looking for the bottom of a line + && right(((*p2 - 1) >> 1) - 1) != l) // No loops + { + // Neightbor found, we stop the research + nbh_p = p2; + break; + } + + if (*p != literal::zero // Not the background + && ((*p % 2) == 1) // Looking for the bottom of a line + && right(((*p - 1) >> 1) - 1) != l) // No loops + { + // Neightbor found, we stop the research + nbh_p = p; + break; + } + } + + // If a neighbor was found, then we have found the bottom of the + // line. We are then looking for the top of the encountered + // line. If during the search process we find a complete line + // included in the touched line, this line is considered as + // the neighbor under certain conditions (see below) + + //--------------------------------------------------------------- + // + // + // __________________________________________________ 2Q + // | Q | + // | _________________________ |2P + // | |_____________P___________| => Second bottom |2P + 1 + // | line | + // |__________________________________________________|2Q + 1 + // => First encountered bottom line + // _________________________ ^ + // |_________________________| => Current line | Search direction + // | + //--------------------------------------------------------------- + + if (nbh_p) + { + std::vector<V> lines_nbh; + const V end_p = *nbh_p - 1; + const V* nbh_p_copy = nbh_p; + + for (; *nbh_p != end_p; ++nbh_p) + { + if (*nbh_p != literal::zero) // Not the background + { + if (*nbh_p % 2 == 1) // We have found the bottom of + // another line + lines_nbh.push_back(*nbh_p); + else + { + // We have found the top of a line. We are looking if + //we have already encountered the bottom of this + // line. 
If so, we link the current line with this one + // under certain conditions: + + if (std::find(lines_nbh.begin(), lines_nbh.end(), + *nbh_p + 1) != lines_nbh.end()) + { + // If we can link the complete line with the current line + if (// It must be in the search range + nbh_p < pstop + // Avoid loops + && right((*nbh_p >> 1) - 1) != l) + right(l) = (*nbh_p >> 1) - 1; + + // We have found a complete line, so we stop the search + break; + } + } + } + } + + // If we haven't found any included line in the first + // neighbor, then the line is considered as the neighbor of + // the current line + + if (*nbh_p == end_p) + right(l) = ((*nbh_p_copy - 1) >> 1) - 1; + } + } + } + } + +//----------------------------------------------------------------------- +// Finalizing the links by merging information extracted from the left +// and right links +//----------------------------------------------------------------------- + + template< typename L > + inline + void finalize_links(line_links<L>& left, + line_links<L>& right, + const line_set<L>& lines) + { + // const unsigned nlines = lines.nelements(); + + for_all_lines(l, lines) + if (lines(l).is_textline()) + { + const unsigned left_value = left(l); + const unsigned right_value = right(l); + + // If the right neighbor of my left neighbor is itself then its + // right neighbor is me + { + line_id_t& v = right(left_value); + + if (v == left_value) + v = l; + } + + // If the left neighbor of my right neighbor is itself then its + // left neighbor is me + { + line_id_t& v = left(right_value); + + if (v == right_value) + v = l; + } + } + } + + } // end of namespace scribo::text::internal + + template <typename L> + inline + paragraph_set<L> + extract_paragraphs(line_set<L>& lines, + const image2d<bool>& input) + { + typedef scribo::def::lbl_type V; + + image2d<V> blocks(geom::rotate(input.domain(), -90, input.domain().pcenter())); + data::fill(blocks, 0); + + // util::array< line_info<L> > lines_info; + + // for_all_lines(l, 
lines) + // { + // if (lines(l).is_textline()) + // lines_info.append(lines(l)); + // } + +/// const unsigned nlines = lines_info.nelemnts(); + mln::util::array<box2d> rbbox; + line_links<L> left(lines); + left(0) = 0; + line_links<L> right(lines); + right(0) = 0; + line_links<L> output(lines); + output(0) = 0; + + rbbox.resize(lines.nelements() + 1); + + std::cout << "Preparing lines" << std::endl; + internal::prepare_lines(input.domain(), lines , blocks, rbbox); +// io::pgm::save(blocks, "blocks.pgm"); + std::cout << "Linking left" << std::endl; + internal::process_left_link(blocks, rbbox, lines , left); + std::cout << "Linking right" << std::endl; + internal::process_right_link(blocks, rbbox, lines , right); + std::cout << "Finalizing links" << std::endl; + internal::finalize_links(left, right, lines ); + // std::cout << "Finalizing merging" << std::endl; + // finalize_line_merging(left, right, lines); + std::cout << "Extracting paragraphs" << std::endl; + internal::paragraph_links(left, right, output, lines); + + paragraph_set<L> par_set = make::paragraph(output); + return par_set; + } + +# endif // ! MLN_INCLUDE_ONLY + + } // end of namespace scribo::text + +} // end of namespace scribo + +#endif // ! SCRIBO_TEXT_EXTRACT_PARAGRAPHS_HH diff --git a/scribo/scribo/text/extract_paragraphs_hdoc.hh b/scribo/scribo/text/extract_paragraphs_hdoc.hh new file mode 100644 index 0000000..57d3713 --- /dev/null +++ b/scribo/scribo/text/extract_paragraphs_hdoc.hh @@ -0,0 +1,1327 @@ +// Copyright (C) 2010, 2011 EPITA Research and Development Laboratory +// (LRDE) +// +// This file is part of Olena. +// +// Olena is free software: you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free +// Software Foundation, version 2 of the License. 
+// +// Olena is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Olena. If not, see <http://www.gnu.org/licenses/>. +// +// As a special exception, you may use this file as part of a free +// software project without restriction. Specifically, if other files +// instantiate templates or use macros or inline functions from this +// file, or you compile this file and link it with other files to produce +// an executable, this file does not by itself cause the resulting +// executable to be covered by the GNU General Public License. This +// exception does not however invalidate any other reasons why the +// executable file might be covered by the GNU General Public License. + +/// \file +/// +/// + + +#ifndef SCRIBO_TEXT_EXTRACT_PARAGRAPHS_HDOC_HH +# define SCRIBO_TEXT_EXTRACT_PARAGRAPHS_HDOC_HH + +#include <mln/util/array.hh> +#include <mln/accu/shape/bbox.hh> +#include <mln/core/image/image2d.hh> +#include <mln/core/alias/neighb2d.hh> +#include <mln/draw/box.hh> +#include <mln/data/convert.hh> +#include <mln/value/int_u16.hh> +#include <mln/value/label_16.hh> +#include <mln/value/int_u8.hh> +#include <mln/value/rgb8.hh> +#include <mln/io/ppm/save.hh> +#include <mln/io/pgm/save.hh> +#include <mln/geom/rotate.hh> +#include <mln/literal/colors.hh> + +#include <scribo/core/macros.hh> +#include <scribo/core/line_set.hh> +#include <scribo/core/line_links.hh> +#include <scribo/core/line_info.hh> +#include <scribo/core/paragraph_set.hh> + +using namespace mln; + + +namespace scribo +{ + + namespace text + { + + template <typename L> + inline + paragraph_set<L> + extract_paragraphs_hdoc(line_set<L>& lines, + const image2d<bool>& input); + + +# ifndef MLN_INCLUDE_ONLY + + namespace internal + { + + template 
<typename L> + inline + bool + between_horizontal_separator(const line_set<L>& lines, + const line_id_t& l1_, + const line_id_t& l2_) + { + const line_info<L>& l1 = lines(l1_); + const line_info<L>& l2 = lines(l2_); + + // No separators found in image. + mln_precondition(lines.components().has_separators()); + + const box2d& l1_bbox = l1.bbox(); + const box2d& l2_bbox = l2.bbox(); + + unsigned + row1 = l1_bbox.pcenter().row(), + row2 = l2_bbox.pcenter().row(); + const mln_ch_value(L, bool)& + separators = lines.components().separators(); + + unsigned row; + unsigned col_ptr; + unsigned left_col_ptr; + unsigned right_col_ptr; + unsigned end; + + if (row1 < row2) + { + row1 = l1_bbox.pmax().row(); + row2 = l2_bbox.pmin().row(); + + const unsigned quarter = + ((l1_bbox.pcenter().col() - l1_bbox.pmin().col()) >> 2); + + row = l1_bbox.pcenter().row(); + col_ptr = l1_bbox.pcenter().col(); + left_col_ptr = l1_bbox.pmin().col() + quarter; + right_col_ptr = l1_bbox.pmax().col() - quarter; + end = row2; + } + else + { + row2 = l2_bbox.pmax().row(); + row1 = l1_bbox.pmin().row(); + + const unsigned quarter = + ((l2_bbox.pcenter().col() - l2_bbox.pmin().col()) >> 2); + + row = l2_bbox.pcenter().row(); + col_ptr = l2_bbox.pcenter().col(); + left_col_ptr = l2_bbox.pmin().col() + quarter; + right_col_ptr = l2_bbox.pmax().col() - quarter; + end = row1; + } + + // If sep_ptr is true, then a separator is reached. 
+ while (row < end) + { + ++row; + if (separators.at_(row, col_ptr) + || separators.at_(row, left_col_ptr) + || separators.at_(row, right_col_ptr)) + return true; + } + + return false; + } + + + template <typename L> + bool may_have_another_left_link(const line_links<L>& right, + const line_id_t& index, + const line_id_t& current_line, + const line_set<L>& lines) + { + const line_info<L>& l = lines(current_line); + const point2d& pmin = l.bbox().pmin(); + const unsigned x1 = l.x_height(); + + for_all_links(i, right) + if (i != index && right(i) == index) + { + const line_info<L>& l_info = lines(i); + const unsigned x2 = l_info.x_height(); + + const float delta_max = 0.5f * std::min(x1, x2); + + if (l_info.bbox().pmin().col() < pmin.col() + && std::abs(l.baseline() - l_info.baseline()) < delta_max + ) + return true; + } + + return false; + } + +//--------------------------------------------------------------------- +// This method aims to cut the links between lines that do not fit the +// different criteria +//--------------------------------------------------------------------- + + template <typename L> + inline + void paragraph_links(const line_links<L>& left, + line_links<L>& right, + line_links<L>& output, + const line_set<L>& lines) + { + output = left.duplicate(); + + // const unsigned nlines = lines.nelements(); + + // image2d<value::rgb8> links = data::convert(value::rgb8(), input); + // for (unsigned l = 0; l < nlines; ++l) + // { + // mln::draw::line(links, lines(l).bbox().pcenter(), lines(left(l)).bbox().pcenter(), literal::red); + // } + // mln::io::ppm::save(links, "out_links.ppm"); + + // For each line + for_all_lines(l, lines) + if (lines(l).is_textline()) + { + // Neighbors + + line_id_t left_nbh = output(l); + line_id_t right_nbh = right(l); + line_id_t lol_nbh = output(left_nbh); + + // const line_info<L>& left_line = lines(left_nbh); + // const line_info<L>& current_line = lines(l); + // const line_info<L>& right_line = lines(right_nbh); + + if 
(lines.components().has_separators() && + between_horizontal_separator(lines, right_nbh, l)) + { + output(right_nbh) = right_nbh; + right_nbh = l; + } + if (lines.components().has_separators() && + between_horizontal_separator(lines, l, left_nbh)) + { + output(l) = l; + left_nbh = l; + lol_nbh = l; + } + + // Line features + const float x_height = lines(l).x_height(); + const float left_x_height = lines(left_nbh).x_height(); + const float right_x_height = lines(right_nbh).x_height(); + + const box2d& left_line_bbox = lines(left_nbh).bbox(); + const box2d& current_line_bbox = lines(l).bbox(); + const box2d& right_line_bbox = lines(right_nbh).bbox(); + const box2d& lol_line_bbox = lines(lol_nbh).bbox(); // lol : left neighbor of the left neighbor + + const int lline_col_min = left_line_bbox.pmin().col(); + const int cline_col_min = current_line_bbox.pmin().col(); + const int rline_col_min = right_line_bbox.pmin().col(); + const int lolline_col_min = lol_line_bbox.pmin().col(); + + const int lline_col_max = left_line_bbox.pmax().col(); + const int cline_col_max = current_line_bbox.pmax().col(); + const int rline_col_max = right_line_bbox.pmax().col(); + + const int lline_cw = lines(left_nbh).char_width(); + const int cline_cw = lines(l).char_width(); + const int rline_cw = lines(right_nbh).char_width(); + // Maximal x variation to consider two lines vertically aligned + const int delta_alignment = cline_cw; + + + // Checks the baseline distances of the two neighbors + { + // Current line baseline + const int c_baseline = lines(l).baseline(); + + // Baseline distance with the left and right neighbors + const int lc_baseline = lines(left_nbh).baseline() - c_baseline; + const int rc_baseline = c_baseline -lines(right_nbh).baseline(); + + // Max baseline distance between the two neighbors + // const float delta_baseline_max = std::max(lc_baseline, rc_baseline); + // const float delta_baseline_min = std::min(lc_baseline, + // rc_baseline); + + // Only two lines, meaning 
the current line has only one neighbor + bool two_lines = false; + + // If the current line has no left neighbor + if (lc_baseline == 0) + { + // ror : right neighbor of the right neighbor + const line_id_t ror_nbh = right(right_nbh); + //const box2d& ror_line_bbox = lines(ror_nbh).bbox(); + + // If the current line has a ror + if (ror_nbh != right_nbh + && output(ror_nbh) == right_nbh) + { + // Distance between the current line and the right neighbor + const float right_distance = lines(l).meanline() - lines(right_nbh).baseline(); + // Distance between the right neighbor and the ror + const float ror_distance = lines(right_nbh).meanline() - lines(ror_nbh).baseline(); + // ror x_height + const float ror_x_height = lines(ror_nbh).x_height(); + + // Conditions to cut the link between the current line + // and its right neighbor + if (right_distance > 1.4f * ror_distance + && std::max(ror_x_height, right_x_height) < + 1.4f * std::min(ror_x_height, right_x_height) + && output(right_nbh) == l) + { + output(right_nbh) = right_nbh; + continue; + } + } + // Otherwise we only have a group of two lines + else + { + // We determine the distance between the two lines + const float distance = lines(l).meanline() - lines(right_nbh).baseline(); + two_lines = true; + + // If the distance between the two lines is greater than + // the minimum x height of the two lines then we cut the + // link between them + if (distance > 2.0f * std::min(x_height, right_x_height) + && output(right_nbh) == l) + { + output(right_nbh) = right_nbh; + continue; + } + } + + // Lines features + const float min_x_height = std::min(x_height, right_x_height); + const float max_x_height = std::max(x_height, right_x_height); + const float min_char_width = std::min(rline_cw, cline_cw); + const float max_char_width = std::max(rline_cw, cline_cw); + + // Condition to cut the link between the current line and + // its right neighbor + if ((max_x_height > min_x_height * 1.4f) && + !(max_char_width <= 1.2f * 
min_char_width)) + { + if (output(right_nbh) == l) + { + output(right_nbh) = right_nbh; + continue; + } + } + + // If we only have two lines we stop the study + if (two_lines) + continue; + } + // If the current line has no right neighbor + else if (rc_baseline == 0) + { + // lol : left neighbor of the left neighbor + + // If the left neighbor of the current line has a left neighbor + if (lol_nbh != left_nbh) + { + // Distance between the current line and its left neighbor + const float left_distance = lines(left_nbh).meanline() - + lines(l).baseline(); + // Distance between the left neighbor and the left + // neighbor of its left neighbor + const float lol_distance = lines(lol_nbh).meanline() - + lines(left_nbh).baseline(); + // lol x height + const float lol_x_height = lines(lol_nbh).x_height(); + + // Conditions to cut the link between the current line + // and its left neighbor + if (left_distance > 1.4f * lol_distance + && std::max(lol_x_height, left_x_height) < + 1.4f * std::min(lol_x_height, left_x_height)) + { + output(l) = l; + continue; + } + } + // Otherwise we only have a group of two lines + else + { + // Distance between the current line and it left neighbor + const float distance = lines(left_nbh).meanline() - + lines(l).baseline(); + + two_lines = true; + + // If the distance is greater than the min x height + // between the two lines + if (distance > 2.0f * std::min(x_height, left_x_height)) + { + output(l) = l; + continue; + } + } + + // Lines features + const float min_x_height = std::min(x_height, left_x_height); + const float max_x_height = std::max(x_height, left_x_height); + const float min_char_width = std::min(lline_cw, cline_cw); + const float max_char_width = std::max(lline_cw, cline_cw); + + // Condition to cut the link between the current line and + // its left neighbor + if ((max_x_height > min_x_height * 1.4f) && + !(max_char_width <= 1.2f * min_char_width)) + { + output(l) = l; + continue; + } + + // If we only have two lines we stop 
the study + if (two_lines) + continue; + } + // The current line has at least one left and one right neighbor + else // if (delta_baseline_max >= 1.1 * delta_baseline_min) + { + // Distance between the left and the current line + const float + left_distance = left_line_bbox.pcenter().row() - current_line_bbox.pcenter().row(); + // Distance between the right and the current line + const float + right_distance = current_line_bbox.pcenter().row() - right_line_bbox.pcenter().row();; + + // If the left line is too far compared to the right one + // we cut the link with it + if (left_distance > 1.5f * right_distance + && std::max(x_height, left_x_height) > 1.2f * std::min(x_height, left_x_height)) + { + output(l) = l; + continue; + } + // If the right line is too far compared to the left one + // we cut the link with it + else if (right_distance > 1.5f * left_distance + && std::max(x_height, right_x_height) >= 1.2f * std::min(x_height, right_x_height) + && output(right_nbh) == l) + { + output(right_nbh) = right_nbh; + continue; + } + + // If the distance between the baseline of the left + // neighbor and the baseline of the current line is + // greater than the one between the current line baseline + // and the right line baseline we have to study the texte + // features of the right and left lines + if (lc_baseline > rc_baseline) + { + const float cw_max = std::max(lline_cw, cline_cw); + const float cw_min = std::min(lline_cw, cline_cw); + const float min_x_height = std::min(x_height, left_x_height); + const float max_x_height = std::max(x_height, left_x_height); + + if ((max_x_height > min_x_height * 1.4f) && + !(cw_max <= 1.2f * cw_min)) + { + output(l) = l; + continue; + } + + { + const float min_x_height = std::min(x_height, right_x_height); + const float max_x_height = std::max(x_height, right_x_height); + const float cw_max = std::max(rline_cw, cline_cw); + const float cw_min = std::min(rline_cw, cline_cw); + + if ((max_x_height > min_x_height * 1.4f) + && 
!(cw_max <= 1.2f * cw_min) + && output(right_nbh) == l) + { + output(right_nbh) = right_nbh; + continue; + } + } + } + else + { + const float cw_max = std::max(rline_cw, cline_cw); + const float cw_min = std::min(rline_cw, cline_cw); + const float min_x_height = std::min(x_height, right_x_height); + const float max_x_height = std::max(x_height, right_x_height); + + if ((max_x_height > min_x_height * 1.4f) + && !(cw_max <= 1.2f * cw_min) + && output(right_nbh) == l) + { + output(right_nbh) = right_nbh; + continue; + } + + { + const float min_x_height = std::min(x_height, left_x_height); + const float max_x_height = std::max(x_height, left_x_height); + const float cw_max = std::max(lline_cw, cline_cw); + const float cw_min = std::min(lline_cw, cline_cw); + + if ((max_x_height > min_x_height * 1.4f) + && !(cw_max <= 1.2f * cw_min)) + { + output(l) = l; + continue; + } + } + } + } + } + + // If we arrive here, it means than the lines in the + // neighborhood of the current line are quite similar. We can + // then begin to study the indentations in order to determine + // the beginning of new paragraphs + +//----------------------------------------------------------------------------- +// ___________________________ +// |___________________________| +// ________________________ +// |________________________| +// ___________________________ +// |___________________________| +// ___________________________ +// |___________________________| +// +// Simple case : paragraphs are justified on the left. We try to find any +// indentation like above. 
+// +//----------------------------------------------------------------------------- + + { + // Check if the current line neighbors are aligned + bool left_right_aligned = false; + bool left_lol_aligned = false; + const int dx_lr = std::abs(lline_col_min - rline_col_min); + const int dx_llol = std::abs(lline_col_min - lolline_col_min); + + if (dx_lr < delta_alignment) + left_right_aligned = true; + + if (dx_llol < delta_alignment) + left_lol_aligned = true; + + if (left_right_aligned && left_lol_aligned) + { + const int left_right_col_min = std::min(lline_col_min, rline_col_min); + const int dx_lrc = std::abs(left_right_col_min - cline_col_min); + const float l_char_width = 1.5f * lines(l).char_width(); + + if (dx_lrc > l_char_width && + dx_lrc < 3.0f * l_char_width && + cline_col_min > rline_col_min && + cline_col_min > lline_col_min) + { + const line_id_t out_right_nbh = output(right_nbh); + + if (out_right_nbh != l) + { + if (output(out_right_nbh) == l) + output(out_right_nbh) = out_right_nbh; + right(l) = l; + } + else + output(right_nbh) = right_nbh; + continue; + } + } + } + +//----------------------------------------------------------------------------- +// ___________________________ +// |___________________________| +// ___________________ +// |___________________| End of the paragraph - Current line +// ________________________ +// |________________________| Beginning of a new one +// ___________________________ +// |___________________________| Left of left of current line +// +// End of paragraph case : we try to find an end to the current paragraph +// +//----------------------------------------------------------------------------- + + { + // Check if the current line neighbors are aligned + bool left_right_max_aligned = false; + bool left_current_min_aligned = false; + bool lol_current_min_aligned = false; + const bool lol_is_left = output(left_nbh) == left_nbh; + const int dx_lr_max = std::abs(lline_col_max - rline_col_max); + const int dx_lc_min = 
std::abs(lline_col_min - cline_col_min); + const int dx_lolc_min = std::abs(lolline_col_min - cline_col_min); + + if (dx_lr_max < delta_alignment) + left_right_max_aligned = true; + + if (dx_lc_min < delta_alignment) + left_current_min_aligned = true; + + if (dx_lolc_min < delta_alignment) + lol_current_min_aligned = true; + + if (!left_current_min_aligned && left_right_max_aligned && + (lol_current_min_aligned || lol_is_left)) + { + const int dx_lrc = std::abs(lline_col_max - cline_col_max); + const int l_char_width = lines(l).char_width(); + const int dx_indent = std::abs(std::max(lline_col_min, + rline_col_min) - cline_col_min); + + if (dx_lrc > l_char_width && + dx_indent < 4 * delta_alignment && + cline_col_max < lline_col_max && + cline_col_min < lline_col_min && + (lline_col_min > lolline_col_min || lol_is_left)) + { + output(l) = l; + continue; + } + } + } + +//----------------------------------------------------------------------------- +// ___________________________ +// |___________________________| +// ___________________________ +// |___________________________| +// ________________________ +// |________________________| +// ___________________________ +// |___________________________| +// +// Simple case : paragraphs are justified on the left. We try to find any +// indentation like above. 
+// +//----------------------------------------------------------------------------- + + { + const line_id_t ror_nbh = right(right_nbh); + const box2d& ror_line_bbox = lines(ror_nbh).bbox(); + const int rorline_col_min = ror_line_bbox.pmin().col(); + + bool right_ror_min_aligned = false; + bool left_right_aligned = false; + const int dx_lr = std::abs(lline_col_min - rline_col_min); + const int dx_rror_min = std::abs(rline_col_min - rorline_col_min); + + if (dx_rror_min < delta_alignment) + right_ror_min_aligned = true; + + if (dx_lr < delta_alignment) + left_right_aligned = true; + + if (right_ror_min_aligned && left_right_aligned && + ror_nbh != right_nbh) + { + const int left_right_col_min = std::min(lline_col_min, rline_col_min); + const int dx_lrc = std::abs(left_right_col_min - cline_col_min); + const float l_char_width = 1.5f * lines(l).char_width(); + + if (dx_lrc > l_char_width && + !may_have_another_left_link(right, right_nbh, l, lines) && + dx_lrc < 10.0f * l_char_width && + cline_col_min > rline_col_min && + cline_col_min > lline_col_min) + { + const line_id_t out_right_nbh = output(right_nbh); + + if (out_right_nbh != l) + { + if (output(out_right_nbh) == l) + output(out_right_nbh) = out_right_nbh; + right(l) = l; + } + else + output(right_nbh) = right_nbh; + continue; + } + } + } + +//----------------------------------------------------------------------------- +// ___________________________ +// |___________________________| +// ___________ +// |___________| +// ________________________ +// |________________________| +// ___________________________ +// |___________________________| +// +// Simple case : paragraphs are justified on the left. We try to find any +// indentation like above. 
+// +//----------------------------------------------------------------------------- + + { + const line_id_t ror_nbh = right(right_nbh); + const box2d& ror_line_bbox = lines(ror_nbh).bbox(); + const int rorline_col_min = ror_line_bbox.pmin().col(); + + bool left_ror_aligned = false; + const int dx_lror = std::abs(lline_col_min - rorline_col_min); + + if (dx_lror < delta_alignment) + left_ror_aligned = true; + + if (left_ror_aligned) + { + const int left_ror_col_min = std::min(lline_col_min, rorline_col_min); + const int dx_lrorc = std::abs(left_ror_col_min - cline_col_min); + const float l_char_width = 1.5f * lines(l).char_width(); + const int dx_lrorr = std::abs(left_ror_col_min - rline_col_min); + const int dx_crmax = std::abs(rline_col_max - cline_col_max); + + if (dx_lrorc > l_char_width && + dx_lrorr > 5 * l_char_width && + dx_lrorr > dx_lrorc && + dx_crmax > 5 * l_char_width && + !may_have_another_left_link(right, right_nbh, l, lines) && + dx_lrorc < 10.0f * l_char_width && + cline_col_min > rorline_col_min && + cline_col_min > lline_col_min) + { + right(right_nbh) = right_nbh; + continue; + } + } + } + + +// Strange case + { + if (rline_col_min > current_line_bbox.pcenter().col() + && !may_have_another_left_link(right, right_nbh, l, lines) + && cline_col_max < rline_col_max + && output(right_nbh) == l) + { + output(right_nbh) = right_nbh; + } + } + +//----------------------------------------------------------------------------- +// ___________________________ +// |___________________________| +// ___________________________ +// |___________________________| +// ________________________ +// |________________________| +// +// Simple case : paragraphs are justified on the left. We try to find any +// indentation like above at the end of a column. 
+// +//----------------------------------------------------------------------------- + + if (left_nbh == l) + { + const line_id_t ror_nbh = right(right_nbh); + const box2d& ror_line_bbox = lines(ror_nbh).bbox(); + const int rorline_col_min = ror_line_bbox.pmin().col(); + + bool right_ror_min_aligned = false; + const int dx_rror_min = std::abs(rline_col_min - rorline_col_min); + + if (dx_rror_min < delta_alignment) + right_ror_min_aligned = true; + + if (right_ror_min_aligned) + { + const int right_ror_col_min = std::min(rline_col_min, rorline_col_min); + const int dx_rrorc = std::abs(right_ror_col_min - cline_col_min); + const float l_char_width = 1.5f * lines(l).char_width(); + + if (dx_rrorc > l_char_width && + dx_rrorc < 10.0f * l_char_width && + cline_col_min > rline_col_min && + cline_col_max >= rline_col_max) + { + const line_id_t out_right_nbh = output(right_nbh); + + if (out_right_nbh != l) + { + if (output(out_right_nbh) == l) + output(out_right_nbh) = out_right_nbh; + right(l) = l; + } + else + output(right_nbh) = right_nbh; + continue; + } + } + } + } + + // Only debug + // { + // image2d<value::rgb8> debug = data::convert(value::rgb8(), input); + + // mln::util::array<accu::shape::bbox<point2d> > nbbox(output.nelements()); + // for_all_lines(l, lines) + // if (lines(l).is_textline()) + // { + // // if (lines(i).is_textline()) + // // { + // // mln::draw::box(debug, lines(i).bbox(), literal::red); + // nbbox(output(l)).take(lines(l).bbox()); + // // } + // } + + // for (unsigned i = 0; i < nbbox.nelements(); ++i) + // if (nbbox(i).is_valid()) + // { + // box2d b = nbbox(i).to_result(); + // mln::draw::box(debug, b, literal::orange); + // b.enlarge(1); + // mln::draw::box(debug, b, literal::orange); + // b.enlarge(1); + // mln::draw::box(debug, b, literal::orange); + // } + + // mln::io::ppm::save(debug, "out_paragraph.ppm"); + // } + + } + +//------------------------------------------------------------- +// Preparation of the lines before linking them. 
+// For each line we draw the top and the bottom of it. +// Assuming than i is the number of the line. Then the top of the line +// will be affected with the value 2 * i in the block image and the +// bottom with 2 * i + 1. +// +//------------------------------------------------------------- + + template <typename L> + inline + void prepare_lines(const box2d& domain, + const line_set<L>& lines, + L& blocks, + mln::util::array<box2d>& rbbox) + { + std::map< int, std::vector< const box2d* > > drawn_lines; + // const unsigned nlines = lines.nelements(); + + // For each line + //for (unsigned l = 0; l < nlines; ++l) + for_all_lines(l, lines) + if (lines(l).is_textline()) + { + // Rotation of the bounding box + box2d b = geom::rotate(lines(l).bbox(), -90, domain.pcenter()); +// rbbox.append(b); + rbbox(l) = b; + + const unsigned index = l + 1; + const unsigned even_index = 2 * index; + const unsigned odd_index = even_index + 1; + + // Top of the line + { + bool not_finished = true; + int col_offset = 0; + + while (not_finished) + { + // Looking for a column in the image to draw the top of the + // line + + const int col = b.pmax().col() + col_offset; + std::map< int, std::vector< const box2d* > >::iterator it + = drawn_lines.find(col); + + if (it != drawn_lines.end()) + { + const std::vector< const box2d* >& lines = (*it).second; + const unsigned nb_lines = lines.size(); + unsigned i = 0; + + for (i = 0; i < nb_lines; ++i) + { + const box2d* box = lines[i]; + const int min_row = std::max(b.pmin().row(), box->pmin().row()); + const int max_row = std::min(b.pmax().row(), box->pmax().row()); + + if (min_row - max_row <= 0) + break; + } + + if (i == nb_lines) + { + mln::draw::line(blocks, point2d(b.pmin().row(), col), + point2d(b.pmax().row(), col), even_index); + not_finished = false; + drawn_lines[col].push_back(&(rbbox[l])); + } + else + ++col_offset; + } + else + { + mln::draw::line(blocks, point2d(b.pmin().row(), col), + point2d(b.pmax().row(), col), even_index); + 
not_finished = false; + drawn_lines[col].push_back(&(rbbox[l])); + } + } + } + + // Bottom of the line + { + bool not_finished = true; + int col_offset = 0; + + while (not_finished) + { + // Looking for a column in the image to draw the bottom of + // the line + + const int col = b.pmin().col() - col_offset; + std::map< int, std::vector< const box2d* > >::iterator it + = drawn_lines.find(col); + + if (it != drawn_lines.end()) + { + const std::vector< const box2d* >& lines = (*it).second; + const unsigned nb_lines = lines.size(); + unsigned i = 0; + + for (i = 0; i < nb_lines; ++i) + { + const box2d* box = lines[i]; + const int min_row = std::max(b.pmin().row(), box->pmin().row()); + const int max_row = std::min(b.pmax().row(), box->pmax().row()); + + if (min_row - max_row <= 0) + break; + } + + if (i == nb_lines) + { + mln::draw::line(blocks, point2d(b.pmin().row(), col), + point2d(b.pmax().row(), col), odd_index); + not_finished = false; + drawn_lines[col].push_back(&(rbbox[l])); + } + else + ++col_offset; + } + else + { + mln::draw::line(blocks, point2d(b.pmin().row(), col), + point2d(b.pmax().row(), col), odd_index); + not_finished = false; + drawn_lines[col].push_back(&(rbbox[l])); + } + } + } + } + } + + template <typename L> + inline + void + process_left_link(L& blocks, + const mln::util::array<box2d>& rbbox, + const line_set<L>& lines, + line_links<L>& left) + { + typedef scribo::def::lbl_type V; + + // At the beginning each line is its own neighbor + for_all_lines(l, lines) + if (lines(l).is_textline()) + left(l) = l; + else + left(l) = 0; + + // const unsigned nlines = lines.nelements(); + + // For each line + for_all_lines(l, lines) + if (lines(l).is_textline()) + { + // Max distance for the line search + int dmax = 1.5f * lines(l).x_height(); + + // Starting points in the current line box + point2d c = rbbox(l).pcenter(); + point2d q(rbbox(l).pmin().row() + ((c.row() - rbbox(l).pmin().row()) / 4), c.col()); + + int + midcol = (rbbox(l).pmax().col() + - 
rbbox(l).pmin().col()) / 2; + + // Left + { + // marge gauche + int + nleftima = c.col() - blocks.domain().pmin().col(), + // Distance gauche + nleft = std::min(nleftima, midcol + dmax); + + V + // Starting points in the box + *p = &blocks(c), + *p2 = &blocks(q), + // End of search + *pstop = p - nleft - 1, + // Line neighbor + *nbh_p = 0; + + // While we haven't found a neighbor or reached the limit + for (; p != pstop; --p, --p2) + { + if (*p2 != literal::zero // Not the background + && ((*p2 % 2) == 0) // Looking for the bottom of a line + && left((*p2 >> 1) - 1) != l) // No loops + { + // Neightbor found, we stop the research + nbh_p = p2; + break; + } + + if (*p != literal::zero // Not the background + && ((*p % 2) == 0) // Looking for the bottom of a line + && left((*p >> 1) - 1) != l) // No loops + { + // Neightbor found, we stop the research + nbh_p = p; + break; + } + } + + // If a neighbor was found, then we have found the top of the + // line. We are then looking for the bottom of the encountered + // line. 
If during the search process we find a complete line + // included in the touched line, this line is considered as + // the neighbor under certain conditions (see below) + + //--------------------------------------------------------------- + // _________________________ | + // |_________________________| => Current line | Search direction + // v + // => First encountered top line + // __________________________________________________ 2Q + // | Q | + // | _________________________ |2P + // | |_____________P___________| => Second top |2P + 1 + // | line | + // |__________________________________________________|2Q + 1 + // + // + //--------------------------------------------------------------- + + if (nbh_p) + { + std::vector<V> lines_nbh; + const V end_p = *nbh_p + 1; + const V* nbh_p_copy = nbh_p; + + for (; *nbh_p != end_p; --nbh_p) + { + if ((*nbh_p) != literal::zero) // Not the background + { + if ((*nbh_p) % 2 == 0)// We have found the top of + // another line + lines_nbh.push_back(*nbh_p); + else + { + // We have found the bottom of a line. We are looking if + // we have already encountered the top of this + // line. 
If so, we link the current line with this one + // under certain conditions: + + if (std::find(lines_nbh.begin(), lines_nbh.end(), + (*nbh_p) - 1) != lines_nbh.end()) + { + // If we can link the complete line with the current line + if (// It must be in the search range + nbh_p > pstop + // Avoid loops + && left(((*nbh_p - 1) >> 1) - 1) != l) + left(l) = ((*nbh_p - 1) >> 1) - 1; + + // We have found a complete line so we stop the search + break; + } + } + } + } + + + // If we haven't found any included line in the first + // neighbor, then the line is considered as the neighbor of + // the current line + if (*nbh_p == end_p) + left(l) = (*nbh_p_copy >> 1) - 1; + } + } + } + } + + + // We assume that the lines have been rotated + template <typename L> + inline + void + process_right_link(L& blocks, + const mln::util::array<box2d>& rbbox, + const line_set<L>& lines, + line_links<L>& right) + { + typedef scribo::def::lbl_type V; + + // At the beginning each line is its own neighbor + for_all_lines(l, lines) + if (lines(l).is_textline()) + right(l) = l; + else + right(l) = 0; + + // const unsigned nlines = lines.nelements(); + + // For each line + for_all_lines(l, lines) + if (lines(l).is_textline()) + { + // Max distance for the line search + int dmax = 1.5f * lines(l).x_height(); + + // Starting points in the current line box + point2d c = rbbox(l).pcenter(); + point2d q(rbbox(l).pmax().row() - ((rbbox(l).pmax().row() - c.row()) / 4), c.col()); + + int + midcol = (rbbox(l).pmax().col() + - rbbox(l).pmin().col()) / 2; + + // Right + { + int + nrightima = geom::ncols(blocks) - c.col() + blocks.domain().pmin().col(), + nright = std::min(nrightima, midcol + dmax); + + V + // Starting points in the box + *p = &blocks(c), + *p2 = &blocks(q), + // End of search + *pstop = p + nright - 1, + // Line neighbor + *nbh_p = 0; + + // While we haven't found a neighbor or reached the limit + for (; p != pstop; ++p, ++p2) + { + if (*p2 != literal::zero // Not the background + && 
((*p2 % 2) == 1) // Looking for the bottom of a line + && right(((*p2 - 1) >> 1) - 1) != l) // No loops + { + // Neighbor found, we stop the search + nbh_p = p2; + break; + } + + if (*p != literal::zero // Not the background + && ((*p % 2) == 1) // Looking for the bottom of a line + && right(((*p - 1) >> 1) - 1) != l) // No loops + { + // Neighbor found, we stop the search + nbh_p = p; + break; + } + } + + // If a neighbor was found, then we have found the bottom of the + // line. We are then looking for the top of the encountered + // line. If during the search process we find a complete line + // included in the touched line, this line is considered as + // the neighbor under certain conditions (see below) + + //--------------------------------------------------------------- + // + // + // __________________________________________________ 2Q + // | Q | + // | _________________________ |2P + // | |_____________P___________| => Second bottom |2P + 1 + // | line | + // |__________________________________________________|2Q + 1 + // => First encountered bottom line + // _________________________ ^ + // |_________________________| => Current line | Search direction + // | + //--------------------------------------------------------------- + + if (nbh_p) + { + std::vector<V> lines_nbh; + const V end_p = *nbh_p - 1; + const V* nbh_p_copy = nbh_p; + + for (; *nbh_p != end_p; ++nbh_p) + { + if (*nbh_p != literal::zero) // Not the background + { + if (*nbh_p % 2 == 1) // We have found the bottom of + // another line + lines_nbh.push_back(*nbh_p); + else + { + // We have found the top of a line. We are looking if + // we have already encountered the bottom of this + // line. 
If so, we link the current line with this one + // under certain conditions: + + if (std::find(lines_nbh.begin(), lines_nbh.end(), + *nbh_p + 1) != lines_nbh.end()) + { + // If we can link the complete line with the current line + if (// It must be in the search range + nbh_p < pstop + // Avoid loops + && right((*nbh_p >> 1) - 1) != l) + right(l) = (*nbh_p >> 1) - 1; + + // We have found a complete line, so we stop the search + break; + } + } + } + } + + // If we haven't found any included line in the first + // neighbor, then the line is considered as the neighbor of + // the current line + + if (*nbh_p == end_p) + right(l) = ((*nbh_p_copy - 1) >> 1) - 1; + } + } + } + } + +//----------------------------------------------------------------------- +// Finalizing the links by merging information extracted from the left +// and right links +//----------------------------------------------------------------------- + + template< typename L > + inline + void finalize_links(line_links<L>& left, + line_links<L>& right, + const line_set<L>& lines) + { + // const unsigned nlines = lines.nelements(); + + for_all_lines(l, lines) + if (lines(l).is_textline()) + { + const unsigned left_value = left(l); + const unsigned right_value = right(l); + + // If the right neighbor of my left neighbor is itself then its + // right neighbor is me + { + line_id_t& v = right(left_value); + + if (v == left_value) + v = l; + } + + // If the left neighbor of my right neighbor is itself then its + // left neighbor is me + { + line_id_t& v = left(right_value); + + if (v == right_value) + v = l; + } + } + } + + } // end of namespace scribo::text::internal + + + template <typename L> + inline + paragraph_set<L> + extract_paragraphs_hdoc(line_set<L>& lines, + const image2d<bool>& input) + { + typedef scribo::def::lbl_type V; + + image2d<V> blocks(geom::rotate(input.domain(), -90, input.domain().pcenter())); + data::fill(blocks, 0); + + // util::array< line_info<L> > lines_info; + + // 
for_all_lines(l, lines) + // { + // if (lines(l).is_textline()) + // lines_info.append(lines(l)); + // } + +/// const unsigned nlines = lines_info.nelemnts(); + mln::util::array<box2d> rbbox; + line_links<L> left(lines); + left(0) = 0; + line_links<L> right(lines); + right(0) = 0; + line_links<L> output(lines); + output(0) = 0; + + rbbox.resize(lines.nelements() + 1); + +// std::cout << "Preparing lines" << std::endl; + prepare_lines(input.domain(), lines , blocks, rbbox); +// io::pgm::save(blocks, "blocks.pgm"); +// std::cout << "Linking left" << std::endl; + process_left_link(blocks, rbbox, lines , left); +// std::cout << "Linking right" << std::endl; + process_right_link(blocks, rbbox, lines , right); +// std::cout << "Finalizing links" << std::endl; + finalize_links(left, right, lines ); + // std::cout << "Finalizing merging" << std::endl; + // finalize_line_merging(left, right, lines); +// std::cout << "Extracting paragraphs" << std::endl; + filter::paragraph_links(left, right, output, lines); + + paragraph_set<L> par_set = make::paragraph(output, right); + return par_set; + } + +# endif // ! MLN_INCLUDE_ONLY + + } // end of namespace scribo::text + +} // end of namespace scribo + +#endif // ! SCRIBO_TEXT_EXTRACT_PARAGRAPHS_HDOC_HH diff --git a/scribo/scribo/text/merging.hh b/scribo/scribo/text/merging.hh index 31a5ed4..dd5762a 100644 --- a/scribo/scribo/text/merging.hh +++ b/scribo/scribo/text/merging.hh @@ -1,5 +1,4 @@ -// Copyright (C) 2010, 2011 EPITA Research and Development Laboratory -// (LRDE) +// Copyright (C) 2010 EPITA Research and Development Laboratory (LRDE) // // This file is part of Olena. // @@ -181,7 +180,10 @@ namespace scribo l1 = my_find_root(parent, l1); l2 = my_find_root(parent, l2); if (l1 == l2) - return l1; + { + std::cerr << "what! in'do_union': already merged!!!" << std::endl; + return l1; + } swap_ordering(l1, l2); parent[l2] = l1; // The smallest label value is root. 
@@ -224,16 +226,16 @@ namespace scribo template <typename L> - bool between_separators(const scribo::line_set<L>& lines, + bool between_separators(const line_set<L>& lines, const line_id_t& l1_, const line_id_t& l2_) { - const scribo::line_info<L>& l1 = lines(l1_); - const scribo::line_info<L>& l2 = lines(l2_); - // No separators found in image. mln_precondition(lines.components().has_separators()); + const scribo::line_info<L>& l1 = lines(l1_); + const scribo::line_info<L>& l2 = lines(l2_); + const box2d& l1_bbox = l1.bbox(); const box2d& l2_bbox = l2.bbox(); @@ -295,11 +297,11 @@ namespace scribo */ template <typename L> bool lines_can_merge(scribo::line_set<L>& lines, - const scribo::line_id_t& l1_, - const scribo::line_id_t& l2_) + const line_id_t& l1_, + const line_id_t& l2_) { scribo::line_info<L>& l1 = lines(l1_); - scribo::line_info<L>& l2 = lines(l2_); + const scribo::line_info<L>& l2 = lines(l2_); // Parameters. const float x_ratio_max = 1.7f; @@ -432,12 +434,13 @@ namespace scribo */ template <typename L> - bool non_text_and_text_can_merge(scribo::line_set<L>& lines, - const scribo::line_id_t& l_cur_, // current - const scribo::line_id_t l_ted_) // touched + bool non_text_and_text_can_merge(line_set<L>& lines, + const line_id_t& l_cur_, // current + const line_id_t& l_ted_) // touched { scribo::line_info<L>& l_cur = lines(l_cur_); - scribo::line_info<L>& l_ted = lines(l_ted_); + const scribo::line_info<L>& l_ted = lines(l_ted_); + if (l_cur.type() == line::Text || l_ted.type() != line::Text) return false; @@ -470,8 +473,8 @@ namespace scribo if (l_cur_height < l_ted_x_height && l_cur_height > 0.05f * l_ted_x_height && float(l_cur_width) / float(l_cur.card()) < l_ted.char_width() - && dx < 2 * l_ted_cw - && l_cur_pmin.row() < l_ted.baseline()) + && dx < l_ted_cw + && l_cur_pmin.row() < l_ted_pmax.row()) { l_cur.update_type(line::Punctuation); return true; @@ -634,10 +637,8 @@ namespace scribo image2d<unsigned> billboard(domain); data::fill(billboard, 0); 
-# ifndef SCRIBO_NDEBUG image2d<value::int_u8> log(domain); data::fill(log, 0); -# endif // ! SCRIBO_NDEBUG const unsigned n = v.size(); unsigned l_; @@ -669,7 +670,7 @@ namespace scribo x---------------x | | | mc | - ml x x x mr + ml x x x mr | | | | x---------------x @@ -752,12 +753,11 @@ namespace scribo // vertically aligned // Obviously no separators between the two lines if ((l_info.card() <= 5 || - (std::abs(l_info.baseline() - mc_info.baseline()) - < 5 && std::abs(l_info.meanline() - - mc_info.meanline()) < 5)) - && dx < l_ted_cw && dy < 0 - && not (lines.components().has_separators() - && between_separators(lines, l, mc))) + (std::abs(l_info.baseline() - mc_info.baseline()) < 5 + && std::abs(l_info.meanline() - mc_info.meanline()) < 5)) + && dx < l_ted_cw && dy < 0 + && not (lines.components().has_separators() + && between_separators(lines, l, mc))) l = do_union(lines, l, mc, parent); // } @@ -776,10 +776,9 @@ namespace scribo // l_ = do_union(lines, mc, l, parent); // draw_box(billboard, lines(l_).ebbox(), l_); -# ifndef SCRIBO_NDEBUG // Log: draw_box(log, b, 126); -# endif // ! SCRIBO_NDEBUG + } else // FIXME: Remove! since included in a non-text-line, so not drawn, so inclusion impossible!!!!!!!!!! @@ -789,11 +788,8 @@ namespace scribo // a non-text-line (probably a drawing or a frame) includes a text line draw_box(billboard, lines(l).ebbox(), l); - -# ifndef SCRIBO_NDEBUG // Log: draw_box(log, b, 100); -# endif // ! SCRIBO_NDEBUG } } @@ -832,10 +828,8 @@ namespace scribo // it may change of label (take the one of the included line). draw_box(billboard, lines(l_).ebbox(), l_); -# ifndef SCRIBO_NDEBUG // Log: draw_box(log, b, 128); -# endif // ! SCRIBO_NDEBUG } } } @@ -848,15 +842,11 @@ namespace scribo { ++count_new_txtline; draw_box(billboard, lines(l).ebbox(), l); -# ifndef SCRIBO_NDEBUG // Log: draw_box(log, b, 127); -# endif // ! SCRIBO_NDEBUG } -# ifndef SCRIBO_NDEBUG else - draw_box(log, b, 1); -# endif // ! 
SCRIBO_NDEBUG + draw_box(log, b, 1); } } else @@ -873,8 +863,8 @@ namespace scribo if (lcand == 0) // Skip background. continue; - // if (lines(lcand).type() != line::Text) - // std::cerr << "again!" << std::endl; + if (lines(lcand).type() != line::Text) + std::cerr << "again!" << std::endl; if (lines(l_).type() == line::Text) @@ -886,22 +876,15 @@ namespace scribo l_ = do_union(lines, l_, lcand, parent); draw_box(billboard, lines(l_).ebbox(), l_); - -# ifndef SCRIBO_NDEBUG // Log: draw_box(log, b, 151); -# endif // ! SCRIBO_NDEBUG - continue; } else { ++count_WTF; - -# ifndef SCRIBO_NDEBUG // Log: draw_box(log, b, 255); -# endif // ! SCRIBO_NDEBUG // (*) SEE BELOW draw_box(billboard, lines(l_).ebbox(), l_); @@ -918,20 +901,15 @@ namespace scribo l_ = do_union(lines, l_, lcand, parent); draw_box(billboard, lines(l_).ebbox(), l_); -# ifndef SCRIBO_NDEBUG // Log: draw_box(log, b, 169); -# endif // ! SCRIBO_NDEBUG - continue; } -# ifndef SCRIBO_NDEBUG else { // Log: draw_box(log, b, 254); } -# endif // ! SCRIBO_NDEBUG } @@ -1081,6 +1059,7 @@ namespace scribo // ts = t.stop(); // std::cout << "time " << ts << std::endl; + lines.force_stats_update(); return lines; diff --git a/scribo/scribo/text/merging.hh b/scribo/scribo/text/merging_hdoc.hh similarity index 99% copy from scribo/scribo/text/merging.hh copy to scribo/scribo/text/merging_hdoc.hh index 31a5ed4..6f89d2a 100644 --- a/scribo/scribo/text/merging.hh +++ b/scribo/scribo/text/merging_hdoc.hh @@ -24,8 +24,8 @@ // exception does not however invalidate any other reasons why the // executable file might be covered by the GNU General Public License. 
-#ifndef SCRIBO_TEXT_MERGING_HH -# define SCRIBO_TEXT_MERGING_HH +#ifndef SCRIBO_TEXT_MERGING_HDOC_HH +# define SCRIBO_TEXT_MERGING_HDOC_HH /// \file /// @@ -82,7 +82,7 @@ namespace scribo // template <typename L> line_set<L> - merging(const scribo::line_set<L>& lines); + merging_hdoc(const scribo::line_set<L>& lines); # ifndef MLN_INCLUDE_ONLY @@ -1094,7 +1094,7 @@ namespace scribo template <typename L> line_set<L> - merging(const scribo::line_set<L>& lines) + merging_hdoc(const scribo::line_set<L>& lines) { using namespace mln; @@ -1116,4 +1116,4 @@ namespace scribo } // end of namespace scribo -#endif // ! SCRIBO_TEXT_MERGING_HH +#endif // ! SCRIBO_TEXT_MERGING_HDOC_HH diff --git a/scribo/scribo/text/paragraphs.hh b/scribo/scribo/text/paragraphs.hh deleted file mode 100644 index e37f610..0000000 --- a/scribo/scribo/text/paragraphs.hh +++ /dev/null @@ -1,1270 +0,0 @@ -#include <mln/util/array.hh> -#include <mln/accu/shape/bbox.hh> -#include <mln/core/image/image2d.hh> -#include <mln/core/alias/neighb2d.hh> -#include <mln/draw/box.hh> -#include <mln/data/convert.hh> -#include <mln/value/int_u16.hh> -#include <mln/value/label_16.hh> -#include <mln/value/int_u8.hh> -#include <mln/value/rgb8.hh> -#include <mln/io/ppm/save.hh> -#include <mln/io/pgm/save.hh> -#include <mln/geom/rotate.hh> -#include <mln/literal/colors.hh> - -#include <scribo/core/macros.hh> -#include <scribo/core/line_set.hh> -#include <scribo/core/line_links.hh> -#include <scribo/core/line_info.hh> -#include <scribo/core/paragraph_set.hh> - -using namespace mln; - -namespace scribo -{ - namespace filter - { - - template <typename L> - inline - bool - between_horizontal_separator(const line_set<L>& lines, - const line_id_t& l1_, - const line_id_t& l2_) - { - const line_info<L>& l1 = lines(l1_); - const line_info<L>& l2 = lines(l2_); - - // No separators found in image. 
- mln_precondition(lines.components().has_separators()); - - const box2d& l1_bbox = l1.bbox(); - const box2d& l2_bbox = l2.bbox(); - - unsigned - row1 = l1_bbox.pcenter().row(), - row2 = l2_bbox.pcenter().row(); - const mln_ch_value(L, bool)& - separators = lines.components().separators(); - - unsigned row; - unsigned col_ptr; - unsigned left_col_ptr; - unsigned right_col_ptr; - unsigned end; - - if (row1 < row2) - { - row1 = l1_bbox.pmax().row(); - row2 = l2_bbox.pmin().row(); - - const unsigned quarter = - ((l1_bbox.pcenter().col() - l1_bbox.pmin().col()) >> 2); - - row = l1_bbox.pcenter().row(); - col_ptr = l1_bbox.pcenter().col(); - left_col_ptr = l1_bbox.pmin().col() + quarter; - right_col_ptr = l1_bbox.pmax().col() - quarter; - end = row2; - } - else - { - row2 = l2_bbox.pmax().row(); - row1 = l1_bbox.pmin().row(); - - const unsigned quarter = - ((l2_bbox.pcenter().col() - l2_bbox.pmin().col()) >> 2); - - row = l2_bbox.pcenter().row(); - col_ptr = l2_bbox.pcenter().col(); - left_col_ptr = l2_bbox.pmin().col() + quarter; - right_col_ptr = l2_bbox.pmax().col() - quarter; - end = row1; - } - - // If sep_ptr is true, then a separator is reached. 
- while (row < end) - { - ++row; - if (separators.at_(row, col_ptr) - || separators.at_(row, left_col_ptr) - || separators.at_(row, right_col_ptr)) - return true; - } - - return false; - } - - - template <typename L> - bool may_have_another_left_link(const line_links<L>& right, - const line_id_t& index, - const line_id_t& current_line, - const line_set<L>& lines) - { - const line_info<L>& l = lines(current_line); - const point2d& pmin = l.bbox().pmin(); - const unsigned x1 = l.x_height(); - - for_all_links(i, right) - if (i != index && right(i) == index) - { - const line_info<L>& l_info = lines(i); - const unsigned x2 = l_info.x_height(); - - const float delta_max = 0.5f * std::min(x1, x2); - - if (l_info.bbox().pmin().col() < pmin.col() - && std::abs(l.baseline() - l_info.baseline()) < delta_max - ) - return true; - } - - return false; - } - -//--------------------------------------------------------------------- -// This method aims to cut the links between lines that do not fit the -// different criteria -//--------------------------------------------------------------------- - - template <typename L> - inline - void paragraph_links(const line_links<L>& left, - line_links<L>& right, - line_links<L>& output, - const line_set<L>& lines) - { - output = left.duplicate(); - - // const unsigned nlines = lines.nelements(); - - // image2d<value::rgb8> links = data::convert(value::rgb8(), input); - // for (unsigned l = 0; l < nlines; ++l) - // { - // mln::draw::line(links, lines(l).bbox().pcenter(), lines(left(l)).bbox().pcenter(), literal::red); - // } - // mln::io::ppm::save(links, "out_links.ppm"); - - // For each line - for_all_lines(l, lines) - if (lines(l).is_textline()) - { - // Neighbors - - line_id_t left_nbh = output(l); - line_id_t right_nbh = right(l); - line_id_t lol_nbh = output(left_nbh); - - // const line_info<L>& left_line = lines(left_nbh); - // const line_info<L>& current_line = lines(l); - // const line_info<L>& right_line = lines(right_nbh); - - if 
(lines.components().has_separators() && - between_horizontal_separator(lines, right_nbh, l)) - { - output(right_nbh) = right_nbh; - right_nbh = l; - } - if (lines.components().has_separators() && - between_horizontal_separator(lines, l, left_nbh)) - { - output(l) = l; - left_nbh = l; - lol_nbh = l; - } - - // Line features - const float x_height = lines(l).x_height(); - const float left_x_height = lines(left_nbh).x_height(); - const float right_x_height = lines(right_nbh).x_height(); - - const box2d& left_line_bbox = lines(left_nbh).bbox(); - const box2d& current_line_bbox = lines(l).bbox(); - const box2d& right_line_bbox = lines(right_nbh).bbox(); - const box2d& lol_line_bbox = lines(lol_nbh).bbox(); // lol : left neighbor of the left neighbor - - const int lline_col_min = left_line_bbox.pmin().col(); - const int cline_col_min = current_line_bbox.pmin().col(); - const int rline_col_min = right_line_bbox.pmin().col(); - const int lolline_col_min = lol_line_bbox.pmin().col(); - - const int lline_col_max = left_line_bbox.pmax().col(); - const int cline_col_max = current_line_bbox.pmax().col(); - const int rline_col_max = right_line_bbox.pmax().col(); - - const int lline_cw = lines(left_nbh).char_width(); - const int cline_cw = lines(l).char_width(); - const int rline_cw = lines(right_nbh).char_width(); - // Maximal x variation to consider two lines vertically aligned - const int delta_alignment = cline_cw; - - - // Checks the baseline distances of the two neighbors - { - // Current line baseline - const int c_baseline = lines(l).baseline(); - - // Baseline distance with the left and right neighbors - const int lc_baseline = lines(left_nbh).baseline() - c_baseline; - const int rc_baseline = c_baseline -lines(right_nbh).baseline(); - - // Max baseline distance between the two neighbors - // const float delta_baseline_max = std::max(lc_baseline, rc_baseline); - // const float delta_baseline_min = std::min(lc_baseline, - // rc_baseline); - - // Only two lines, meaning 
the current line has only one neighbor - bool two_lines = false; - - // If the current line has no left neighbor - if (lc_baseline == 0) - { - // ror : right neighbor of the right neighbor - const line_id_t ror_nbh = right(right_nbh); - //const box2d& ror_line_bbox = lines(ror_nbh).bbox(); - - // If the current line has a ror - if (ror_nbh != right_nbh - && output(ror_nbh) == right_nbh) - { - // Distance between the current line and the right neighbor - const float right_distance = lines(l).meanline() - lines(right_nbh).baseline(); - // Distance between the right neighbor and the ror - const float ror_distance = lines(right_nbh).meanline() - lines(ror_nbh).baseline(); - // ror x_height - const float ror_x_height = lines(ror_nbh).x_height(); - - // Conditions to cut the link between the current line - // and its right neighbor - if (right_distance > 1.4f * ror_distance - && std::max(ror_x_height, right_x_height) < - 1.4f * std::min(ror_x_height, right_x_height) - && output(right_nbh) == l) - { - output(right_nbh) = right_nbh; - continue; - } - } - // Otherwise we only have a group of two lines - else - { - // We determine the distance between the two lines - const float distance = lines(l).meanline() - lines(right_nbh).baseline(); - two_lines = true; - - // If the distance between the two lines is greater than - // the minimum x height of the two lines then we cut the - // link between them - if (distance > 2.0f * std::min(x_height, right_x_height) - && output(right_nbh) == l) - { - output(right_nbh) = right_nbh; - continue; - } - } - - // Lines features - const float min_x_height = std::min(x_height, right_x_height); - const float max_x_height = std::max(x_height, right_x_height); - const float min_char_width = std::min(rline_cw, cline_cw); - const float max_char_width = std::max(rline_cw, cline_cw); - - // Condition to cut the link between the current line and - // its right neighbor - if ((max_x_height > min_x_height * 1.4f) && - !(max_char_width <= 1.2f * 
min_char_width)) - { - if (output(right_nbh) == l) - { - output(right_nbh) = right_nbh; - continue; - } - } - - // If we only have two lines we stop the study - if (two_lines) - continue; - } - // If the current line has no right neighbor - else if (rc_baseline == 0) - { - // lol : left neighbor of the left neighbor - - // If the left neighbor of the current line has a left neighbor - if (lol_nbh != left_nbh) - { - // Distance between the current line and its left neighbor - const float left_distance = lines(left_nbh).meanline() - - lines(l).baseline(); - // Distance between the left neighbor and the left - // neighbor of its left neighbor - const float lol_distance = lines(lol_nbh).meanline() - - lines(left_nbh).baseline(); - // lol x height - const float lol_x_height = lines(lol_nbh).x_height(); - - // Conditions to cut the link between the current line - // and its left neighbor - if (left_distance > 1.4f * lol_distance - && std::max(lol_x_height, left_x_height) < - 1.4f * std::min(lol_x_height, left_x_height)) - { - output(l) = l; - continue; - } - } - // Otherwise we only have a group of two lines - else - { - // Distance between the current line and it left neighbor - const float distance = lines(left_nbh).meanline() - - lines(l).baseline(); - - two_lines = true; - - // If the distance is greater than the min x height - // between the two lines - if (distance > 2.0f * std::min(x_height, left_x_height)) - { - output(l) = l; - continue; - } - } - - // Lines features - const float min_x_height = std::min(x_height, left_x_height); - const float max_x_height = std::max(x_height, left_x_height); - const float min_char_width = std::min(lline_cw, cline_cw); - const float max_char_width = std::max(lline_cw, cline_cw); - - // Condition to cut the link between the current line and - // its left neighbor - if ((max_x_height > min_x_height * 1.4f) && - !(max_char_width <= 1.2f * min_char_width)) - { - output(l) = l; - continue; - } - - // If we only have two lines we stop 
the study - if (two_lines) - continue; - } - // The current line has at least one left and one right neighbor - else // if (delta_baseline_max >= 1.1 * delta_baseline_min) - { - // Distance between the left and the current line - const float - left_distance = left_line_bbox.pcenter().row() - current_line_bbox.pcenter().row(); - // Distance between the right and the current line - const float - right_distance = current_line_bbox.pcenter().row() - right_line_bbox.pcenter().row();; - - // If the left line is too far compared to the right one - // we cut the link with it - if (left_distance > 1.5f * right_distance - && std::max(x_height, left_x_height) > 1.2f * std::min(x_height, left_x_height)) - { - output(l) = l; - continue; - } - // If the right line is too far compared to the left one - // we cut the link with it - else if (right_distance > 1.5f * left_distance - && std::max(x_height, right_x_height) >= 1.2f * std::min(x_height, right_x_height) - && output(right_nbh) == l) - { - output(right_nbh) = right_nbh; - continue; - } - - // If the distance between the baseline of the left - // neighbor and the baseline of the current line is - // greater than the one between the current line baseline - // and the right line baseline we have to study the texte - // features of the right and left lines - if (lc_baseline > rc_baseline) - { - const float cw_max = std::max(lline_cw, cline_cw); - const float cw_min = std::min(lline_cw, cline_cw); - const float min_x_height = std::min(x_height, left_x_height); - const float max_x_height = std::max(x_height, left_x_height); - - if ((max_x_height > min_x_height * 1.4f) && - !(cw_max <= 1.2f * cw_min)) - { - output(l) = l; - continue; - } - - { - const float min_x_height = std::min(x_height, right_x_height); - const float max_x_height = std::max(x_height, right_x_height); - const float cw_max = std::max(rline_cw, cline_cw); - const float cw_min = std::min(rline_cw, cline_cw); - - if ((max_x_height > min_x_height * 1.4f) - && 
!(cw_max <= 1.2f * cw_min) - && output(right_nbh) == l) - { - output(right_nbh) = right_nbh; - continue; - } - } - } - else - { - const float cw_max = std::max(rline_cw, cline_cw); - const float cw_min = std::min(rline_cw, cline_cw); - const float min_x_height = std::min(x_height, right_x_height); - const float max_x_height = std::max(x_height, right_x_height); - - if ((max_x_height > min_x_height * 1.4f) - && !(cw_max <= 1.2f * cw_min) - && output(right_nbh) == l) - { - output(right_nbh) = right_nbh; - continue; - } - - { - const float min_x_height = std::min(x_height, left_x_height); - const float max_x_height = std::max(x_height, left_x_height); - const float cw_max = std::max(lline_cw, cline_cw); - const float cw_min = std::min(lline_cw, cline_cw); - - if ((max_x_height > min_x_height * 1.4f) - && !(cw_max <= 1.2f * cw_min)) - { - output(l) = l; - continue; - } - } - } - } - } - - // If we arrive here, it means than the lines in the - // neighborhood of the current line are quite similar. We can - // then begin to study the indentations in order to determine - // the beginning of new paragraphs - -//----------------------------------------------------------------------------- -// ___________________________ -// |___________________________| -// ________________________ -// |________________________| -// ___________________________ -// |___________________________| -// ___________________________ -// |___________________________| -// -// Simple case : paragraphs are justified on the left. We try to find any -// indentation like above. 
-// -//----------------------------------------------------------------------------- - - { - // Check if the current line neighbors are aligned - bool left_right_aligned = false; - bool left_lol_aligned = false; - const int dx_lr = std::abs(lline_col_min - rline_col_min); - const int dx_llol = std::abs(lline_col_min - lolline_col_min); - - if (dx_lr < delta_alignment) - left_right_aligned = true; - - if (dx_llol < delta_alignment) - left_lol_aligned = true; - - if (left_right_aligned && left_lol_aligned) - { - const int left_right_col_min = std::min(lline_col_min, rline_col_min); - const int dx_lrc = std::abs(left_right_col_min - cline_col_min); - const float l_char_width = 1.5f * lines(l).char_width(); - - if (dx_lrc > l_char_width && - dx_lrc < 3.0f * l_char_width && - cline_col_min > rline_col_min && - cline_col_min > lline_col_min) - { - const line_id_t out_right_nbh = output(right_nbh); - - if (out_right_nbh != l) - { - if (output(out_right_nbh) == l) - output(out_right_nbh) = out_right_nbh; - right(l) = l; - } - else - output(right_nbh) = right_nbh; - continue; - } - } - } - -//----------------------------------------------------------------------------- -// ___________________________ -// |___________________________| -// ___________________ -// |___________________| End of the paragraph - Current line -// ________________________ -// |________________________| Beginning of a new one -// ___________________________ -// |___________________________| Left of left of current line -// -// End of paragraph case : we try to find an end to the current paragraph -// -//----------------------------------------------------------------------------- - - { - // Check if the current line neighbors are aligned - bool left_right_max_aligned = false; - bool left_current_min_aligned = false; - bool lol_current_min_aligned = false; - const bool lol_is_left = output(left_nbh) == left_nbh; - const int dx_lr_max = std::abs(lline_col_max - rline_col_max); - const int dx_lc_min = 
std::abs(lline_col_min - cline_col_min); - const int dx_lolc_min = std::abs(lolline_col_min - cline_col_min); - - if (dx_lr_max < delta_alignment) - left_right_max_aligned = true; - - if (dx_lc_min < delta_alignment) - left_current_min_aligned = true; - - if (dx_lolc_min < delta_alignment) - lol_current_min_aligned = true; - - if (!left_current_min_aligned && left_right_max_aligned && - (lol_current_min_aligned || lol_is_left)) - { - const int dx_lrc = std::abs(lline_col_max - cline_col_max); - const int l_char_width = lines(l).char_width(); - const int dx_indent = std::abs(std::max(lline_col_min, - rline_col_min) - cline_col_min); - - if (dx_lrc > l_char_width && - dx_indent < 4 * delta_alignment && - cline_col_max < lline_col_max && - cline_col_min < lline_col_min && - (lline_col_min > lolline_col_min || lol_is_left)) - { - output(l) = l; - continue; - } - } - } - -//----------------------------------------------------------------------------- -// ___________________________ -// |___________________________| -// ___________________________ -// |___________________________| -// ________________________ -// |________________________| -// ___________________________ -// |___________________________| -// -// Simple case : paragraphs are justified on the left. We try to find any -// indentation like above. 
-// -//----------------------------------------------------------------------------- - - { - const line_id_t ror_nbh = right(right_nbh); - const box2d& ror_line_bbox = lines(ror_nbh).bbox(); - const int rorline_col_min = ror_line_bbox.pmin().col(); - - bool right_ror_min_aligned = false; - bool left_right_aligned = false; - const int dx_lr = std::abs(lline_col_min - rline_col_min); - const int dx_rror_min = std::abs(rline_col_min - rorline_col_min); - - if (dx_rror_min < delta_alignment) - right_ror_min_aligned = true; - - if (dx_lr < delta_alignment) - left_right_aligned = true; - - if (right_ror_min_aligned && left_right_aligned && - ror_nbh != right_nbh) - { - const int left_right_col_min = std::min(lline_col_min, rline_col_min); - const int dx_lrc = std::abs(left_right_col_min - cline_col_min); - const float l_char_width = 1.5f * lines(l).char_width(); - - if (dx_lrc > l_char_width && - !may_have_another_left_link(right, right_nbh, l, lines) && - dx_lrc < 10.0f * l_char_width && - cline_col_min > rline_col_min && - cline_col_min > lline_col_min) - { - const line_id_t out_right_nbh = output(right_nbh); - - if (out_right_nbh != l) - { - if (output(out_right_nbh) == l) - output(out_right_nbh) = out_right_nbh; - right(l) = l; - } - else - output(right_nbh) = right_nbh; - continue; - } - } - } - -//----------------------------------------------------------------------------- -// ___________________________ -// |___________________________| -// ___________ -// |___________| -// ________________________ -// |________________________| -// ___________________________ -// |___________________________| -// -// Simple case : paragraphs are justified on the left. We try to find any -// indentation like above. 
-// -//----------------------------------------------------------------------------- - - { - const line_id_t ror_nbh = right(right_nbh); - const box2d& ror_line_bbox = lines(ror_nbh).bbox(); - const int rorline_col_min = ror_line_bbox.pmin().col(); - - bool left_ror_aligned = false; - const int dx_lror = std::abs(lline_col_min - rorline_col_min); - - if (dx_lror < delta_alignment) - left_ror_aligned = true; - - if (left_ror_aligned) - { - const int left_ror_col_min = std::min(lline_col_min, rorline_col_min); - const int dx_lrorc = std::abs(left_ror_col_min - cline_col_min); - const float l_char_width = 1.5f * lines(l).char_width(); - const int dx_lrorr = std::abs(left_ror_col_min - rline_col_min); - const int dx_crmax = std::abs(rline_col_max - cline_col_max); - - if (dx_lrorc > l_char_width && - dx_lrorr > 5 * l_char_width && - dx_lrorr > dx_lrorc && - dx_crmax > 5 * l_char_width && - !may_have_another_left_link(right, right_nbh, l, lines) && - dx_lrorc < 10.0f * l_char_width && - cline_col_min > rorline_col_min && - cline_col_min > lline_col_min) - { - right(right_nbh) = right_nbh; - continue; - } - } - } - - -// Strange case - { - if (rline_col_min > current_line_bbox.pcenter().col() - && !may_have_another_left_link(right, right_nbh, l, lines) - && cline_col_max < rline_col_max - && output(right_nbh) == l) - { - output(right_nbh) = right_nbh; - } - } - -//----------------------------------------------------------------------------- -// ___________________________ -// |___________________________| -// ___________________________ -// |___________________________| -// ________________________ -// |________________________| -// -// Simple case : paragraphs are justified on the left. We try to find any -// indentation like above at the end of a column. 
-// -//----------------------------------------------------------------------------- - - if (left_nbh == l) - { - const line_id_t ror_nbh = right(right_nbh); - const box2d& ror_line_bbox = lines(ror_nbh).bbox(); - const int rorline_col_min = ror_line_bbox.pmin().col(); - - bool right_ror_min_aligned = false; - const int dx_rror_min = std::abs(rline_col_min - rorline_col_min); - - if (dx_rror_min < delta_alignment) - right_ror_min_aligned = true; - - if (right_ror_min_aligned) - { - const int right_ror_col_min = std::min(rline_col_min, rorline_col_min); - const int dx_rrorc = std::abs(right_ror_col_min - cline_col_min); - const float l_char_width = 1.5f * lines(l).char_width(); - - if (dx_rrorc > l_char_width && - dx_rrorc < 10.0f * l_char_width && - cline_col_min > rline_col_min && - cline_col_max >= rline_col_max) - { - const line_id_t out_right_nbh = output(right_nbh); - - if (out_right_nbh != l) - { - if (output(out_right_nbh) == l) - output(out_right_nbh) = out_right_nbh; - right(l) = l; - } - else - output(right_nbh) = right_nbh; - continue; - } - } - } - } - - // Only debug - // { - // image2d<value::rgb8> debug = data::convert(value::rgb8(), input); - - // mln::util::array<accu::shape::bbox<point2d> > nbbox(output.nelements()); - // for_all_lines(l, lines) - // if (lines(l).is_textline()) - // { - // // if (lines(i).is_textline()) - // // { - // // mln::draw::box(debug, lines(i).bbox(), literal::red); - // nbbox(output(l)).take(lines(l).bbox()); - // // } - // } - - // for (unsigned i = 0; i < nbbox.nelements(); ++i) - // if (nbbox(i).is_valid()) - // { - // box2d b = nbbox(i).to_result(); - // mln::draw::box(debug, b, literal::orange); - // b.enlarge(1); - // mln::draw::box(debug, b, literal::orange); - // b.enlarge(1); - // mln::draw::box(debug, b, literal::orange); - // } - - // mln::io::ppm::save(debug, "out_paragraph.ppm"); - // } - - } - } - -//------------------------------------------------------------- -// Preparation of the lines before linking 
them. -// For each line we draw the top and the bottom of it. -// Assuming than i is the number of the line. Then the top of the line -// will be affected with the value 2 * i in the block image and the -// bottom with 2 * i + 1. -// -//------------------------------------------------------------- - - template <typename L> - inline - void prepare_lines(const box2d& domain, - const line_set<L>& lines, - L& blocks, - mln::util::array<box2d>& rbbox) - { - std::map< int, std::vector< const box2d* > > drawn_lines; - // const unsigned nlines = lines.nelements(); - - // For each line - //for (unsigned l = 0; l < nlines; ++l) - for_all_lines(l, lines) - if (lines(l).is_textline()) - { - // Rotation of the bounding box - box2d b = geom::rotate(lines(l).bbox(), -90, domain.pcenter()); -// rbbox.append(b); - rbbox(l) = b; - - const unsigned index = l + 1; - const unsigned even_index = 2 * index; - const unsigned odd_index = even_index + 1; - - // Top of the line - { - bool not_finished = true; - int col_offset = 0; - - while (not_finished) - { - // Looking for a column in the image to draw the top of the - // line - - const int col = b.pmax().col() + col_offset; - std::map< int, std::vector< const box2d* > >::iterator it - = drawn_lines.find(col); - - if (it != drawn_lines.end()) - { - const std::vector< const box2d* >& lines = (*it).second; - const unsigned nb_lines = lines.size(); - unsigned i = 0; - - for (i = 0; i < nb_lines; ++i) - { - const box2d* box = lines[i]; - const int min_row = std::max(b.pmin().row(), box->pmin().row()); - const int max_row = std::min(b.pmax().row(), box->pmax().row()); - - if (min_row - max_row <= 0) - break; - } - - if (i == nb_lines) - { - mln::draw::line(blocks, point2d(b.pmin().row(), col), - point2d(b.pmax().row(), col), even_index); - not_finished = false; - drawn_lines[col].push_back(&(rbbox[l])); - } - else - ++col_offset; - } - else - { - mln::draw::line(blocks, point2d(b.pmin().row(), col), - point2d(b.pmax().row(), col), even_index); 
- not_finished = false; - drawn_lines[col].push_back(&(rbbox[l])); - } - } - } - - // Bottom of the line - { - bool not_finished = true; - int col_offset = 0; - - while (not_finished) - { - // Looking for a column in the image to draw the bottom of - // the line - - const int col = b.pmin().col() - col_offset; - std::map< int, std::vector< const box2d* > >::iterator it - = drawn_lines.find(col); - - if (it != drawn_lines.end()) - { - const std::vector< const box2d* >& lines = (*it).second; - const unsigned nb_lines = lines.size(); - unsigned i = 0; - - for (i = 0; i < nb_lines; ++i) - { - const box2d* box = lines[i]; - const int min_row = std::max(b.pmin().row(), box->pmin().row()); - const int max_row = std::min(b.pmax().row(), box->pmax().row()); - - if (min_row - max_row <= 0) - break; - } - - if (i == nb_lines) - { - mln::draw::line(blocks, point2d(b.pmin().row(), col), - point2d(b.pmax().row(), col), odd_index); - not_finished = false; - drawn_lines[col].push_back(&(rbbox[l])); - } - else - ++col_offset; - } - else - { - mln::draw::line(blocks, point2d(b.pmin().row(), col), - point2d(b.pmax().row(), col), odd_index); - not_finished = false; - drawn_lines[col].push_back(&(rbbox[l])); - } - } - } - } - } - - template <typename L> - inline - void - process_left_link(L& blocks, - const mln::util::array<box2d>& rbbox, - const line_set<L>& lines, - line_links<L>& left) - { - typedef scribo::def::lbl_type V; - - // At the beginning each line is its own neighbor - for_all_lines(l, lines) - if (lines(l).is_textline()) - left(l) = l; - else - left(l) = 0; - - // const unsigned nlines = lines.nelements(); - - // For each line - for_all_lines(l, lines) - if (lines(l).is_textline()) - { - // Max distance for the line search - int dmax = 1.5f * lines(l).x_height(); - - // Starting points in the current line box - point2d c = rbbox(l).pcenter(); - point2d q(rbbox(l).pmin().row() + ((c.row() - rbbox(l).pmin().row()) / 4), c.col()); - - int - midcol = (rbbox(l).pmax().col() - 
- rbbox(l).pmin().col()) / 2; - - // Left - { - // marge gauche - int - nleftima = c.col() - blocks.domain().pmin().col(), - // Distance gauche - nleft = std::min(nleftima, midcol + dmax); - - V - // Starting points in the box - *p = &blocks(c), - *p2 = &blocks(q), - // End of search - *pstop = p - nleft - 1, - // Line neighbor - *nbh_p = 0; - - // While we haven't found a neighbor or reached the limit - for (; p != pstop; --p, --p2) - { - if (*p2 != literal::zero // Not the background - && ((*p2 % 2) == 0) // Looking for the bottom of a line - && left((*p2 >> 1) - 1) != l) // No loops - { - // Neightbor found, we stop the research - nbh_p = p2; - break; - } - - if (*p != literal::zero // Not the background - && ((*p % 2) == 0) // Looking for the bottom of a line - && left((*p >> 1) - 1) != l) // No loops - { - // Neightbor found, we stop the research - nbh_p = p; - break; - } - } - - // If a neighbor was found, then we have found the top of the - // line. We are then looking for the bottom of the encountered - // line. 
If during the search process we find a complete line - // included in the touched line, this line is considered as - // the neighbor under certain conditions (see below) - - //--------------------------------------------------------------- - // _________________________ | - // |_________________________| => Current line | Search direction - // v - // => First encountered top line - // __________________________________________________ 2Q - // | Q | - // | _________________________ |2P - // | |_____________P___________| => Second top |2P + 1 - // | line | - // |__________________________________________________|2Q + 1 - // - // - //--------------------------------------------------------------- - - if (nbh_p) - { - std::vector<V> lines_nbh; - const V end_p = *nbh_p + 1; - const V* nbh_p_copy = nbh_p; - - for (; *nbh_p != end_p; --nbh_p) - { - if ((*nbh_p) != literal::zero) // Not the background - { - if ((*nbh_p) % 2 == 0)// We have found the top of - // another line - lines_nbh.push_back(*nbh_p); - else - { - // We have found the bottom of a line. We are looking if - // we have already encountered the top of this - // line. 
If so, we link the current line with this one - // under certain conditions: - - if (std::find(lines_nbh.begin(), lines_nbh.end(), - (*nbh_p) - 1) != lines_nbh.end()) - { - // If we can link the complete line with the current line - if (// It must be in the search range - nbh_p > pstop - // Avoid loops - && left(((*nbh_p - 1) >> 1) - 1) != l) - left(l) = ((*nbh_p - 1) >> 1) - 1; - - // We have found a complete line so we stop the search - break; - } - } - } - } - - - // If we haven't found any included line in the first - // neighbor, then the line is considered as the neighbor of - // the current line - if (*nbh_p == end_p) - left(l) = (*nbh_p_copy >> 1) - 1; - } - } - } - } - - - // We assume that the lines have been rotated - template <typename L> - inline - void - process_right_link(L& blocks, - const mln::util::array<box2d>& rbbox, - const line_set<L>& lines, - line_links<L>& right) - { - typedef scribo::def::lbl_type V; - - // At the beginning each line is its own neighbor - for_all_lines(l, lines) - if (lines(l).is_textline()) - right(l) = l; - else - right(l) = 0; - - // const unsigned nlines = lines.nelements(); - - // For each line - for_all_lines(l, lines) - if (lines(l).is_textline()) - { - // Max distance for the line search - int dmax = 1.5f * lines(l).x_height(); - - // Starting points in the current line box - point2d c = rbbox(l).pcenter(); - point2d q(rbbox(l).pmax().row() - ((rbbox(l).pmax().row() - c.row()) / 4), c.col()); - - int - midcol = (rbbox(l).pmax().col() - - rbbox(l).pmin().col()) / 2; - - // Right - { - int - nrightima = geom::ncols(blocks) - c.col() + blocks.domain().pmin().col(), - nright = std::min(nrightima, midcol + dmax); - - V - // Starting points in the box - *p = &blocks(c), - *p2 = &blocks(q), - // End of search - *pstop = p + nright - 1, - // Line neighbor - *nbh_p = 0; - - // While we haven't found a neighbor or reached the limit - for (; p != pstop; ++p, ++p2) - { - if (*p2 != literal::zero // Not the background - && 
((*p2 % 2) == 1) // Looking for the bottom of a line - && right(((*p2 - 1) >> 1) - 1) != l) // No loops - { - // Neightbor found, we stop the research - nbh_p = p2; - break; - } - - if (*p != literal::zero // Not the background - && ((*p % 2) == 1) // Looking for the bottom of a line - && right(((*p - 1) >> 1) - 1) != l) // No loops - { - // Neightbor found, we stop the research - nbh_p = p; - break; - } - } - - // If a neighbor was found, then we have found the bottom of the - // line. We are then looking for the top of the encountered - // line. If during the search process we find a complete line - // included in the touched line, this line is considered as - // the neighbor under certain conditions (see below) - - //--------------------------------------------------------------- - // - // - // __________________________________________________ 2Q - // | Q | - // | _________________________ |2P - // | |_____________P___________| => Second bottom |2P + 1 - // | line | - // |__________________________________________________|2Q + 1 - // => First encountered bottom line - // _________________________ ^ - // |_________________________| => Current line | Search direction - // | - //--------------------------------------------------------------- - - if (nbh_p) - { - std::vector<V> lines_nbh; - const V end_p = *nbh_p - 1; - const V* nbh_p_copy = nbh_p; - - for (; *nbh_p != end_p; ++nbh_p) - { - if (*nbh_p != literal::zero) // Not the background - { - if (*nbh_p % 2 == 1) // We have found the bottom of - // another line - lines_nbh.push_back(*nbh_p); - else - { - // We have found the top of a line. We are looking if - //we have already encountered the bottom of this - // line. 
If so, we link the current line with this one - // under certain conditions: - - if (std::find(lines_nbh.begin(), lines_nbh.end(), - *nbh_p + 1) != lines_nbh.end()) - { - // If we can link the complete line with the current line - if (// It must be in the search range - nbh_p < pstop - // Avoid loops - && right((*nbh_p >> 1) - 1) != l) - right(l) = (*nbh_p >> 1) - 1; - - // We have found a complete line, so we stop the search - break; - } - } - } - } - - // If we haven't found any included line in the first - // neighbor, then the line is considered as the neighbor of - // the current line - - if (*nbh_p == end_p) - right(l) = ((*nbh_p_copy - 1) >> 1) - 1; - } - } - } - } - -//----------------------------------------------------------------------- -// Finalizing the links by merging information extracted from the left -// and right links -//----------------------------------------------------------------------- - - template< typename L > - inline - void finalize_links(line_links<L>& left, - line_links<L>& right, - const line_set<L>& lines) - { - // const unsigned nlines = lines.nelements(); - - for_all_lines(l, lines) - if (lines(l).is_textline()) - { - const unsigned left_value = left(l); - const unsigned right_value = right(l); - - // If the right neighbor of my left neighbor is itself then its - // right neighbor is me - { - line_id_t& v = right(left_value); - - if (v == left_value) - v = l; - } - - // If the left neighbor of my right neighbor is itself then its - // left neighbor is me - { - line_id_t& v = left(right_value); - - if (v == right_value) - v = l; - } - } - } - - template <typename L> - inline - paragraph_set<L> - extract_paragraphs(line_set<L>& lines, - const image2d<bool>& input) - { - typedef scribo::def::lbl_type V; - - image2d<V> blocks(geom::rotate(input.domain(), -90, input.domain().pcenter())); - data::fill(blocks, 0); - - // util::array< line_info<L> > lines_info; - - // for_all_lines(l, lines) - // { - // if (lines(l).is_textline()) - // 
lines_info.append(lines(l)); - // } - -/// const unsigned nlines = lines_info.nelemnts(); - mln::util::array<box2d> rbbox; - line_links<L> left(lines); - left(0) = 0; - line_links<L> right(lines); - right(0) = 0; - line_links<L> output(lines); - output(0) = 0; - - rbbox.resize(lines.nelements() + 1); - -// std::cout << "Preparing lines" << std::endl; - prepare_lines(input.domain(), lines , blocks, rbbox); -// io::pgm::save(blocks, "blocks.pgm"); -// std::cout << "Linking left" << std::endl; - process_left_link(blocks, rbbox, lines , left); -// std::cout << "Linking right" << std::endl; - process_right_link(blocks, rbbox, lines , right); -// std::cout << "Finalizing links" << std::endl; - finalize_links(left, right, lines ); - // std::cout << "Finalizing merging" << std::endl; - // finalize_line_merging(left, right, lines); -// std::cout << "Extracting paragraphs" << std::endl; - filter::paragraph_links(left, right, output, lines); - - paragraph_set<L> par_set = make::paragraph(output, right); - return par_set; - } -} diff --git a/scribo/scribo/toolchain/internal/content_in_doc_functor.hh b/scribo/scribo/toolchain/internal/content_in_doc_functor.hh index 4f2c074..52ee5b0 100644 --- a/scribo/scribo/toolchain/internal/content_in_doc_functor.hh +++ b/scribo/scribo/toolchain/internal/content_in_doc_functor.hh @@ -63,7 +63,7 @@ # include <scribo/text/merging.hh> # include <scribo/text/link_lines.hh> -# include <scribo/text/paragraphs.hh> +# include <scribo/text/extract_paragraphs.hh> # include <scribo/make/debug_filename.hh> @@ -541,7 +541,7 @@ namespace scribo // scribo::paragraph_set<L> parset = scribo::make::paragraph(llinks); scribo::paragraph_set<L> - parset = extract_paragraphs(lines, doc.binary_image()); + parset = text::extract_paragraphs(lines, doc.binary_image()); doc.set_paragraphs(parset); on_progress(); diff --git a/scribo/scribo/toolchain/internal/content_in_hdoc_functor.hh b/scribo/scribo/toolchain/internal/content_in_hdoc_functor.hh index 5e98f3e..4a74aef 
100644 --- a/scribo/scribo/toolchain/internal/content_in_hdoc_functor.hh +++ b/scribo/scribo/toolchain/internal/content_in_hdoc_functor.hh @@ -74,9 +74,9 @@ # include <scribo/text/recognition.hh> # endif // ! SCRIBO_NOCR -# include <scribo/text/merging.hh> +# include <scribo/text/merging_hdoc.hh> # include <scribo/text/link_lines.hh> -# include <scribo/text/paragraphs.hh> +# include <scribo/text/paragraphs_hdoc.hh> # include <scribo/make/debug_filename.hh> @@ -461,7 +461,7 @@ namespace scribo on_new_progress_label("Merging segmented lines"); - lines = scribo::text::merging(lines); + lines = scribo::text::merging_hdoc(lines); //===== DEBUG ===== @@ -525,7 +525,8 @@ namespace scribo on_new_progress_label("Extracting paragraphs"); scribo::paragraph_set<L> - parset = extract_paragraphs(lines, doc.binary_image()); + parset = scribo::text::extract_paragraphs_hdoc(lines, + doc.binary_image()); on_progress(); -- 1.5.6.5
participants (1)
-
Guillaume Lazzara