---
scribo/scribo/core/line_info.hh | 212 +++++-
scribo/scribo/core/stats.hh | 320 +++++++++
scribo/scribo/text/look_like_text_lines.hh | 16 +
scribo/scribo/text/merging.hh | 306 ++++++---
scribo/scribo/text/paragraphs.hh | 1027 ++++++++++++++++++++++++++++
5 files changed, 1774 insertions(+), 107 deletions(-)
create mode 100644 scribo/scribo/core/stats.hh
create mode 100644 scribo/scribo/text/paragraphs.hh
diff --git a/scribo/scribo/core/line_info.hh b/scribo/scribo/core/line_info.hh
index b8a8be3..c683be4 100644
--- a/scribo/scribo/core/line_info.hh
+++ b/scribo/scribo/core/line_info.hh
@@ -61,6 +61,9 @@
# include <scribo/core/internal/sort_comp_ids.hh>
# include <scribo/core/concept/serializable.hh>
+// DEBUG
+
+# include <scribo/core/stats.hh>
namespace scribo
{
@@ -145,6 +148,11 @@ namespace scribo
private:
void init_();
+ // DEBUG
+ stats< float > meanline_clusters_;
+ stats< float > baseline_clusters_;
+ };
+
};
} // end of namespace scribo::internal
@@ -273,7 +281,8 @@ namespace scribo
/// @}
-
+ bool chars_same_width() const;
+ unsigned get_first_char_height() const;
/// Force a new computation of statistics.
void force_stats_update();
@@ -301,6 +310,9 @@ namespace scribo
void update_components_type(component::Type type);
+ int compute_baseline();
+ int compute_meanline();
+
private: // Attributes
line_id_t id_;
mln::util::tracked_ptr<data_t> data_;
@@ -810,7 +822,7 @@ namespace scribo
int
line_info<L>::delta_of_line() const
{
- return char_width() + 2 * char_space();
+ return 2 * char_width() + 2 * char_space();
// FIXME: choose between:
// not enough: char_width + char_space
// too much: 2 * char_width
@@ -960,6 +972,145 @@ namespace scribo
}
template <typename L>
+ bool
+ line_info<L>::chars_same_width() const
+ {
+ // Only for the case of two-character words
+ if (card() == 2)
+ {
+ const component_set<L>& comp_set = data_->holder_.components();
+
+ const unsigned c1 = data_->components_(0);
+ const unsigned c2 = data_->components_(1);
+
+ if (data_->holder_.components()(c1).type() == component::Punctuation
+ || data_->holder_.components()(c2).type() == component::Punctuation)
+ return false;
+
+ const mln::box2d& bb1 = comp_set(c1).bbox();
+ const mln::box2d& bb2 = comp_set(c2).bbox();
+
+ const float w1 = bb1.width();
+ const float h1 = bb1.height();
+ const float w2 = bb2.width();
+ const float h2 = bb2.height();
+
+ const float space = std::max(bb1.pmin().col(), bb2.pmin().col()) -
+ std::min(bb1.pmax().col(), bb2.pmax().col());
+
+ const int dy = bb1.pmax().row() - bb2.pmax().row();
+
+ // The two characters must be distinct
+ if (space < 0)
+ return false;
+
+ if (// Approximately the same width
+ ((std::max(w1, w2) / std::min(w1, w2)) > 1.1f ||
+ // One character must not be smaller than the space between
+ // the two characters
+ (w1 < space || w2 < space))
+ // If the two characters have a different width they must also
+ // have a different height
+ && not (std::max(h1, h2) / std::min(h1, h2) <= 1.5f))
+ return false;
+
+ // Approximately aligned on baseline
+ if (std::abs(dy) > 10)
+ return false;
+
+ return true;
+ }
+
+ return false;
+ }
+
+ template< typename L >
+ unsigned
+ line_info<L>::get_first_char_height() const
+ {
+ const component_set<L>& comp_set = data_->holder_.components();
+ const unsigned c1 = data_->components_(0);
+ const mln::box2d& bb1 = comp_set(c1).bbox();
+
+ return bb1.height();
+ }
+
+ template <typename L>
+ int
+ line_info<L>::compute_baseline()
+ {
+ const unsigned nelements = data_->baseline_clusters_.nelements();
+
+ if (nelements == 2)
+ return data_->baseline_clusters_.mean();
+
+ mln::util::array< cluster_stats< float > >& clusters_b =
data_->baseline_clusters_.clusters();
+
+ unsigned index = 0;
+ float min_base = 0.0f;
+ const unsigned clusters_b_nelements = clusters_b.nelements();
+
+ for (unsigned i = 0; i < clusters_b_nelements; ++i)
+ {
+ const unsigned clusters_b_i_nelements = clusters_b[i].nelements();
+
+ if (clusters_b_i_nelements >= min_base * 2.0f)
+ {
+ min_base = clusters_b_i_nelements;
+ index = i;
+ }
+ else if (clusters_b_i_nelements >= 0.5f * min_base)
+ {
+ if (clusters_b_i_nelements > 1 &&
+ clusters_b[index].median() > clusters_b[i].median())
+ {
+ if (clusters_b_i_nelements > min_base)
+ min_base = clusters_b_i_nelements;
+ index = i;
+ }
+ }
+ }
+
+ if (clusters_b[index].nelements() <= 2 && nelements <= 5)
+ return data_->baseline_clusters_.mean();
+
+ return clusters_b[index].median();
+ }
+
+ template <typename L>
+ int
+ line_info<L>::compute_meanline()
+ {
+ mln::util::array< cluster_stats< float > >& clusters_m =
data_->meanline_clusters_.clusters();
+
+ unsigned index = 0;
+ float max_mean = 0.0f;
+ const unsigned clusters_m_nelements = clusters_m.nelements();
+
+ for (unsigned i = 0; i < clusters_m_nelements; ++i)
+ {
+ const unsigned clusters_m_i_nelements = clusters_m[i].nelements();
+
+ if (clusters_m_i_nelements >= max_mean * 2.0f)
+ {
+ max_mean = clusters_m_i_nelements;
+ index = i;
+ }
+ else if (clusters_m_i_nelements >= 0.5f * max_mean)
+ {
+ if (clusters_m[index].median() < clusters_m[i].median())
+ {
+ if (clusters_m_i_nelements > max_mean)
+ max_mean = clusters_m_i_nelements;
+ index = i;
+ }
+ }
+ }
+
+ return clusters_m[index].median();
+ }
+
+ template <typename L>
void
line_info<L>::force_stats_update()
{
@@ -970,8 +1121,8 @@ namespace scribo
typedef mln::value::int_u<12> median_data_t;
typedef mln::accu::stat::median_h<median_data_t> median_t;
median_t
- meanline,
- baseline,
+ // meanline,
+ // baseline,
char_space,
char_width;
@@ -995,6 +1146,10 @@ namespace scribo
mln::def::coord ref_line = mln_max(mln::def::coord);
+ // DEBUG
+ data_->baseline_clusters_.reset();
+ data_->meanline_clusters_.reset();
+
// Find a reference line to compute baselines and other attributes.
// Workaround to avoid overflow with int_u<12> in median accumulators.
//
@@ -1033,23 +1188,22 @@ namespace scribo
// incremented.
++used_comps;
-
// COMPUTE FEATURES DATA
- if (comp_set(c).has_features())
- {
- // Compute boldness
- boldness.take(comp_set(c).features().boldness);
- sum2_boldness += mln::math::sqr<float>(comp_set(c).features().boldness);
-
- // Compute color
- color_red.take(comp_set(c).features().color.red());
- color_green.take(comp_set(c).features().color.green());
- color_blue.take(comp_set(c).features().color.blue());
-
- sum2_red += mln::math::sqr<unsigned>(comp_set(c).features().color.red());
- sum2_green += mln::math::sqr<unsigned>(comp_set(c).features().color.green());
- sum2_blue += mln::math::sqr<unsigned>(comp_set(c).features().color.blue());
- }
+ // if (comp_set(c).has_features())
+ // {
+ // // Compute boldness
+ // boldness.take(comp_set(c).features().boldness);
+ // sum2_boldness += mln::math::sqr<float>(comp_set(c).features().boldness);
+
+ // // Compute color
+ // color_red.take(comp_set(c).features().color.red());
+ // color_green.take(comp_set(c).features().color.green());
+ // color_blue.take(comp_set(c).features().color.blue());
+
+ // sum2_red +=
mln::math::sqr<unsigned>(comp_set(c).features().color.red());
+ // sum2_green +=
mln::math::sqr<unsigned>(comp_set(c).features().color.green());
+ // sum2_blue +=
mln::math::sqr<unsigned>(comp_set(c).features().color.blue());
+ // }
// FIXME: we must guaranty here that the relationship is from
// right to left, otherwise, the space size computed between
@@ -1083,12 +1237,14 @@ namespace scribo
// Meanline (compute an absolute value, from the top left corner
// of the highest character bounding box, excluding
// punctuation).
- meanline.take(bb.pmin().row() - ref_line);
+ // meanline.take(bb.pmin().row() - ref_line);
+ data_->meanline_clusters_.take(bb.pmin().row());
// Baseline (compute an absolute value, from the top left corner
// of the highest character bounding box, excluding
// punctuation).
- baseline.take(bb.pmax().row() - ref_line);
+ // baseline.take(bb.pmax().row() - ref_line);
+ data_->baseline_clusters_.take(bb.pmax().row());
}
// Finalization
@@ -1140,12 +1296,14 @@ namespace scribo
else
data_->char_width_ = char_width.to_result();
- mln::def::coord
- absolute_baseline_r = baseline.to_result() + ref_line,
- absolute_meanline_r = meanline.to_result() + ref_line;
+ // // mln::def::coord
+ // // absolute_baseline_r = baseline.to_result() + ref_line,
+ // // absolute_meanline_r = meanline.to_result() + ref_line;
- data_->baseline_ = absolute_baseline_r;
- data_->meanline_ = absolute_meanline_r;
+ // data_->baseline_ = absolute_baseline_r;
+ // data_->meanline_ = absolute_meanline_r;
+ data_->baseline_ = compute_baseline();
+ data_->meanline_ = compute_meanline();
data_->x_height_ = data_->baseline_ - data_->meanline_ + 1;
data_->d_height_ = data_->baseline_ - bbox.to_result().pmax().row();
data_->a_height_ = data_->baseline_ - bbox.to_result().pmin().row() + 1;
diff --git a/scribo/scribo/core/stats.hh b/scribo/scribo/core/stats.hh
new file mode 100644
index 0000000..570325c
--- /dev/null
+++ b/scribo/scribo/core/stats.hh
@@ -0,0 +1,320 @@
+#ifndef STATS_HH_
+# define STATS_HH_
+
+# include <vector>
+# include <algorithm>
+
+# include <mln/util/array.hh>
+
+using namespace mln;
+
+//---------------------------------------------------------------------------
+// compare_values
+//---------------------------------------------------------------------------
+
+template< typename T >
+struct compare_values
+{
+ bool operator() (const T& lhs,
+ const T& rhs)
+ {
+ return (lhs < rhs);
+ }
+};
+
+
+//---------------------------------------------------------------------------
+// cluster_stats
+//---------------------------------------------------------------------------
+
+template< typename T >
+class cluster_stats
+{
+public:
+ cluster_stats()
+ : mean_needs_update_(false), median_needs_update_(false),
+ variance_needs_update_(false), std_needs_update_(false), size_(0)
+ {
+ }
+
+ cluster_stats(const unsigned size)
+ : mean_needs_update_(false), median_needs_update_(false),
+ variance_needs_update_(false), std_needs_update_(false), size_(0)
+ {
+ data_.reserve(size);
+ }
+
+ void reset()
+ {
+ std::vector< T >& data = data_.hook_std_vector_();
+ data.clear();
+
+ size_ = 0;
+ needs_update();
+ }
+
+ void take(const T& value)
+ {
+ if (not size_)
+ {
+ min_ = value;
+ max_ = value;
+ }
+ else
+ {
+ if (value < min_)
+ min_ = value;
+ else if (value > max_)
+ max_ = value;
+ }
+
+ ++size_;
+ data_.append(value);
+ needs_update();
+ }
+
+ T mean()
+ {
+ if (mean_needs_update_)
+ {
+ mean_ = 0;
+
+ for (unsigned i = 0; i < size_; ++i)
+ mean_ += data_[i];
+
+ mean_ /= size_;
+ mean_needs_update_ = false;
+ }
+
+ return mean_;
+ }
+
+ T median()
+ {
+ if (median_needs_update_)
+ {
+ std::vector< T >& data = data_.hook_std_vector_();
+ std::sort(data.begin(), data.end(), compare_values< T >());
+
+ median_ = data[(size_ - 1) >> 1];
+ median_needs_update_ = false;
+ }
+
+ return median_;
+ }
+
+ T variance()
+ {
+ if (variance_needs_update_)
+ {
+ mean();
+ variance_ = 0;
+
+ for (unsigned i = 0; i < size_; ++i)
+ {
+ const T tmp = mean_ - data_[i];
+
+ variance_ += (tmp * tmp);
+ }
+
+ variance_ /= size_;
+ std_ = sqrt(variance_);
+
+ variance_needs_update_ = false;
+ std_needs_update_ = false;
+ }
+
+ return variance_;
+ }
+
+ T standard_deviation()
+ {
+ if (std_needs_update_)
+ variance();
+
+ return std_;
+ }
+
+ T min() { return min_; }
+ T max() { return max_; }
+
+ void sort()
+ {
+ std::vector< T >& data = data_.hook_std_vector_();
+ std::sort(data.begin(), data.end(), compare_values< T >());
+ }
+
+ unsigned nelements() { return size_; }
+
+ T operator[] (const unsigned index)
+ {
+ return data_[index];
+ }
+
+private:
+ void needs_update()
+ {
+ mean_needs_update_ = true;
+ median_needs_update_ = true;
+ variance_needs_update_ = true;
+ std_needs_update_ = true;
+ }
+
+private:
+ bool mean_needs_update_ : 1;
+ bool median_needs_update_ : 1;
+ bool variance_needs_update_ : 1;
+ bool std_needs_update_ : 1;
+ T mean_;
+ T median_;
+ T min_;
+ T max_;
+ T variance_;
+ T std_;
+ unsigned size_;
+ util::array< T > data_;
+};
+
+//---------------------------------------------------------------------------
+// stats
+//---------------------------------------------------------------------------
+
+template< typename T >
+class stats
+{
+public:
+ stats()
+ : clusters_need_update_(true), data_sorted_(false)
+ {
+ }
+
+ stats(const int size)
+ : clusters_need_update_(true), data_sorted_(false), data_(size)
+ {
+ }
+
+ void reset()
+ {
+ data_.reset();
+ std::vector< cluster_stats< T > >& clusters =
clusters_.hook_std_vector_();
+ clusters.clear();
+ data_sorted_ = false;
+ clusters_need_update_ = true;
+ }
+
+ void take(const T& value)
+ {
+ data_.take(value);
+ clusters_need_update_ = true;
+ }
+
+ T mean()
+ {
+ return data_.mean();
+ }
+
+ T median()
+ {
+ return data_.median();
+ }
+
+ T variance()
+ {
+ return data_.variance();
+ }
+
+ T standard_deviation()
+ {
+ return data_.standard_deviation();
+ }
+
+ T min() { return data_.min(); }
+ T max() { return data_.max(); }
+
+ unsigned nelements() { return data_.nelements(); }
+
+ util::array< cluster_stats< T > >& clusters()
+ {
+ if (clusters_need_update_)
+ {
+ compute_clusters();
+ clusters_need_update_ = false;
+ }
+
+ return clusters_;
+ }
+
+private:
+ void compute_clusters()
+ {
+ std::vector< unsigned > clusters;
+ unsigned cluster_index = 1;
+
+ clusters.reserve(data_.nelements());
+
+ if (not data_sorted_)
+ {
+ data_.sort();
+ data_sorted_ = true;
+ }
+
+ unsigned i = 0;
+ const unsigned nelements = data_.nelements();
+
+ clusters[0] = cluster_index;
+ const T std = data_.standard_deviation();
+
+ for (i = 1; i < nelements - 1; ++i)
+ {
+ const T left_distance = data_[i] - data_[i - 1];
+ const T right_distance = data_[i + 1] - data_[i];
+
+ if (not ((left_distance <= 2 || left_distance < right_distance)
+ && left_distance <= std))
+ ++cluster_index;
+
+ clusters[i] = cluster_index;
+ }
+
+ if (nelements > 1
+ && data_[i] - data_[i - 1] > std)
+ ++cluster_index;
+
+ clusters[i] = cluster_index;
+
+ clusters_.clear();
+ clusters_.reserve(cluster_index);
+ cluster_index = 1;
+
+ i = 0;
+ while (i < nelements)
+ {
+ unsigned tmp = i;
+
+ while (tmp < nelements && clusters[tmp] == clusters[i])
+ ++tmp;
+
+ cluster_stats< T > cluster(tmp - i);
+
+ tmp = i;
+ while (tmp < nelements && clusters[tmp] == clusters[i])
+ {
+ cluster.take(data_[tmp]);
+ ++tmp;
+ }
+
+ clusters_.append(cluster);
+
+ i = tmp;
+ ++cluster_index;
+ }
+ }
+
+private:
+ bool clusters_need_update_ : 1;
+ bool data_sorted_ : 1;
+ cluster_stats< T > data_;
+ util::array< cluster_stats< T > > clusters_;
+};
+
+#endif /* STATS_HH_ */
diff --git a/scribo/scribo/text/look_like_text_lines.hh
b/scribo/scribo/text/look_like_text_lines.hh
index 2ced7ce..80d9995 100644
--- a/scribo/scribo/text/look_like_text_lines.hh
+++ b/scribo/scribo/text/look_like_text_lines.hh
@@ -64,6 +64,22 @@ namespace scribo
inline
bool looks_like_a_text_line(const scribo::line_info<L>& l)
{
+ // Special case for two-letter words
+ if (l.card() == 2)
+ {
+ const float ratio = (float) l.bbox().width() / l.bbox().height();
+
+ if (// Minimal width / height ratio
+ ratio > 0.4f && ratio < 2.0f
+ // Minimal height
+ && l.bbox().height() >= 15
+ // Characters must have approximately the same width
+ && l.chars_same_width())
+ {
+ return true;
+ }
+ }
+
return
l.card() >= 3 // at least 3 components
&& l.bbox().height() > 10 // and minimal height
diff --git a/scribo/scribo/text/merging.hh b/scribo/scribo/text/merging.hh
index 26e2da7..8ee349b 100644
--- a/scribo/scribo/text/merging.hh
+++ b/scribo/scribo/text/merging.hh
@@ -188,21 +188,24 @@ namespace scribo
swap_ordering(l1, l2);
parent[l2] = l1; // The smallest label value is root.
- if (lines(l2).card() > lines(l1).card())
+ line_info<L>& l1_info = lines(l1);
+ line_info<L>& l2_info = lines(l2);
+
+ if (l2_info.card() > l1_info.card())
{
// we transfer data from the largest item to the root one.
- scribo::line_info<L> tmp = lines(l1);
- std::swap(lines(l1), lines(l2));
- lines(l1).fast_merge(tmp);
+ scribo::line_info<L> tmp = l1_info;
+ std::swap(l1_info, l2_info);
+ l1_info.fast_merge(tmp);
// We must set manually the tag for lines(l2) since it is
// not used directly in merge process so its tag cannot be
// updated automatically.
- lines(l2).update_tag(line::Merged);
- lines(l2).set_hidden(true);
+ l2_info.update_tag(line::Merged);
+ l2_info.set_hidden(true);
}
else
- lines(l1).fast_merge(lines(l2));
+ l1_info.fast_merge(l2_info);
// l1's tag is automatically set to line::Needs_Precise_Stats_Update
// l2's tag is automatically set to line::Merged
@@ -226,34 +229,57 @@ namespace scribo
bool between_separators(const scribo::line_info<L>& l1,
const scribo::line_info<L>& l2)
{
- // No separators found in image.
+ // No separators found in image.
mln_precondition(l1.holder().components().has_separators());
- unsigned
- col1 = l1.bbox().pcenter().col(),
- col2 = l2.bbox().pcenter().col();
+ const box2d& l1_bbox = l1.bbox();
+ const box2d& l2_bbox = l2.bbox();
+
+ const unsigned
+ col1 = l1_bbox.pcenter().col(),
+ col2 = l2_bbox.pcenter().col();
const mln_ch_value(L, bool)&
separators = l1.holder().components().separators();
+ // Checking for separators starting from 1 / 4, 3/ 4 and the
+ // center of the box
typedef const bool* sep_ptr_t;
- sep_ptr_t sep_ptr, end;
+ sep_ptr_t sep_ptr, sep_ptr_top, sep_ptr_bottom, end;
if (col1 < col2)
{
- sep_ptr = &separators(l1.bbox().pcenter());
+ const unsigned quarter =
+ ((l1_bbox.pcenter().row() - l1_bbox.pmin().row()) >> 1);
+
+ sep_ptr = &separators(l1_bbox.pcenter());
+ sep_ptr_top = &separators(point2d(l1_bbox.pmin().row() + quarter,
+ l1_bbox.pcenter().col()));
+ sep_ptr_bottom = &separators(point2d(l1_bbox.pmax().row() - quarter,
+ l1_bbox.pcenter().col()));
end = sep_ptr + col2 - col1;
}
else
{
- sep_ptr = &separators(l2.bbox().pcenter());
+ const unsigned quarter =
+ ((l2_bbox.pcenter().row() - l2_bbox.pmin().row()) >> 1);
+
+ sep_ptr = &separators(l2_bbox.pcenter());
+ sep_ptr_top = &separators(point2d(l2_bbox.pmin().row() + quarter,
+ l2_bbox.pcenter().col()));
+ sep_ptr_bottom = &separators(point2d(l2_bbox.pmax().row() - quarter,
+ l2_bbox.pcenter().col()));
end = sep_ptr + col1 - col2;
}
// If sep_ptr is true, then a separator is reached.
- while (!*sep_ptr && sep_ptr != end)
+ while (!*sep_ptr && !*sep_ptr_top && !*sep_ptr_bottom && sep_ptr
!= end)
+ {
++sep_ptr;
+ ++sep_ptr_top;
+ ++sep_ptr_bottom;
+ }
- return *sep_ptr;
+ return (*sep_ptr || *sep_ptr_top || *sep_ptr_bottom);
}
@@ -266,16 +292,78 @@ namespace scribo
*/
template <typename L>
- bool lines_can_merge(const scribo::line_info<L>& l1,
+ bool lines_can_merge(scribo::line_info<L>& l1,
const scribo::line_info<L>& l2)
{
// Parameters.
- const float x_ratio_max = 1.7, baseline_delta_max = 3;
+ const float x_ratio_max = 1.7f;
+ const float baseline_delta_max =
+ 0.5f * std::min(l1.x_height(), l2.x_height());
+
+ const box2d& l1_bbox = l1.bbox();
+ const box2d& l2_bbox = l2.bbox();
+
+ const point2d& l1_pmin = l1_bbox.pmin();
+ const point2d& l2_pmin = l2_bbox.pmin();
+ const point2d& l1_pmax = l1_bbox.pmax();
+ const point2d& l2_pmax = l2_bbox.pmax();
+
+ const bool l1_has_separators = l1.holder().components().has_separators();
+ const bool l1_l2_between_separators = (l1_has_separators) ?
+ between_separators(l1, l2) : false;
+ const float l_ted_cw = l2.char_width();
+
+ const float dx = std::max(l1_pmin.col(), l2_pmin.col())
+ - std::min(l1_pmax.col(), l2_pmax.col());
+ const float dy = std::max(l1_pmin.row(), l2_pmin.row())
+ - std::min(l1_pmax.row(), l2_pmax.row());
+
+ // Particular case of "
+ {
+ if (// Must have 2 characters
+ (l1.card() == 2
+ // The box height must be smaller than the touched line x height
+ && l1_bbox.height() < l2.x_height())
+ // The line must be vertically and horizontally close to
+ // the touched line
+ && (dx < l_ted_cw && dy < 0)
+ // No separator between the two lines
+ && not (l1_l2_between_separators))
+ {
+ // Line is then considered as punctuation
+ l1.update_type(line::Punctuation);
+ return true;
+ }
+ }
+
+ // Particular case like merging between a line and [5]
+ {
+ const mln::def::coord
+ top_row_l2 = l2_pmin.row(),
+ top_row_l1 = l1_pmin.row(),
+ bot_row = l2_pmax.row();
+ const float x1 = l1.x_height(), x2 = l2.x_height();
+ const float x_ratio = std::max(x1, x2) / std::min(x1, x2);
+
+ if (// No separator
+ !l1_l2_between_separators
+ // The x height ration must be lower than 2
+ && (x_ratio < 2.0f)
+ // Baseline alignment
+ && (std::abs(bot_row - l1.baseline()) < baseline_delta_max)
+ // The top of the boxes must be aligned
+ && (std::abs(top_row_l2 - top_row_l1) < 5)
+ // Distance between the line and the touched line.
+ && dx < 5.0f * l_ted_cw)
+ {
+ return true;
+ }
+ }
// Similarity of x_height.
{
- float x1 = l1.x_height(), x2 = l2.x_height();
- float x_ratio = std::max(x1, x2) / std::min(x1, x2);
+ const float x1 = l1.x_height(), x2 = l2.x_height();
+ const float x_ratio = std::max(x1, x2) / std::min(x1, x2);
if (x_ratio > x_ratio_max)
return false;
}
@@ -287,22 +375,21 @@ namespace scribo
}
// left / right
- unsigned
- col1 = l1.bbox().pcenter().col(),
- col2 = l2.bbox().pcenter().col();
+ const unsigned
+ col1 = l1_bbox.pcenter().col(),
+ col2 = l2_bbox.pcenter().col();
if (col1 < col2)
{
- if ((col1 + l1.bbox().width() / 4) >= (col2 - l2.bbox().width() / 4))
+ if ((col1 + l1_bbox.width() / 4) >= (col2 - l2_bbox.width() / 4))
return false;
}
else
- if ((col2 + l2.bbox().width() / 4) >= (col1 - l1.bbox().width() / 4))
+ if ((col2 + l2_bbox.width() / 4) >= (col1 - l1_bbox.width() / 4))
return false;
-
// Check that there is no separator in between.
- if (l1.holder().components().has_separators())
- return ! between_separators(l1, l2);
+ if (l1_has_separators)
+ return ! l1_l2_between_separators;
return true;
}
@@ -310,14 +397,14 @@ namespace scribo
- template <typename L>
- int horizontal_distance(const scribo::line_info<L>& l1,
- const scribo::line_info<L>& l2)
+ inline
+ int horizontal_distance(const box2d& l1,
+ const box2d& l2)
{
- if (l1.bbox().pcenter().col() < l2.bbox().pcenter().col())
- return l2.bbox().pmin().col() - l1.bbox().pmax().col();
+ if (l1.pcenter().col() < l2.pcenter().col())
+ return l2.pmin().col() - l1.pmax().col();
else
- return l1.bbox().pmin().col() - l2.bbox().pmax().col();
+ return l1.pmin().col() - l2.pmax().col();
}
@@ -339,7 +426,7 @@ namespace scribo
*/
template <typename L>
- bool non_text_and_text_can_merge(const scribo::line_info<L>& l_cur, //
current
+ bool non_text_and_text_can_merge(scribo::line_info<L>& l_cur, // current
const scribo::line_info<L>& l_ted) // touched
{
if (l_cur.type() == line::Text || l_ted.type() != line::Text)
@@ -353,22 +440,42 @@ namespace scribo
&& between_separators(l_cur, l_ted))
return false;
+ const box2d& l_cur_bbox = l_cur.bbox();
+ const box2d& l_ted_bbox = l_ted.bbox();
+
+ const point2d& l_cur_pmin = l_cur_bbox.pmin();
+ const point2d& l_ted_pmin = l_ted_bbox.pmin();
+ const point2d& l_cur_pmax = l_cur_bbox.pmax();
+ const point2d& l_ted_pmax = l_ted_bbox.pmax();
+
+ const float dx = std::max(l_cur_pmin.col(), l_ted_pmin.col())
+ - std::min(l_cur_pmax.col(), l_ted_pmax.col());
+ const float l_ted_cw = l_ted.char_width();
+ const float l_ted_x_height = l_ted.x_height();
+
+ const unsigned l_cur_height = l_cur_bbox.height();
+ const unsigned l_cur_width = l_cur_bbox.width();
// General case (for tiny components like --> ',:."; <--):
- if (l_cur.bbox().height() < l_ted.x_height()
- && float(l_cur.bbox().width()) / float(l_cur.card()) <
l_ted.char_width())
+ if (l_cur_height < l_ted_x_height
+ && l_cur_height > 0.05f * l_ted_x_height
+ && float(l_cur_width) / float(l_cur.card()) < l_ted.char_width()
+ && dx < l_ted_cw)
+ {
+ l_cur.update_type(line::Punctuation);
return true;
-
+ }
// Special case for '---':
if (// small height:
- l_cur.bbox().height() < l_ted.x_height()
+ l_cur_height < l_ted_x_height
// // not so long width:
- && l_cur.bbox().width() < 5 * l_ted.char_width()
+ && l_cur_width > 0.8 * l_ted_cw
+ && l_cur_width < 5 * l_ted_cw
// align with the 'x' center:
&& std::abs((l_ted.baseline() + l_ted.meanline()) / 2 -
l_cur.bbox().pcenter().row()) < 7
// tiny spacing:
- && unsigned(horizontal_distance(l_cur, l_ted)) < l_ted.char_width()
+ && unsigned(horizontal_distance(l_cur_bbox, l_ted_bbox)) < 2 * l_ted_cw
)
{
return true;
@@ -378,10 +485,11 @@ namespace scribo
// Special case
// Looking for alignement.
- mln::def::coord
+ const mln::def::coord
top_row = l_cur.bbox().pmin().row(),
bot_row = l_cur.bbox().pmax().row();
+ const box2d& l_ted_ebbox = l_ted.ebbox();
// std::cout << "top_row = " << top_row << " - bot_row
= " << bot_row << std::endl;
// std::cout << std::abs(bot_row - l_ted.baseline())
@@ -402,10 +510,11 @@ namespace scribo
// << std::endl;
if ((std::abs(bot_row - l_ted.baseline()) < 5
- || std::abs(bot_row - l_ted.ebbox().pmax().row()) < 5)
+ || std::abs(bot_row - l_ted_ebbox.pmax().row()) < 5)
&&
(std::abs(top_row - l_ted.meanline()) < 5
- || std::abs(top_row - l_ted.ebbox().pmin().row()) < 5))
+ || std::abs(top_row - l_ted_ebbox.pmin().row()) < 5)
+ && dx < 5.0f * l_ted_cw)
{
return true;
}
@@ -536,35 +645,33 @@ namespace scribo
if (parent[l] != l) // not a root, so has already merged, thus ignore it
continue;
- box2d b = lines(l).bbox();
+ const box2d& b = lines(l).bbox();
- unsigned tl, tr, ml, mc, mr, bl, br;
+ // unsigned tl, tr, ml, mc, mr, bl, br;
+
+ const box2d& b_ = lines(l).ebbox();
+
+ /*
+ tl tr
+ x---------------x
+ | |
+ | mc |
+ ml x x x mr
+ | |
+ | |
+ x---------------x
+ bl br
+
+ */
- {
- box2d b_ = lines(l).ebbox();
-
- /*
- tl tr
- x---------------x
- | |
- | mc |
- ml x x x mr
- | |
- | |
- x---------------x
- bl br
-
- */
-
-
- tl = billboard(b_.pmin());
- tr = billboard.at_(b_.pmin().row(), b_.pmax().col());
- ml = billboard.at_(b_.pcenter().row(), b_.pmin().col());
- mc = billboard.at_(b_.pcenter().row(), b_.pcenter().col());
- mr = billboard.at_(b_.pcenter().row(), b_.pmax().col());
- bl = billboard.at_(b_.pmax().row(), b_.pmin().col());
- br = billboard(b_.pmax());
- }
+
+ const unsigned tl = billboard(b_.pmin());
+ const unsigned tr = billboard.at_(b_.pmin().row(), b_.pmax().col());
+ const unsigned ml = billboard.at_(b_.pcenter().row(), b_.pmin().col());
+ const unsigned mc = billboard.at_(b_.pcenter().row(), b_.pcenter().col());
+ const unsigned mr = billboard.at_(b_.pcenter().row(), b_.pmax().col());
+ const unsigned bl = billboard.at_(b_.pmax().row(), b_.pmin().col());
+ const unsigned br = billboard(b_.pmax());
typedef std::set<unsigned> set_t;
std::set<unsigned> labels;
@@ -596,11 +703,47 @@ namespace scribo
{
// Main case: it is an "included" box (falling in an already drawn box)
- if (lines(l).type() == line::Text) // the current object IS a text line
+ const line_info<L>& l_info = lines(l);
+ const line_info<L>& mc_info = lines(mc);
+
+ if (l_info.type() == line::Text) // the current object IS a text line
{
- if (lines(mc).type() == line::Text) // included in a text line => weird
+ if (mc_info.type() == line::Text) // included in a text line => weird
{
++count_txtline_IN_txtline;
+
+ // Particular case of "
+ // {
+ // if ((lines(l).card() == 2 &&
+ // lines(l).bbox().height() < lines(mc).x_height()) &&
+ // not (lines(l).holder().components().has_separators()
+ // && between_separators(lines(l),
+ // lines(mc))))
+
+ const box2d& l_bbox = l_info.bbox();
+ const box2d& mc_bbox = mc_info.bbox();
+
+ const point2d& l_pmin = l_bbox.pmin();
+ const point2d& mc_pmin = mc_bbox.pmin();
+ const point2d& l_pmax = l_bbox.pmax();
+ const point2d& mc_pmax = mc_bbox.pmax();
+
+ const float dx = std::max(l_pmin.col(), mc_pmin.col())
+ - std::min(l_pmax.col(), mc_pmax.col());
+ const float dy = std::max(l_pmin.row(), mc_pmin.row())
+ - std::min(l_pmax.row(), mc_pmax.row());
+ const float l_ted_cw = mc_info.char_width();
+
+ // We accept a line included into another only if it
+ // is horizontally close to the line's bbox and
+ // vertically aligned
+ // Obviously no separators between the two lines
+ if (dx < l_ted_cw && dy < 0
+ && not (l_info.holder().components().has_separators()
+ && between_separators(l_info, mc_info)))
+ l = do_union(lines, l, mc, parent);
+ // }
+
// std::cout << "weird: inclusion of a txt_line in a txt_line!"
<< std::endl;
/// Merge is perform if the current line is a
@@ -649,15 +792,15 @@ namespace scribo
// could be noise or garbage... So adding new
// criterions could fix this issue.
//
- //if (!non_text_and_text_can_merge(lines(l), lines(mc)))
- // continue;
+ if (!non_text_and_text_can_merge(lines(l), lines(mc)))
+ continue;
// Avoid the case when a large title ebbox overlap
// with a text column. In that case, the title may
// merge with punctuation from the text.
- if (lines(l).holder().components().has_separators()
- && between_separators(lines(l), lines(mc)))
- continue;
+ // if (lines(l).holder().components().has_separators()
+ // && between_separators(lines(l), lines(mc)))
+ // continue;
// Mark current line as punctuation.
lines(l).update_type(line::Punctuation);
@@ -832,9 +975,12 @@ namespace scribo
bool operator()(const scribo::line_id_t& l1, const scribo::line_id_t& l2) const
{
- if (lines_(l1).bbox().nsites() == lines_(l2).bbox().nsites())
+ const unsigned l1_nsites = lines_(l1).bbox().nsites();
+ const unsigned l2_nsites = lines_(l2).bbox().nsites();
+
+ if (l1_nsites == l2_nsites)
return l1 < l2;
- return lines_(l1).bbox().nsites() < lines_(l2).bbox().nsites();
+ return l1_nsites < l2_nsites;
}
scribo::line_set<L> lines_;
diff --git a/scribo/scribo/text/paragraphs.hh b/scribo/scribo/text/paragraphs.hh
new file mode 100644
index 0000000..a3ff802
--- /dev/null
+++ b/scribo/scribo/text/paragraphs.hh
@@ -0,0 +1,1027 @@
+#include <mln/util/array.hh>
+#include <mln/accu/shape/bbox.hh>
+#include <mln/core/image/image2d.hh>
+#include <mln/core/alias/neighb2d.hh>
+#include <mln/draw/box.hh>
+#include <mln/data/convert.hh>
+#include <mln/value/int_u16.hh>
+#include <mln/value/label_16.hh>
+#include <mln/value/int_u8.hh>
+#include <mln/value/rgb8.hh>
+#include <mln/io/ppm/save.hh>
+#include <mln/io/pgm/save.hh>
+#include <mln/geom/rotate.hh>
+#include <mln/literal/colors.hh>
+
+#include <scribo/core/macros.hh>
+#include <scribo/core/line_set.hh>
+#include <scribo/core/line_info.hh>
+
+using namespace mln;
+
+namespace scribo
+{
+
+ namespace internal
+ {
+
+//-------------------------------------
+// Extracting root of links
+//-------------------------------------
+ template <typename T>
+ inline
+ unsigned
+ find_root(util::array<T>& parent, unsigned x)
+ {
+ unsigned tmp_x = x;
+
+ while (parent(tmp_x) != tmp_x)
+ tmp_x = parent(tmp_x);
+
+ while (parent(x) != x)
+ {
+ const unsigned tmp = parent(x);
+ x = parent(x);
+ parent(tmp) = tmp_x;
+ }
+
+ return x;
+ }
+ }
+
+ namespace filter
+ {
+
+//---------------------------------------------------------------------
+// This method aims to cut the links between lines that do not fit the
+// different criteria
+//---------------------------------------------------------------------
+
+ template <typename L>
+ inline
+ void paragraph_links(const util::array<value::int_u16>& left,
+ const util::array<value::int_u16>& right,
+ util::array<value::int_u16>& output,
+ const util::array< line_info<L> >& lines,
+ const image2d<bool>& input)
+ {
+ output = left;
+
+ const unsigned nlines = lines.nelements();
+
+ // image2d<value::rgb8> links = data::convert(value::rgb8(), input);
+ // for (unsigned l = 0; l < nlines; ++l)
+ // {
+ // mln::draw::line(links, lines(l).bbox().pcenter(),
lines(left(l)).bbox().pcenter(), literal::red);
+ // }
+ // mln::io::ppm::save(links, "out_links.ppm");
+
+ // For each line
+ for (unsigned l = 0; l < nlines; ++l)
+ {
+ // Neighbors
+
+ const value::int_u16 left_nbh = output(l);
+ const value::int_u16 right_nbh = right(l);
+ const value::int_u16 lol_nbh = output(left_nbh);
+
+ // Line features
+ const float x_height = lines(l).x_height();
+ const float left_x_height = lines(left_nbh).x_height();
+ const float right_x_height = lines(right_nbh).x_height();
+
+ const box2d& left_line_bbox = lines(left_nbh).bbox();
+ const box2d& current_line_bbox = lines(l).bbox();
+ const box2d& right_line_bbox = lines(right_nbh).bbox();
+ const box2d& lol_line_bbox = lines(lol_nbh).bbox(); // lol : left neighbor of the
left neighbor
+
+ const int lline_col_min = left_line_bbox.pmin().col();
+ const int cline_col_min = current_line_bbox.pmin().col();
+ const int rline_col_min = right_line_bbox.pmin().col();
+ const int lolline_col_min = lol_line_bbox.pmin().col();
+
+ const int lline_col_max = left_line_bbox.pmax().col();
+ const int cline_col_max = current_line_bbox.pmax().col();
+ const int rline_col_max = right_line_bbox.pmax().col();
+
+ const int lline_cw = lines(left_nbh).char_width();
+ const int cline_cw = lines(l).char_width();
+ const int rline_cw = lines(right_nbh).char_width();
+ // Maximal x variation to consider two lines vertically aligned
+ const int delta_alignment = cline_cw;
+
+ // Checks the baseline distances of the two neighbors
+ {
+ // Current line baseline
+ const int c_baseline = lines(l).baseline();
+
+ // Baseline distance with the left and right neighbors
+ const int lc_baseline = lines(left_nbh).baseline() - c_baseline;
+ const int rc_baseline = c_baseline -lines(right_nbh).baseline();
+
+ // Max baseline distance between the two neighbors
+ // const float delta_baseline_max = std::max(lc_baseline, rc_baseline);
+ // const float delta_baseline_min = std::min(lc_baseline,
+ // rc_baseline);
+
+ // Only two lines, meaning the current line has only one neighbor
+ bool two_lines = false;
+
+ // If the current line has no left neighbor
+ if (lc_baseline == 0)
+ {
+ // ror : right neighbor of the right neighbor
+ const value::int_u16 ror_nbh = right(right_nbh);
+ const box2d& ror_line_bbox = lines(ror_nbh).bbox();
+
+ // If the current line has a ror
+ if (ror_nbh != right_nbh
+ && output(ror_nbh) == right_nbh)
+ {
+ // Distance between the current line and the right neighbor
+ const float right_distance =
+ current_line_bbox.pcenter().row() - right_line_bbox.pcenter().row();
+ // Distance between the right neighbor and the ror
+ const float ror_distance =
+ right_line_bbox.pcenter().row() - ror_line_bbox.pcenter().row();
+ // ror x_height
+ const float ror_x_height = lines(ror_nbh).x_height();
+
+ // Conditions to cut the link between the current line
+ // and its right neighbor
+ if (right_distance > 1.4f * ror_distance
+ && std::max(ror_x_height, right_x_height) <
+ 1.2f * std::min(ror_x_height, right_x_height)
+ && output(right_nbh) == l)
+ {
+ output(right_nbh) = right_nbh;
+ continue;
+ }
+ }
+ // Otherwise we only have a group of two lines
+ else
+ {
+ // We determine the distance between the two lines
+ const float distance = lines(l).meanline() - lines(right_nbh).baseline();
+ two_lines = true;
+
+ // If the distance between the two lines is greater than
+ // the minimum x height of the two lines then we cut the
+ // link between them
+ if (distance > std::min(x_height, right_x_height)
+ && output(right_nbh) == l)
+ {
+ output(right_nbh) = right_nbh;
+ continue;
+ }
+ }
+
+ // Lines features
+ const float min_x_height = std::min(x_height, right_x_height);
+ const float max_x_height = std::max(x_height, right_x_height);
+ const float min_char_width = std::min(rline_cw, cline_cw);
+ const float max_char_width = std::max(rline_cw, cline_cw);
+
+ // Condition to cut the link between the current line and
+ // its right neighbor
+ if ((max_x_height > min_x_height * 1.2f) &&
+ !(max_char_width <= 1.2f * min_char_width))
+ {
+ if (output(right_nbh) == l)
+ {
+ output(right_nbh) = right_nbh;
+ continue;
+ }
+ }
+
+ // If we only have two lines we stop the study
+ if (two_lines)
+ continue;
+ }
+ // If the current line has no right neighbor
+ else if (rc_baseline == 0)
+ {
+ // lol : left neighbor of the left neighbor
+
+ // If the left neighbor of the current line has a left neighbor
+ if (lol_nbh != left_nbh)
+ {
+ // Distance between the current line and its left neighbor
+ const float left_distance =
+ left_line_bbox.pcenter().row() - current_line_bbox.pcenter().row();
+ // Distance between the left neighbor and the left
+ // neighbor of its left neighbor
+ const float lol_distance =
+ lol_line_bbox.pcenter().row() - left_line_bbox.pcenter().row();
+ // lol x height
+ const float lol_x_height = lines(lol_nbh).x_height();
+
+ // Conditions to cut the link between the current line
+ // and its left neighbor
+ if (left_distance > 1.4f * lol_distance
+ && std::max(lol_x_height, left_x_height) <
+ 1.2f * std::min(lol_x_height, left_x_height))
+ {
+ output(l) = l;
+ continue;
+ }
+ }
+ // Otherwise we only have a group of two lines
+ else
+ {
+ // Distance between the current line and it left neighbor
+ const float distance = lines(left_nbh).meanline() -
+ lines(l).baseline();
+
+ two_lines = true;
+
+ // If the distance is greater than the min x height
+ // between the two lines
+ if (distance > std::min(x_height, left_x_height))
+ {
+ output(l) = l;
+ continue;
+ }
+ }
+
+ // Lines features
+ const float min_x_height = std::min(x_height, left_x_height);
+ const float max_x_height = std::max(x_height, left_x_height);
+ const float min_char_width = std::min(lline_cw, cline_cw);
+ const float max_char_width = std::max(lline_cw, cline_cw);
+
+ // Condition to cut the link between the current line and
+ // its left neighbor
+ if ((max_x_height > min_x_height * 1.2f) &&
+ !(max_char_width <= 1.2f * min_char_width))
+ {
+ output(l) = l;
+ continue;
+ }
+
+ // If we only have two lines we stop the study
+ if (two_lines)
+ continue;
+ }
+ // The current line has at least one left and one right neighbor
+ else // if (delta_baseline_max >= delta_baseline_min)
+ {
+ // Distance between the left and the current line
+ const float left_distance =
+ lines(left_nbh).meanline() - lines(l).baseline();
+ // Distance between the right and the current line
+ const float right_distance =
+ lines(l).meanline() - lines(right_nbh).baseline();
+
+ // If the left line is too far compared to the right one
+ // we cut the link with it
+ if (left_distance > 1.2f * right_distance
+ && std::max(x_height, left_x_height) > 1.2f * std::min(x_height,
left_x_height))
+ {
+ output(l) = l;
+ continue;
+ }
+ // If the right line is too far compared to the left one
+ // we cut the link with it
+ else if (right_distance > 1.2f * left_distance
+ && std::max(x_height, right_x_height) > 1.2f * std::min(x_height,
right_x_height)
+ && output(right_nbh) == l)
+ {
+ output(right_nbh) = right_nbh;
+ continue;
+ }
+
+ // If the distance between the baseline of the left
+ // neighbor and the baseline of the current line is
+ // greater than the one between the current line baseline
+ // and the right line baseline we have to study the texte
+ // features of the right and left lines
+ if (lc_baseline > rc_baseline)
+ {
+ const float cw_max = std::max(lline_cw, cline_cw);
+ const float cw_min = std::min(lline_cw, cline_cw);
+ const float min_x_height = std::min(x_height, left_x_height);
+ const float max_x_height = std::max(x_height, left_x_height);
+
+ if ((max_x_height > min_x_height * 1.2f) &&
+ !(cw_max <= 1.2f * cw_min))
+ {
+ output(l) = l;
+ continue;
+ }
+
+ {
+ const float min_x_height = std::min(x_height, right_x_height);
+ const float max_x_height = std::max(x_height, right_x_height);
+ const float cw_max = std::max(rline_cw, cline_cw);
+ const float cw_min = std::min(rline_cw, cline_cw);
+
+ if ((max_x_height > min_x_height * 1.2f)
+ && !(cw_max <= 1.2f * cw_min)
+ && output(right_nbh) == l)
+ {
+ output(right_nbh) = right_nbh;
+ continue;
+ }
+ }
+ }
+ else
+ {
+ const float cw_max = std::max(rline_cw, cline_cw);
+ const float cw_min = std::min(rline_cw, cline_cw);
+ const float min_x_height = std::min(x_height, right_x_height);
+ const float max_x_height = std::max(x_height, right_x_height);
+
+ if ((max_x_height > min_x_height * 1.2f)
+ && !(cw_max <= 1.2f * cw_min)
+ && output(right_nbh) == l)
+ {
+ output(right_nbh) = right_nbh;
+ continue;
+ }
+
+ {
+ const float min_x_height = std::min(x_height, left_x_height);
+ const float max_x_height = std::max(x_height, left_x_height);
+ const float cw_max = std::max(lline_cw, cline_cw);
+ const float cw_min = std::min(lline_cw, cline_cw);
+
+ if ((max_x_height > min_x_height * 1.2f)
+ && !(cw_max <= 1.2f * cw_min))
+ {
+ output(l) = l;
+ continue;
+ }
+ }
+ }
+ }
+ }
+
+ // If we arrive here, it means than the lines in the
+ // neighborhood of the current line are quite similar. We can
+ // then begin to study the indentations in order to determine
+ // the beginning of new paragraphs
+
+//-----------------------------------------------------------------------------
+// ___________________________
+// |___________________________|
+// ________________________
+// |________________________|
+// ___________________________
+// |___________________________|
+// ___________________________
+// |___________________________|
+//
+// Simple case : paragraphs are justified on the left. We try to find any
+// indentation like above.
+//
+//-----------------------------------------------------------------------------
+
+ {
+ // Check if the current line neighbors are aligned
+ bool left_right_aligned = false;
+ bool left_lol_aligned = false;
+ const int dx_lr = std::abs(lline_col_min - rline_col_min);
+ const int dx_llol = std::abs(lline_col_min - lolline_col_min);
+
+ if (dx_lr < delta_alignment)
+ left_right_aligned = true;
+
+ if (dx_llol < delta_alignment)
+ left_lol_aligned = true;
+
+ if (left_right_aligned && left_lol_aligned)
+ {
+ const int left_right_col_min = std::min(lline_col_min, rline_col_min);
+ const int dx_lrc = std::abs(left_right_col_min - cline_col_min);
+ const float l_char_width = 1.5f * lines(l).char_width();
+
+ if (dx_lrc > l_char_width &&
+ dx_lrc < 3.0f * l_char_width &&
+ cline_col_min > rline_col_min &&
+ cline_col_min > lline_col_min)
+ {
+ output(right_nbh) = right_nbh;
+ continue;
+ }
+ }
+ }
+
+//-----------------------------------------------------------------------------
+// ___________________________
+// |___________________________|
+// ___________________
+// |___________________| End of the paragraph - Current line
+// ________________________
+// |________________________| Beginning of a new one
+// ___________________________
+// |___________________________| Left of left of current line
+//
+// End of paragraph case : we try to find an end to the current paragraph
+//
+//-----------------------------------------------------------------------------
+
+ {
+ // Check if the current line neighbors are aligned
+ bool left_right_max_aligned = false;
+ bool left_current_min_aligned = false;
+ bool lol_current_min_aligned = false;
+ const bool lol_is_left = output(left_nbh) == left_nbh;
+ const int dx_lr_max = std::abs(lline_col_max - rline_col_max);
+ const int dx_lc_min = std::abs(lline_col_min - cline_col_min);
+ const int dx_lolc_min = std::abs(lolline_col_min - cline_col_min);
+
+ if (dx_lr_max < delta_alignment)
+ left_right_max_aligned = true;
+
+ if (dx_lc_min < delta_alignment)
+ left_current_min_aligned = true;
+
+ if (dx_lolc_min < delta_alignment)
+ lol_current_min_aligned = true;
+
+ if (!left_current_min_aligned && left_right_max_aligned &&
+ (lol_current_min_aligned || lol_is_left))
+ {
+ const int dx_lrc = std::abs(lline_col_max - cline_col_max);
+ const int l_char_width = lines(l).char_width();
+
+ if (dx_lrc > l_char_width &&
+ cline_col_max < lline_col_max &&
+ cline_col_min < lline_col_min &&
+ (lline_col_min > lolline_col_min || lol_is_left))
+ {
+ output(l) = l;
+ continue;
+ }
+ }
+ }
+
+
+//-----------------------------------------------------------------------------
+// ___________________________
+// |___________________________|
+// ___________________________
+// |___________________________|
+// ________________________
+// |________________________|
+//
+// Simple case : paragraphs are justified on the left. We try to find any
+// indentation like above at the end of a column.
+//
+//-----------------------------------------------------------------------------
+
+ if (left_nbh == l)
+ {
+ const value::int_u16 ror_nbh = right(right_nbh);
+ const box2d& ror_line_bbox = lines(ror_nbh).bbox();
+ const int rorline_col_min = ror_line_bbox.pmin().col();
+
+ bool right_ror_min_aligned = false;
+ const int dx_rror_min = std::abs(rline_col_min - rorline_col_min);
+
+ if (dx_rror_min < delta_alignment)
+ right_ror_min_aligned = true;
+
+ if (right_ror_min_aligned)
+ {
+ const int right_ror_col_min = std::min(rline_col_min, rorline_col_min);
+ const int dx_rrorc = std::abs(right_ror_col_min - cline_col_min);
+ const float l_char_width = 1.5f * lines(l).char_width();
+
+ if (dx_rrorc > l_char_width &&
+ dx_rrorc < 3.0f * l_char_width &&
+ cline_col_min > rline_col_min &&
+ cline_col_max >= rline_col_max)
+ {
+ output(right_nbh) = right_nbh;
+ continue;
+ }
+ }
+ }
+ }
+
+
+ // Only debug
+
+ {
+ image2d<value::rgb8> debug = data::convert(value::rgb8(), input);
+
+ for (unsigned i = 0; i < output.nelements(); ++i)
+ output(i) = internal::find_root(output, i);
+
+ mln::util::array<accu::shape::bbox<point2d> > nbbox(output.nelements());
+ for (unsigned i = 0; i < nlines; ++i)
+ {
+ // if (lines(i).is_textline())
+ // {
+ // mln::draw::box(debug, lines(i).bbox(), literal::red);
+ nbbox(output(i)).take(lines(i).bbox());
+ // }
+ }
+
+ for (unsigned i = 0; i < nbbox.nelements(); ++i)
+ if (nbbox(i).is_valid())
+ {
+ box2d b = nbbox(i).to_result();
+ mln::draw::box(debug, b, literal::orange);
+ b.enlarge(1);
+ mln::draw::box(debug, b, literal::orange);
+ b.enlarge(1);
+ mln::draw::box(debug, b, literal::orange);
+ }
+
+ mln::io::ppm::save(debug, "out_paragraph.ppm");
+ }
+
+ }
+ }
+
+//-------------------------------------------------------------
+// Preparation of the lines before linking them.
+// For each line we draw the top and the bottom of it.
+// Assuming than i is the number of the line. Then the top of the line
+// will be affected with the value 2 * i in the block image and the
+// bottom with 2 * i + 1.
+//
+//-------------------------------------------------------------
+
+ template <typename L>
+ inline
+ void prepare_lines(const box2d& domain,
+ const util::array< line_info<L> >& lines,
+ image2d<value::int_u16>& blocks,
+ util::array<box2d>& rbbox)
+ {
+ std::map< int, std::vector< const box2d* > > drawn_lines;
+ const unsigned nlines = lines.nelements();
+
+ // For each line
+ for (unsigned l = 0; l < nlines; ++l)
+ {
+ // Rotation of the bounding box
+ box2d b = geom::rotate(lines(l).bbox(), -90, domain.pcenter());
+ rbbox.append(b);
+
+ const unsigned index = l + 1;
+ const unsigned even_index = 2 * index;
+ const unsigned odd_index = even_index + 1;
+
+ // Top of the line
+ {
+ bool not_finished = true;
+ int col_offset = 0;
+
+ while (not_finished)
+ {
+ // Looking for a column in the image to draw the top of the
+ // line
+
+ const int col = b.pmax().col() + col_offset;
+ std::map< int, std::vector< const box2d* > >::iterator it
+ = drawn_lines.find(col);
+
+ if (it != drawn_lines.end())
+ {
+ const std::vector< const box2d* >& lines = (*it).second;
+ const unsigned nb_lines = lines.size();
+ unsigned i = 0;
+
+ for (i = 0; i < nb_lines; ++i)
+ {
+ const box2d* box = lines[i];
+ const int min_row = std::max(b.pmin().row(), box->pmin().row());
+ const int max_row = std::min(b.pmax().row(), box->pmax().row());
+
+ if (min_row - max_row <= 0)
+ break;
+ }
+
+ if (i == nb_lines)
+ {
+ mln::draw::line(blocks, point2d(b.pmin().row(), col),
+ point2d(b.pmax().row(), col), even_index);
+ not_finished = false;
+ drawn_lines[col].push_back(&(rbbox[l]));
+ }
+ else
+ ++col_offset;
+ }
+ else
+ {
+ mln::draw::line(blocks, point2d(b.pmin().row(), col),
+ point2d(b.pmax().row(), col), even_index);
+ not_finished = false;
+ drawn_lines[col].push_back(&(rbbox[l]));
+ }
+ }
+ }
+
+ // Bottom of the line
+ {
+ bool not_finished = true;
+ int col_offset = 0;
+
+ while (not_finished)
+ {
+ // Looking for a column in the image to draw the bottom of
+ // the line
+
+ const int col = b.pmin().col() - col_offset;
+ std::map< int, std::vector< const box2d* > >::iterator it
+ = drawn_lines.find(col);
+
+ if (it != drawn_lines.end())
+ {
+ const std::vector< const box2d* >& lines = (*it).second;
+ const unsigned nb_lines = lines.size();
+ unsigned i = 0;
+
+ for (i = 0; i < nb_lines; ++i)
+ {
+ const box2d* box = lines[i];
+ const int min_row = std::max(b.pmin().row(), box->pmin().row());
+ const int max_row = std::min(b.pmax().row(), box->pmax().row());
+
+ if (min_row - max_row <= 0)
+ break;
+ }
+
+ if (i == nb_lines)
+ {
+ mln::draw::line(blocks, point2d(b.pmin().row(), col),
+ point2d(b.pmax().row(), col), odd_index);
+ not_finished = false;
+ drawn_lines[col].push_back(&(rbbox[l]));
+ }
+ else
+ ++col_offset;
+ }
+ else
+ {
+ mln::draw::line(blocks, point2d(b.pmin().row(), col),
+ point2d(b.pmax().row(), col), odd_index);
+ not_finished = false;
+ drawn_lines[col].push_back(&(rbbox[l]));
+ }
+ }
+ }
+ }
+ }
+
+ template <typename L>
+ inline
+ void
+ process_left_link(image2d<value::int_u16>& blocks,
+ const util::array<box2d>& rbbox,
+ const util::array< line_info<L> >& lines,
+ util::array<value::int_u16>& left)
+ {
+ typedef value::int_u16 V;
+
+ // At the beginning each line is its own neighbor
+ for (unsigned i = 0; i < left.nelements(); ++i)
+ left(i) = i;
+
+ const unsigned nlines = lines.nelements();
+
+ // For each line
+ for (unsigned i = 0; i < nlines; ++i)
+ {
+ // Max distance for the line search
+ int dmax = 1.5f * lines(i).x_height();
+
+ // Starting points in the current line box
+ point2d c = rbbox(i).pcenter();
+ point2d q(rbbox(i).pmin().row() + ((c.row() - rbbox(i).pmin().row()) / 4),
c.col());
+
+ int
+ midcol = (rbbox(i).pmax().col()
+ - rbbox(i).pmin().col()) / 2;
+
+ // Left
+ {
+ // marge gauche
+ int
+ nleftima = c.col() - blocks.domain().pmin().col(),
+ // Distance gauche
+ nleft = std::min(nleftima, midcol + dmax);
+
+ V
+ // Starting points in the box
+ *p = &blocks(c),
+ *p2 = &blocks(q),
+ // End of search
+ *pstop = p - nleft - 1,
+ // Line neighbor
+ *nbh_p = 0;
+
+ // While we haven't found a neighbor or reached the limit
+ for (; p != pstop; --p, --p2)
+ {
+ if (*p2 != literal::zero // Not the background
+ && ((*p2 % 2) == 0) // Looking for the bottom of a line
+ && left((*p2 >> 1) - 1) != i) // No loops
+ {
+ // Neightbor found, we stop the research
+ nbh_p = p2;
+ break;
+ }
+
+ if (*p != literal::zero // Not the background
+ && ((*p % 2) == 0) // Looking for the bottom of a line
+ && left((*p >> 1) - 1) != i) // No loops
+ {
+ // Neightbor found, we stop the research
+ nbh_p = p;
+ break;
+ }
+ }
+
+ // If a neighbor was found, then we have found the top of the
+ // line. We are then looking for the bottom of the encountered
+ // line. If during the search process we find a complete line
+ // included in the touched line, this line is considered as
+ // the neighbor under certain conditions (see below)
+
+ //---------------------------------------------------------------
+ // _________________________ |
+ // |_________________________| => Current line | Search direction
+ // v
+ // => First encountered top line
+ // __________________________________________________ 2Q
+ // | Q |
+ // | _________________________ |2P
+ // | |_____________P___________| => Second top |2P + 1
+ // | line |
+ // |__________________________________________________|2Q + 1
+ //
+ //
+ //---------------------------------------------------------------
+
+ if (nbh_p)
+ {
+ std::vector<V> lines_nbh;
+ const V end_p = *nbh_p + 1;
+ const V* nbh_p_copy = nbh_p;
+
+ for (; *nbh_p != end_p; --nbh_p)
+ {
+ if ((*nbh_p) != literal::zero) // Not the background
+ {
+ if ((*nbh_p) % 2 == 0)// We have found the top of
+ // another line
+ lines_nbh.push_back(*nbh_p);
+ else
+ {
+ // We have found the bottom of a line. We are looking if
+ // we have already encountered the top of this
+ // line. If so, we link the current line with this one
+ // under certain conditions:
+
+ if (std::find(lines_nbh.begin(), lines_nbh.end(),
+ (*nbh_p) - 1) != lines_nbh.end())
+ {
+ // If we can link the complete line with the current line
+ if (// It must be in the search range
+ nbh_p > pstop
+ // Avoid loops
+ && left(((*nbh_p - 1) >> 1) - 1) != i)
+ left(i) = ((*nbh_p - 1) >> 1) - 1;
+
+ // We have found a complete line so we stop the search
+ break;
+ }
+ }
+ }
+ }
+
+
+ // If we haven't found any included line in the first
+ // neighbor, then the line is considered as the neighbor of
+ // the current line
+ if (*nbh_p == end_p)
+ left(i) = (*nbh_p_copy >> 1) - 1;
+ }
+ }
+ }
+ }
+
+
+ // We assume that the lines have been rotated
+ template <typename L>
+ inline
+ void
+ process_right_link(image2d<value::int_u16>& blocks,
+ const util::array<box2d>& rbbox,
+ const util::array< line_info<L> >& lines,
+ util::array<value::int_u16>& right)
+ {
+ typedef value::int_u16 V;
+
+ // At the beginning each line is its own neighbor
+ for (unsigned i = 0; i < right.nelements(); ++i)
+ right(i) = i;
+
+ const unsigned nlines = lines.nelements();
+
+ // For each line
+ for (unsigned i = 0; i < nlines; ++i)
+ {
+ // Max distance for the line search
+ int dmax = 1.5f * lines(i).x_height();
+
+ // Starting points in the current line box
+ point2d c = rbbox(i).pcenter();
+ point2d q(rbbox(i).pmax().row() - ((rbbox(i).pmax().row() - c.row()) / 4),
c.col());
+
+ int
+ midcol = (rbbox(i).pmax().col()
+ - rbbox(i).pmin().col()) / 2;
+
+ // Right
+ {
+ int
+ nrightima = geom::ncols(blocks) - c.col() + blocks.domain().pmin().col(),
+ nright = std::min(nrightima, midcol + dmax);
+
+ V
+ // Starting points in the box
+ *p = &blocks(c),
+ *p2 = &blocks(q),
+ // End of search
+ *pstop = p + nright - 1,
+ // Line neighbor
+ *nbh_p = 0;
+
+ // While we haven't found a neighbor or reached the limit
+ for (; p != pstop; ++p, ++p2)
+ {
+ if (*p2 != literal::zero // Not the background
+ && ((*p2 % 2) == 1) // Looking for the bottom of a line
+ && right(((*p2 - 1) >> 1) - 1) != i) // No loops
+ {
+ // Neightbor found, we stop the research
+ nbh_p = p2;
+ break;
+ }
+
+ if (*p != literal::zero // Not the background
+ && ((*p % 2) == 1) // Looking for the bottom of a line
+ && right(((*p - 1) >> 1) - 1) != i) // No loops
+ {
+ // Neightbor found, we stop the research
+ nbh_p = p;
+ break;
+ }
+ }
+
+ // If a neighbor was found, then we have found the bottom of the
+ // line. We are then looking for the top of the encountered
+ // line. If during the search process we find a complete line
+ // included in the touched line, this line is considered as
+ // the neighbor under certain conditions (see below)
+
+ //---------------------------------------------------------------
+ //
+ //
+ // __________________________________________________ 2Q
+ // | Q |
+ // | _________________________ |2P
+ // | |_____________P___________| => Second bottom |2P + 1
+ // | line |
+ // |__________________________________________________|2Q + 1
+ // => First encountered bottom line
+ // _________________________ ^
+ // |_________________________| => Current line | Search direction
+ // |
+ //---------------------------------------------------------------
+
+ if (nbh_p)
+ {
+ std::vector<V> lines_nbh;
+ const V end_p = *nbh_p - 1;
+ const V* nbh_p_copy = nbh_p;
+
+ for (; *nbh_p != end_p; ++nbh_p)
+ {
+ if (*nbh_p != literal::zero) // Not the background
+ {
+ if (*nbh_p % 2 == 1) // We have found the bottom of
+ // another line
+ lines_nbh.push_back(*nbh_p);
+ else
+ {
+ // We have found the top of a line. We are looking if
+ //we have already encountered the bottom of this
+ // line. If so, we link the current line with this one
+ // under certain conditions:
+
+ if (std::find(lines_nbh.begin(), lines_nbh.end(),
+ *nbh_p + 1) != lines_nbh.end())
+ {
+ // If we can link the complete line with the current line
+ if (// It must be in the search range
+ nbh_p < pstop
+ // Avoid loops
+ && right((*nbh_p >> 1) - 1) != i)
+ right(i) = (*nbh_p >> 1) - 1;
+
+ // We have found a complete line, so we stop the search
+ break;
+ }
+ }
+ }
+ }
+
+ // If we haven't found any included line in the first
+ // neighbor, then the line is considered as the neighbor of
+ // the current line
+
+ if (*nbh_p == end_p)
+ right(i) = ((*nbh_p_copy - 1) >> 1) - 1;
+ }
+ }
+ }
+ }
+
+//-----------------------------------------------------------------------
+// Finalizing the links by merging information extracted from the left
+// and right links
+//-----------------------------------------------------------------------
+
+ template< typename L >
+ inline
+ void finalize_links(util::array<value::int_u16>& left,
+ util::array<value::int_u16>& right,
+ const util::array< line_info<L> >& lines)
+ {
+ const unsigned nlines = lines.nelements();
+
+ for (unsigned i = 0; i < nlines; ++i)
+ {
+ const unsigned left_value = left(i);
+ const unsigned right_value = right(i);
+
+ // If the right neighbor of my left neighbor is itself then its
+ // right neighbor is me
+ {
+ value::int_u16& v = right(left_value);
+
+ if (v == left_value)
+ v = i;
+ }
+
+ // If the left neighbor of my right neighbor is itself then its
+ // left neighbor is me
+ {
+ value::int_u16& v = left(right_value);
+
+ if (v == right_value)
+ v = i;
+ }
+ }
+ }
+
+ template <typename L>
+ inline
+ void extract_paragraphs(line_set<L>& lines,
+ const image2d<bool>& input)
+ {
+ typedef value::int_u16 V;
+
+ image2d<V> blocks(geom::rotate(input.domain(), -90,
input.domain().pcenter()));
+ data::fill(blocks, 0);
+
+ util::array< line_info<L> > lines_info;
+
+ for_all_lines(l, lines)
+ {
+ if (lines(l).is_textline())
+ lines_info.append(lines(l));
+ }
+
+ const unsigned nlines = lines_info.nelements();
+ util::array<box2d> rbbox;
+ util::array<V> left(nlines);
+ util::array<V> right(nlines);
+ util::array<V> output;
+
+ rbbox.reserve(nlines);
+ output.reserve(nlines);
+
+ std::cout << "Preparing lines" << std::endl;
+ prepare_lines(input.domain(), lines_info, blocks, rbbox);
+// io::pgm::save(blocks, "blocks.pgm");
+ std::cout << "Linking left" << std::endl;
+ process_left_link(blocks, rbbox, lines_info, left);
+ std::cout << "Linking right" << std::endl;
+ process_right_link(blocks, rbbox, lines_info, right);
+ std::cout << "Finalizing links" << std::endl;
+ finalize_links(left, right, lines_info);
+ // std::cout << "Finalizing merging" << std::endl;
+ // finalize_line_merging(left, right, lines);
+ std::cout << "Extracting paragraphs" << std::endl;
+ filter::paragraph_links(left, right, output, lines_info, input);
+ }
+}
--
1.5.6.5