last-svn-commit-885-g33e4509 Better paragraph detection and fix some bugs

--- scribo/sandbox/raphael/code/my/document/clean.hh | 138 +++++++++++++++---- .../sandbox/raphael/code/my/document/document.hh | 148 ++++++++++++++++++- scribo/sandbox/raphael/code/my/document/letters.hh | 17 +++ scribo/sandbox/raphael/code/my/util/union.hh | 24 +++- scribo/sandbox/raphael/code/test.cc | 19 ++- 5 files changed, 303 insertions(+), 43 deletions(-) create mode 100644 scribo/sandbox/raphael/code/my/document/letters.hh diff --git a/scribo/sandbox/raphael/code/my/document/clean.hh b/scribo/sandbox/raphael/code/my/document/clean.hh index 18669d3..56c7445 100644 --- a/scribo/sandbox/raphael/code/my/document/clean.hh +++ b/scribo/sandbox/raphael/code/my/document/clean.hh @@ -315,30 +315,33 @@ namespace mymln { typedef vertex_image<point2d,bool> v_ima_g; typedef p_vertices<mln::util::graph, fun::i2v::array<mln::point2d> > g_vertices_p; - v_ima_g mask = doc.fun_mask_letters(); + v_ima_g mask = doc.fun_mask_all_letters(); mln_piter_(v_ima_g) v(mask.domain()); typedef graph_elt_neighborhood_if<mln::util::graph, g_vertices_p, v_ima_g> nbh_t; nbh_t nbh(mask); mln_niter_(nbh_t) q(nbh, v); for_all(v) { - if(doc.contain_letter(v)) - { if(doc.contain_line(v)) { for_all(q) { - if(doc.allign_V(q,v) && doc.allign_size(q, v) && doc.allign_proximity(q,v)) - { - doc.add_to_line_link(v, q); - } - else if(doc.allign_size_height_line(q,v) && doc.allign_proximity_line(q,v) && doc.allign_V_line(q, v)) + if(doc.contain_line(q)) { - doc.add_to_line_link(v, q); + if(doc.allign_V(q,v) && doc.allign_size(q, v) && doc.allign_proximity(q,v)) + { + doc.add_to_line_link(v, q); + } + else if(doc.allign_size_height_line(q,v)) + { + if(doc.allign_proximity_line(q,v) && doc.allign_V_line(q, v)) + { + doc.add_to_line_link(v, q); + } + } } } } - } } doc.propage_line_link(); } @@ -352,7 +355,7 @@ namespace mymln #endif typedef vertex_image<point2d,bool> v_ima_g; typedef p_vertices<mln::util::graph, fun::i2v::array<mln::point2d> > g_vertices_p; - v_ima_g mask = doc.fun_mask_start_lines(); + 
v_ima_g mask = doc.fun_mask_start_end_lines(); mln_piter_(v_ima_g) v(mask.domain()); typedef graph_elt_neighborhood_if<mln::util::graph, g_vertices_p, v_ima_g> nbh_t; nbh_t nbh(mask); @@ -364,41 +367,71 @@ namespace mymln { if(doc.contain_line(v) && doc.get_beginning_of_line(v) == doc[v]) { - doc.jump_to_paragraph(v); + doc.jump_to_paragraph(v); + if(!doc.contain_paragraph(v)) + { doc.add_to_paragraph(v); } + for_all(q) { - if(doc.allign_H_Large(q,v) && doc.allign_size(q, v)) + if(doc.allign_H_Large(q,v) && doc.allign_size(q, v) && doc.allign_proximity_V_line(v,q)) { if(doc.contain_paragraph(q)) { - if(!doc.contain_paragraph(v)) - { - doc.add_to_paragraph(v); - - } - doc.add_to_paragraph_link(q, v); + doc.add_to_paragraph(q); + doc.add_to_paragraph_link(q, v); draw::line(out, q,v, mln::literal::green); } else { - + doc.add_to_paragraph(q); + doc.add_to_paragraph_link(v, q); + draw::line(out, q,v, mln::literal::magenta); + } + + } + } + } + else if(doc.contain_line(v) && doc.get_end_of_line(v) == doc[v]){} + else if(doc.contain_line(v)) + { + for_all(q) + { + if( + doc.get_beginning_of_line(q) == doc[q] && + doc.allign_H_Large(q,v) && + doc.allign_size(q, v) && + doc.allign_proximity_V_line(v,q) && + doc.allign_bottom_line(q,v) + ) + { + if(doc.contain_paragraph(q)) + { + doc.jump_to_paragraph(q); if(!doc.contain_paragraph(v)) { - doc.add_to_paragraph(q); doc.add_to_paragraph(v); - doc.add_to_paragraph_self_link(q); doc.add_to_paragraph_link(q, v); } else { - doc.add_to_paragraph(q); doc.add_to_paragraph_link(v, q); } - draw::line(out, q,v, mln::literal::magenta); + draw::line(out, q,v, mln::literal::blue); + } + else + { + doc.jump_to_paragraph(v); + if(!doc.contain_paragraph(v)) + { + doc.add_to_paragraph(v); + } + doc.add_to_paragraph(q); + doc.add_to_paragraph_link(v, q); + draw::line(out, q,v, mln::literal::blue); } - } } + } } } @@ -426,8 +459,7 @@ namespace mymln doc.get_line_length(q) < 5 && doc.allign_smaller_line(v,q) && doc.get_line_length(v) > 3 && - 
doc.allign_proximity_line(v,q) && - doc.allign_V_line(v,q) + doc.allign_proximity_line(v,q) ) { if(doc.allign_base_line_line(v,q) && doc.get_line_length(q) < 3) @@ -437,8 +469,60 @@ namespace mymln } } } + } + doc.propage_line_link(); + } + template<typename L, typename F, typename D> + void clean_alone_letters_lines(mymln::document::document<L,F,D>& doc, std::string dgb_out,image2d<bool> s) + { + image2d<value::rgb8> out; + mln::initialize(out, s); + typedef vertex_image<point2d,bool> v_ima_g; + typedef p_vertices<mln::util::graph, fun::i2v::array<mln::point2d> > g_vertices_p; + v_ima_g mask = doc.fun_mask_alone_letters(); + mln_piter_(v_ima_g) v(mask.domain()); + typedef graph_elt_neighborhood_if<mln::util::graph, g_vertices_p, v_ima_g> nbh_t; + nbh_t nbh(mask); + mln_niter_(nbh_t) q(nbh, v); + for_all(v) + { + if(doc.contain_line(v)) + { + for_all(q) + { + draw::line(out, q,v, mln::literal::red); + if(doc.line_has(v,q)) + {doc.add_to_line_link(v, q); draw::line(out, q,v, mln::literal::green);} + + } + } + } + doc.propage_line_link(); + io::ppm::save(mln::debug::superpose(out, s, literal::white),dgb_out); + } + + template<typename L, typename F, typename D> + void remove_alone_letter(mymln::document::document<L,F,D>& doc) + { + typedef vertex_image<point2d,bool> v_ima_g; + typedef p_vertices<mln::util::graph, fun::i2v::array<mln::point2d> > g_vertices_p; + v_ima_g mask = doc.fun_mask_alone_letters(); + mln_piter_(v_ima_g) v(mask.domain()); + typedef graph_elt_neighborhood_if<mln::util::graph, g_vertices_p, v_ima_g> nbh_t; + nbh_t nbh(mask); + mln_niter_(nbh_t) q(nbh, v); + for_all(v) + { + for_all(q) + { + if(doc.in_header(q) || doc.in_footer(q)){continue;} + doc.add_noise(q); + } } } + + + } } diff --git a/scribo/sandbox/raphael/code/my/document/document.hh b/scribo/sandbox/raphael/code/my/document/document.hh index f6204de..1a84edb 100644 --- a/scribo/sandbox/raphael/code/my/document/document.hh +++ b/scribo/sandbox/raphael/code/my/document/document.hh @@ -79,6 
+79,17 @@ namespace mymln Areas_Number_ = Areas + 1; } + /* OPERATION ON PAGE */ + inline bool in_header(const point2d& p) + { return p[0] < (img_influ.domain().len(0) / 8);} + inline bool in_header(Label lbl) + { return in_header(_bboxgp[lbl]); } + + inline bool in_footer(const point2d& p) + { return p[0] > ((img_influ.domain().len(0) / 8) * 7);} + inline bool in_footer(Label lbl) + { return in_footer(_bboxgp[lbl]); } + /* OPERATION ON PARAGRAPH */ inline bool link_paragraphs() { @@ -140,7 +151,8 @@ namespace mymln inline void split_line_exclusive(const Label lbl) { if(lbl == 0){return;} - lines_union.add_self_link(lbl); + if(!lines_union.is_self_link(lbl)) + lines_union.add_self_link(lbl); Label pos = get_end_of_line(lbl); if(pos == lbl){return;} @@ -165,7 +177,8 @@ namespace mymln inline void split_line(const Label lbl) { if(lbl == 0){return;} - lines_union.add_self_link(lbl); + if(!lines_union.is_self_link(lbl)) + lines_union.add_self_link(lbl); Label pos = get_beginning_of_line(lbl); if(pos == lbl){return;} @@ -201,7 +214,8 @@ namespace mymln add_to_line(N); } else if(end_lines_mask(N)) - lines_union.add_self_link(N); + if(!lines_union.is_self_link(N)) + lines_union.add_self_link(N); else {lines_union.invalidate_link(N);} } @@ -238,6 +252,7 @@ namespace mymln add_to_line(N); } else if(start_lines_mask(N)) + if(!lines_union.is_self_link(N)) lines_union.add_self_link(N); else {lines_union.invalidate_link(N);} @@ -325,6 +340,7 @@ namespace mymln alone_letters_mask(lbl) = false; noise_mask(lbl) = true; + lines_union[lbl] = 0; } void inline add(Label lbl, int link) { @@ -571,7 +587,24 @@ namespace mymln inline bool allign_size_height( const point2d& Left, const point2d& Right) {return allign_size_height(img_influ(Left), img_influ(Right));} + + inline bool allign_proximity_V( const point2d& Left, const point2d& Right) + {return allign_proximity_V(img_influ(Left), img_influ(Right));} + inline bool allign_proximity_V( const Label Left, const Label Right) + { + short 
int SizeL0 = label_size_(0, Left); + short int SizeL1 = label_size_(1, Left); + short int Swap = 0; + if(SizeL0 < SizeL1) + { SizeL0 = SizeL1; } + short int Dis = _bboxgp[Left].pmin()[0] - _bboxgp[Right].pmin()[0]; + if(Dis < 0) + Dis = -Dis; + return Dis < SizeL0 * 1.5f; + } + + inline bool allign_proximity( const point2d& Left, const point2d& Right) {return allign_proximity(img_influ(Left), img_influ(Right));} @@ -601,7 +634,7 @@ namespace mymln { short int SizeL = lines_bbox[lines_union[Left]].len(0); short int SizeR = lines_bbox[lines_union[Right]].len(0); - return SizeR > (SizeL / 2) && SizeR < (SizeL * 2); + return SizeR > (SizeL / 2.2f) && SizeR < (SizeL * 2.2); } inline bool allign_proximity_line( const Label Left, const Label Right) @@ -625,7 +658,28 @@ namespace mymln } + inline bool allign_proximity_V_line( const point2d& Left, const point2d& Right) + {return allign_proximity_V_line(img_influ(Left), img_influ(Right));} + inline bool allign_proximity_V_line( const Label Left, const Label Right) + { + box2d LB = lines_bbox[lines_union[Left]]; + box2d RB = lines_bbox[lines_union[Right]]; + + int DisA = LB.pmax()[0] - RB.pmin()[0]; + int DisB = RB.pmax()[0] - LB.pmin()[0]; + if(DisA < 0){DisA = -DisA;} + if(DisB < 0){DisB = -DisB;} + if(DisA > DisB) + { DisA = DisB; } + + unsigned int HA = LB.len(0); + unsigned int HB = RB.len(0); + + if(HA < HB) + { HA = HB; } + return (DisA * 1.5f) < HA; + } inline bool allign_proximity_large( const point2d& Left, const point2d& Right) {return allign_proximity_large(img_influ(Left), img_influ(Right));} @@ -696,6 +750,7 @@ namespace mymln { short int allignV = lines_bbox[lines_union[Left]].pcenter()[0] - lines_bbox[lines_union[Right]].pcenter()[0]; if(allignV<0){allignV = -allignV;} + allignV *= 2; return allignV < lines_bbox[lines_union[Left]].len(0) && allignV < lines_bbox[lines_union[Right]].len(0); } @@ -746,6 +801,20 @@ namespace mymln allignV < lines_bbox[lines_union[Left]].len(0) && 
lines_bbox[lines_union[Left]].pcenter()[0] < lines_bbox[lines_union[Right]].pcenter()[0]; } + inline bool allign_bottom(const point2d& Left, const point2d& Right) + {return allign_bottom(img_influ(Left), img_influ(Right));} + inline bool allign_bottom(const Label Left, const Label Right) + { + return _bboxgp[Left].pmin()[0] < _bboxgp[Right].pmin()[0]; + } + + inline bool allign_bottom_line(const point2d& Left, const point2d& Right) + {return allign_bottom_line(img_influ(Left), img_influ(Right));} + inline bool allign_bottom_line(const Label Left, const Label Right) + { + return lines_bbox[lines_union[Left]].pmin()[0] < lines_bbox[lines_union[Right]].pmin()[0]; + } + inline bool allign_base_line(const point2d& Left, const point2d& Right) @@ -775,6 +844,34 @@ namespace mymln std::cout << " lines(s) : " << CLine << std::endl; } + void debug_save_all(std::string file, image2d<bool> source) + { + image2d<value::rgb8> ima_color; + mln::initialize(ima_color,img_influ); + + for(unsigned int N = 0; N < lines_bbox.size(); N++) + { + if(lines_bbox[N].is_valid()) + { + draw::box(ima_color, lines_bbox[N], mln::literal::blue); + } + } + for(unsigned int N = 0; N < paragraphs_bbox.size(); N++) + { + if(paragraphs_bbox[N].is_valid()) + { + draw::box(ima_color, paragraphs_bbox[N], mln::literal::red); + } + } + for(unsigned int N = 0; N < lines_first_label.size(); N++) + { + if(_bboxgp[lines_first_label[N]].is_valid()) + { + draw::box(ima_color, _bboxgp[lines_first_label[N]], mln::literal::yellow); + } + } + io::ppm::save(mln::debug::superpose(ima_color, source, literal::white) , file); + } void debug_save_paragraphs(std::string file) { mymln::debug::save_label_image(img, paragraphs_union , file);} void debug_save_lines(std::string file) @@ -900,7 +997,7 @@ namespace mymln cook_lines_(); } inline void reset_implicit_separators() - { implicit_separators_union.reset(); } + { implicit_separators_union.reset(); lines_split.fill(0);} inline void cook_lines() { lines_len = 
mln::util::array<unsigned int>(NLine + 1); @@ -1027,6 +1124,14 @@ namespace mymln { SeqP++; while(lines_iter_valid() && !lines_seq[SeqP]){SeqP++;} } inline void lines_iter_valid() { return SeqP < Areas_Number_; } + + + inline void cook_paragraphs() + { + paragraphs_bbox = mln::util::array<box2d>(NPar + 1); + cook_paragraphs_(); + } + private: fun::i2v::array<bool> implicit_separators_left_mask; fun::i2v::array<bool> implicit_separators_right_mask; @@ -1100,7 +1205,7 @@ namespace mymln } } } - + inline void cook_separators_right_() { implicit_separators_right_mask(0) = false; @@ -1234,12 +1339,12 @@ namespace mymln /* COOK THE FIRST AND THE LAST LABEL OF THE LINE */ if(lines_first_label[lines_union[N]] == 0) lines_first_label[lines_union[N]] = N; - else if(_bboxgp[N].pcenter()[1] < _bboxgp[lines_first_label[lines_union[N]]].pcenter()[1]) + else if(_bboxgp[N].pmin()[1] < _bboxgp[lines_first_label[lines_union[N]]].pmin()[1]) lines_first_label[lines_union[N]] = N; if(lines_last_label[lines_union[N]] == 0) lines_last_label[lines_union[N]] = N; - else if(_bboxgp[N].pcenter()[1] > _bboxgp[lines_last_label[lines_union[N]]].pcenter()[1]) + else if(_bboxgp[N].pmax()[1] > _bboxgp[lines_last_label[lines_union[N]]].pmax()[1]) lines_last_label[lines_union[N]] = N; /* FILL THE MASK WITH FALSE:MAYBE USELESS IF THE MASK IS INITIALIZED */ @@ -1275,6 +1380,10 @@ namespace mymln { lines_bbox[lines_union[N]].merge(_bboxgp[N]); } + if(lines_len[lines_union[N]] == 1) + { letters_mask(N) = false; alone_letters_mask(N) = true; } + else if(lines_union[N]) + { letters_mask(N) = true; alone_letters_mask(N) = false; } } } @@ -1411,11 +1520,34 @@ namespace mymln unsigned int NPar ; mln::util::array<unsigned int> paragraphs_first_label; mln::util::array<unsigned int> paragraphs_last_label; + mln::util::array<unsigned int> paragraphs_assoc; mln::util::array<box2d> paragraphs_bbox; inline void cook_paragraphs_() { + mln::util::array<unsigned int> paragraphs_assoc(lines_union.size()); + for(int N = 
0; N < paragraphs_union.size(); N++) + { + if(paragraphs_union[N]) + { + if(paragraphs_assoc[lines_union[N]]) + { paragraphs_union.add_link(N, paragraphs_assoc[lines_union[N]]); } + else + {paragraphs_assoc[lines_union[N]] = N;} + } + } + paragraphs_union.propage_links(); + for(int N = 0; N < paragraphs_bbox.size(); N++) + { + paragraphs_bbox[N] = box2d(); + } + for(int N = 0; N < paragraphs_union.size(); N++) + { + if(paragraphs_union[N]) + paragraphs_bbox[paragraphs_union[N]].merge(lines_bbox[lines_union[N]]); + + } } diff --git a/scribo/sandbox/raphael/code/my/document/letters.hh b/scribo/sandbox/raphael/code/my/document/letters.hh new file mode 100644 index 0000000..6701943 --- /dev/null +++ b/scribo/sandbox/raphael/code/my/document/letters.hh @@ -0,0 +1,17 @@ +#ifndef INC_CLEAN_LETTER_DOC +#define INC_CLEAN_LETTER_DOC +#include<my/document/document.hh> +#include <mln/core/image/graph_elt_neighborhood.hh> +#include <mln/core/image/vertex_image.hh> +using namespace mln; + +namespace mymln +{ + namespace document + { + void clean_letter_aberation() + { + + } + } +} \ No newline at end of file diff --git a/scribo/sandbox/raphael/code/my/util/union.hh b/scribo/sandbox/raphael/code/my/util/union.hh index 53fcbb3..90a7e68 100644 --- a/scribo/sandbox/raphael/code/my/util/union.hh +++ b/scribo/sandbox/raphael/code/my/util/union.hh @@ -28,12 +28,23 @@ namespace mymln inline void invalidate_link(const Label A) { mark_link[A] = 0; } inline void add_self_link(const Label A) - { mark_link[A] = A; } + { + if(!A){return;} + if(mark_link[A] == 0) + mark_link[A] = A; + else + { + unsigned int Pos = find_parent_(A); + if(Pos) + mark_link[Pos] = A; + mark_link[A] = A; + } + } inline unsigned int link(const unsigned int index) {return mark_link[index]; } inline void add_link(const Label A, const Label B) { - + if(!B || !A){return;} unsigned int Pos = find_parent_(A); if(mark_link[B] == 0) { @@ -84,7 +95,14 @@ namespace mymln inline unsigned int find_parent_(const Label A) { 
unsigned int Pos = A; - while(Pos != mark_link[Pos] && Pos != 0){Pos = mark_link[Pos];} + unsigned int OldPos = A; + while(Pos != mark_link[Pos] && Pos != 0) + { + + Pos = mark_link[Pos]; + mark_link[OldPos] = mark_link[Pos]; + OldPos = Pos; + } return Pos; } mln::util::array<unsigned int> mark; diff --git a/scribo/sandbox/raphael/code/test.cc b/scribo/sandbox/raphael/code/test.cc index b009c2e..feaf817 100644 --- a/scribo/sandbox/raphael/code/test.cc +++ b/scribo/sandbox/raphael/code/test.cc @@ -127,25 +127,32 @@ void Process(std::string File, std::string Dir) mymln::document::separators::separators_find_allign(doc); mymln::document::separators::separators_make_clean(doc); doc.cook_separators(); + std::cout << "-> compute separator left " << endl; doc.cook_line_splitting(); + mymln::document::clean_line_link_item(doc); mymln::document::clean_proximity_lines(doc); mymln::document::clean_quote_lines(doc); - doc.reset_implicit_separators(); + std::cout << "-> clean separator right " << endl; mymln::document::separators::separators_find_allign_right(doc); mymln::document::separators::separators_make_clean(doc); + std::cout << "-> compute separator right " << endl; doc.cook_separators_right(); doc.cook_line_splitting_exclusive(); + std::cout << "-> clean separator right " << endl; mymln::document::clean_line_link_item(doc); mymln::document::clean_proximity_lines(doc); + std::cout << "-> clean " << endl; mymln::document::clean_quote_lines(doc); - + mymln::document::clean_alone_letters_lines(doc, Dir + "/" + "alone_graph_" + File, doc.image_mask_letters()); + doc.recook_lines(); + mymln::document::remove_alone_letter(doc); doc.recook_lines(); mymln::document::clean_paragraph_items(doc, Dir + "/" + "para_graph_" + File, doc.image_mask_letters()); - + doc.cook_paragraphs(); std::cout << "WORK ON GRAPH : " << timer.stop() << endl; //io::ppm::save(ima_influ, "separator.ppm"); //io::pbm::save(doc.image_mask_separators(),"separators"); @@ -157,7 +164,8 @@ void 
Process(std::string File, std::string Dir) - doc.debug_save_paragraphs(Dir + "/" + "lines_" + File); + //doc.debug_save_lines(Dir + "/" + "lines_" + File); + doc.debug_save_all(Dir + "/" + "debug_" + File, ima); //mymln::debug::save_graph_image(doc.fun_mask_implicit_separators_left(), doc.image_mask_letters(), Dir + "/" + "graph_imp_sep_line_" + File); //doc.debug_save_separators(Dir + "/" + "imp_sep_graph_" + File); @@ -199,8 +207,9 @@ void Process(std::string File, std::string Dir) // mymln::debug::save_graph_image(doc.fun_mask_separators(), ima, "separator_graph_" + File); //mymln::debug::save_graph_image(area_grph, doc.image_mask_letters(), Dir + "/" + "graph_" + File); //mymln::debug::save_graph_image(doc.fun_mask_letters(), doc.image_mask_letters(), Dir + "/" + "container_graph_" + File); -mln::util::array<box2d> linebx = doc.bbox_mask_lines(); + mln::util::array<box2d> linebx = doc.bbox_mask_lines(); mymln::debug::save_boxes_image(linebx, doc.image_mask_letters(), Dir + "/" + "lbox_" + File); + //mymln::debug::save_boxes_image(doc.bbox_enlarge_mask_letters(10, 0), ima, "linebox_" + File); } -- 1.7.2.5
participants (1)
-
Raphael Boissel