olena-2.0-388-gd4b28f4 Work with multipage PDFs

--- scribo/sandbox/icdar_13_table/Makefile | 13 ++- scribo/sandbox/icdar_13_table/src/main.cc | 184 +++++++++++++++++------------ 2 files changed, 114 insertions(+), 83 deletions(-) diff --git a/scribo/sandbox/icdar_13_table/Makefile b/scribo/sandbox/icdar_13_table/Makefile index 2420eec..4e552dd 100644 --- a/scribo/sandbox/icdar_13_table/Makefile +++ b/scribo/sandbox/icdar_13_table/Makefile @@ -1,8 +1,8 @@ CCACHE= CC=g++ -CFLAGS=-Wall -Werror -O3 -CLIBS=-I../../../milena/ -I../../ -CLEAN=*.o $(OUTPUT) output/* log final.xml +CFLAGS=-Wall -Werror -O3 -DHAVE_TESSERACT_3 -DNDEBUG +CLIBS=-I../../../milena/ -I../../ -I/usr/include/poppler +CLEAN=*.o output/* log final.xml SRC=src/main.cc OUTPUT=table @@ -10,9 +10,12 @@ OUTPUT=table all: table table: - $(CCACHE) $(CC) $(CFLAGS) -DHAVE_TESSERACT_3 $(CLIBS) $(SRC) -ltesseract -o $(OUTPUT) + $(CCACHE) $(CC) $(CFLAGS) $(CLIBS) $(SRC) -ltesseract -lpoppler-cpp -o $(OUTPUT) clean: rm -rf $(CLEAN) -.PHONY: table +mrproper: clean + rm -f $(OUTPUT) + +.PHONY: table clean mrproper diff --git a/scribo/sandbox/icdar_13_table/src/main.cc b/scribo/sandbox/icdar_13_table/src/main.cc index 7151b16..be394ba 100644 --- a/scribo/sandbox/icdar_13_table/src/main.cc +++ b/scribo/sandbox/icdar_13_table/src/main.cc @@ -8,6 +8,7 @@ #include <mln/fun/v2v/rgb_to_luma.hh> #include <mln/io/pbm/all.hh> +#include <mln/io/pdf/load.hh> #include <mln/io/ppm/all.hh> #include <mln/labeling/all.hh> @@ -180,6 +181,10 @@ void find_borders(image2d<bool>& ima, bottom = find_bottom(ima); } + /********/ + /* MAIN */ + /********/ + int main(int argc, char** argv) { typedef value::label_16 V; @@ -195,116 +200,139 @@ int main(int argc, char** argv) V nhlines, nvlines; L hlines_ima, vlines_ima; scribo::component_set<L> hlines, vlines; + std::ostringstream path; - // PARAMETERS + // Parameters unsigned av_height = 4; unsigned av_width = 4; int delta_prox_h = 5; unsigned min_height = 71; unsigned min_width = 31; + unsigned dpi = 72; - // Loadin and binarization + // 
Loading and binarization std::ofstream xml; start_xml(xml, "final.xml", argv[1]); - io::ppm::load(original, argv[1]); - filtered = data::transform(original, mln::fun::v2v::rgb_to_luma<value::int_u8>()); + //io::ppm::load(original, argv[1]); + util::array< image2d<value::rgb8> > pdf; + io::pdf::load(pdf, argv[1], dpi); + for (unsigned page = 0; page < pdf.nelements(); ++page) + { + original = pdf[page]; + filtered = data::transform(original, mln::fun::v2v::rgb_to_luma<value::int_u8>()); - bin = scribo::binarization::sauvola(filtered, 81, 0.44); - final = data::convert(value::rgb8(), bin); + bin = scribo::binarization::sauvola(filtered, 81, 0.44); + final = data::convert(value::rgb8(), bin); - initialize(mask, bin); - initialize(ima_texts, bin); - initialize(ima_tables, bin); - data::fill(ima_tables, false); + initialize(mask, bin); + initialize(ima_texts, bin); + initialize(ima_tables, bin); + data::fill(ima_tables, false); - bin_without_lines = duplicate(bin); + bin_without_lines = duplicate(bin); - // Lines extraction - hlines = scribo::primitive::extract::lines_h_discontinued(bin, c4(), nhlines, min_width, 2); - vlines = scribo::primitive::extract::lines_v_discontinued(bin, c4(), nvlines, min_height, 2); + // Lines extraction + hlines = scribo::primitive::extract::lines_h_discontinued(bin, c4(), nhlines, min_width, 2); + vlines = scribo::primitive::extract::lines_v_discontinued(bin, c4(), nvlines, min_height, 2); - get_horizontal_lines(hlines, ima_tables, bin_without_lines, av_height); - get_vertical_lines(vlines, ima_tables, bin_without_lines, av_width, delta_prox_h); + get_horizontal_lines(hlines, ima_tables, bin_without_lines, av_height); + get_vertical_lines(vlines, ima_tables, bin_without_lines, av_width, delta_prox_h); - // Denoising - bin_without_lines_denoised = scribo::preprocessing::denoise_fg(bin_without_lines, c8(), 3); + // Denoising + bin_without_lines_denoised = scribo::preprocessing::denoise_fg(bin_without_lines, c8(), 4); - // Set the tables mask 
- unsigned n; - labeled = labeling::blobs(ima_tables, c8(), n); - masks = scribo::component_set< image2d<unsigned> >(labeled, n); - data::fill(mask, false); + // Set the tables mask + unsigned n; + labeled = labeling::blobs(ima_tables, c8(), n); + masks = scribo::component_set< image2d<unsigned> >(labeled, n); + data::fill(mask, false); - for (unsigned i = 1; i <= masks.nelements(); ++i) - data::fill((mask | masks(i).bbox()).rw(), true); + for (unsigned i = 1; i <= masks.nelements(); ++i) + data::fill((mask | masks(i).bbox()).rw(), true); - // Compose table zones with bin_without_lines_lines - ima_texts = logical::and_(bin_without_lines_denoised, mask); + // Compose table zones with bin_without_lines_lines + ima_texts = logical::and_(bin_without_lines_denoised, mask); - // Isolate texts between tables - for (unsigned i = 1; i <= masks.nelements(); ++i) - { - image2d<bool> table_mask, isolated_text; - std::ostringstream path; - bool empty = true; + // Isolate texts between tables + for (unsigned i = 1; i <= masks.nelements(); ++i) + { + image2d<bool> table_mask, isolated_text; + bool empty = true; - initialize(table_mask, bin); - data::fill(table_mask, false); - data::fill((table_mask | masks(i).bbox()).rw(), true); + initialize(table_mask, bin); + data::fill(table_mask, false); + data::fill((table_mask | masks(i).bbox()).rw(), true); - isolated_text = logical::and_(bin_without_lines_denoised, table_mask); + isolated_text = logical::and_(bin_without_lines_denoised, table_mask); - mln_piter_(image2d<bool>) p(isolated_text.domain()); + mln_piter_(image2d<bool>) p(isolated_text.domain()); - for_all(p) - empty = empty && !(isolated_text(p)); + for_all(p) + empty = empty && !(isolated_text(p)); - if (!empty) - { - path << "output/8_" << i << "_isolated.pbm"; - io::pbm::save(isolated_text, path.str()); + if (!empty) + { + path.str(""); + path << "output/p" << page << "_8_" << i << "_isolated.pbm"; + io::pbm::save(isolated_text, path.str()); - // Find coordinated - 
unsigned left, right, top, bottom; - find_borders(isolated_text, left, right, top, bottom); + // Find coordinates + unsigned left, right, top, bottom; + find_borders(isolated_text, left, right, top, bottom); - std::cout << "(" << left << "," << top << ") ->" - << "(" << right << "," << bottom << ")" << std::endl; + point2d p1, p2, p3, p4; - point2d p1, p2, p3, p4; + p1 = point2d(top, left); + p2 = point2d(top, right); + p3 = point2d(bottom, right); + p4 = point2d(bottom, left); - p1 = point2d(top, left); - p2 = point2d(top, right); - p3 = point2d(bottom, right); - p4 = point2d(bottom, left); + draw::line(final, p1, p2, literal::green); + draw::line(final, p2, p3, literal::green); + draw::line(final, p3, p4, literal::green); + draw::line(final, p4, p1, literal::green); - draw::line(final, p1, p2, literal::red); - draw::line(final, p2, p3, literal::red); - draw::line(final, p3, p4, literal::red); - draw::line(final, p4, p1, literal::red); - - write_table(xml, p1, p3); + write_table(xml, p1, p3); + } } - } - // Get lines images - hlines_ima = hlines.labeled_image(); - vlines_ima = vlines.labeled_image(); - ima_hlines = data::convert(bool(), hlines_ima); - ima_vlines = data::convert(bool(), vlines_ima); - - // Write images and close XML - io::pbm::save(bin, "output/0_bin.pbm"); - io::pbm::save(bin_without_lines, "output/1_bin_without_lines.pbm"); - io::pbm::save(bin_without_lines_denoised, "output/2_bin_without_lines_denoised.pbm"); - io::pbm::save(ima_hlines, "output/3_hlines.pbm"); - io::pbm::save(ima_vlines, "output/4_vlines.pbm"); - io::pbm::save(ima_tables, "output/5_tables.pbm"); - io::pbm::save(mask, "output/6_mask.pbm"); - io::pbm::save(ima_texts, "output/7_texts.pbm"); - /* Save 8_i_isolated */ - io::ppm::save(final, "output/9_final.ppm"); + // Get lines images + hlines_ima = hlines.labeled_image(); + vlines_ima = vlines.labeled_image(); + ima_hlines = data::convert(bool(), hlines_ima); + ima_vlines = data::convert(bool(), vlines_ima); + + // Write images and 
close XML + path.str(""); path << "output/p" << page << "_0_bin.pbm"; + io::pbm::save(bin, path.str()); + + path.str(""); path << "output/p" << page << "_1_bin_without_lines.pbm"; + io::pbm::save(bin_without_lines, path.str()); + + path.str(""); path << "output/p" << page << "_2_bin_without_lines_denoised.pbm"; + io::pbm::save(bin_without_lines_denoised, path.str()); + + path.str(""); path << "output/p" << page << "_3_hlines.pbm"; + io::pbm::save(ima_hlines, path.str()); + + path.str(""); path << "output/p" << page << "_4_vlines.pbm"; + io::pbm::save(ima_vlines, path.str()); + + path.str(""); path << "output/p" << page << "_5_tables.pbm"; + io::pbm::save(ima_tables, path.str()); + + path.str(""); path << "output/p" << page << "_6_mask.pbm"; + io::pbm::save(mask, path.str()); + + path.str(""); path << "output/p" << page << "_7_texts.pbm"; + io::pbm::save(ima_texts, path.str()); + + /* Save 8_i_isolated */ + + path.str(""); path << "output/p" << page << "_9_final.pbm"; + io::ppm::save(final, path.str()); + } end_xml(xml); -- 1.7.2.5
participants (1)
-
Anthony Seure