olena-2.0-389-g277ebe7c Start new toolchain (scribo toolchain inspired)

--- scribo/sandbox/icdar_13_table/Makefile | 2 +- scribo/sandbox/icdar_13_table/src/new.cc | 146 ++++++++++++++++++++++++++++++ 2 files changed, 147 insertions(+), 1 deletions(-) create mode 100644 scribo/sandbox/icdar_13_table/src/new.cc diff --git a/scribo/sandbox/icdar_13_table/Makefile b/scribo/sandbox/icdar_13_table/Makefile index 4e552dd..b9600a4 100644 --- a/scribo/sandbox/icdar_13_table/Makefile +++ b/scribo/sandbox/icdar_13_table/Makefile @@ -4,7 +4,7 @@ CFLAGS=-Wall -Werror -O3 -DHAVE_TESSERACT_3 -DNDEBUG CLIBS=-I../../../milena/ -I../../ -I/usr/include/poppler CLEAN=*.o output/* log final.xml -SRC=src/main.cc +SRC=src/new.cc OUTPUT=table all: table diff --git a/scribo/sandbox/icdar_13_table/src/new.cc b/scribo/sandbox/icdar_13_table/src/new.cc new file mode 100644 index 0000000..963aa7d --- /dev/null +++ b/scribo/sandbox/icdar_13_table/src/new.cc @@ -0,0 +1,146 @@ +#include <mln/binarization/all.hh> + +#include <mln/core/image/image2d.hh> + +#include <mln/data/all.hh> +#include <mln/draw/line.hh> + +#include <mln/fun/v2v/rgb_to_luma.hh> + +#include <mln/io/pbm/all.hh> +#include <mln/io/pdf/load.hh> +#include <mln/io/ppm/all.hh> + +#include <mln/labeling/all.hh> +#include <mln/literal/all.hh> +#include <mln/logical/and.hh> + +#include <mln/value/all.hh> + +#include <tesseract/baseapi.h> + +#include <scribo/binarization/sauvola.hh> +#include <scribo/core/component_set.hh> +#include <scribo/preprocessing/denoise_fg.hh> +#include <scribo/primitive/extract/vertical_separators.hh> +#include <scribo/primitive/remove/separators.hh> +#include <scribo/primitive/extract/separators_nonvisible.hh> + +#include <scribo/primitive/link/internal/dmax_width_and_height.hh> +#include <scribo/primitive/link/with_single_right_link_dmax_ratio.hh> +#include <scribo/primitive/link/with_single_left_link_dmax_ratio.hh> +#include <scribo/primitive/link/merge_double_link.hh> + +using namespace mln; + +void start_xml(std::ofstream& xml, const char* name, const char* pdf) +{ + xml.open(name); + xml << "<?xml version\"1.0\" encoding=\"UTF-8\"?>" << std::endl + << "<document filename='" << pdf << "'>" << std::endl; +} + +void end_xml(std::ofstream& xml) +{ + xml << "</document>" << std::endl; + xml.close(); +} + +void write_table(std::ofstream& xml, const point2d& start, const point2d& end) +{ + static unsigned table = 0; + static unsigned region = 0; + static unsigned page = 1; + + xml << "\t<table id='" << table << "'>" << std::endl + << "\t\t<region id='" << region << "' page='" << page << "'>" << std::endl + << "\t\t<bounding-box x1='" << start[1] << "' y1='" << start[0] << "' " + << "x2='" << end[1] << "' y2='" << end[0] << "'/>" << std::endl + << "\t\t</region>" << std::endl + << "\t</table>" << std::endl; + + ++table; +} + +int main(int argc, char** argv) +{ + typedef value::label_16 V; + typedef image2d<V> L; + + std::ofstream xml; + std::ostringstream path; + image2d<value::rgb8> original; + image2d<value::int_u8> filtered; + image2d<bool> bin, separators, bin_without_separators, whitespaces, + denoised, comp, links; + scribo::component_set< image2d<unsigned> > components; + + unsigned dpi = 72; + + // Loading and binarization + start_xml(xml, "final.xml", argv[1]); + + util::array< image2d<value::rgb8> > pdf; + io::pdf::load(pdf, argv[1], dpi); + for (unsigned page = 0; page < pdf.nelements(); ++page) + { + original = pdf[page]; + filtered = data::transform(original, fun::v2v::rgb_to_luma<value::int_u8>()); + bin = scribo::binarization::sauvola(filtered, 81, 0.44); + + // Find separators + separators = scribo::primitive::extract::vertical_separators(bin, 81); + bin_without_separators = scribo::primitive::remove::separators(bin, separators); + whitespaces = scribo::primitive::extract::separators_nonvisible(bin); + + // Denoise + denoised = scribo::preprocessing::denoise_fg(bin_without_separators, c8(), 4); + + // Extract components + unsigned ncomponents; + components = scribo::primitive::extract::components(denoised, c8(), ncomponents); + + initialize(comp, denoised); + data::fill(comp, false); + for (unsigned i = 1; i <= components.nelements(); ++i) + data::fill((comp | components(i).bbox()).rw(), true); + + scribo::object_links< image2d<unsigned> > right_link = scribo::primitive::link::with_single_right_link_dmax_ratio(components, + scribo::primitive::link::internal::dmax_width_and_height(1), + scribo::anchor::MassCenter); + + scribo::object_links< image2d<unsigned> > left_link = scribo::primitive::link::with_single_left_link_dmax_ratio(components, + scribo::primitive::link::internal::dmax_width_and_height(1), + scribo::anchor::MassCenter); + + scribo::object_links< image2d<unsigned> > merged_links = scribo::primitive::link::merge_double_link(left_link, right_link); + + initialize(links, denoised); + data::fill(links, false); + for (unsigned i = 1; i <= merged_links.components().nelements(); ++i) + data::fill((links | merged_links.components()(i).bbox()).rw(), true); + + // Write images and close XML + path.str(""); path << "output/p" << page << "_0_bin.pbm"; + io::pbm::save(bin, path.str()); + + path.str(""); path << "output/p" << page << "_1_bin_without_separators.pbm"; + io::pbm::save(bin_without_separators, path.str()); + + path.str(""); path << "output/p" << page << "_2_whitespaces.pbm"; + io::pbm::save(whitespaces, path.str()); + + path.str(""); path << "output/p" << page << "_3_denoised.pbm"; + io::pbm::save(denoised, path.str()); + + path.str(""); path << "output/p" << page << "_4_components.pbm"; + io::pbm::save(comp, path.str()); + + path.str(""); path << "output/p" << page << "_5_links.pbm"; + io::pbm::save(links, path.str()); + } + + end_xml(xml); + + return 0; +} -- 1.7.2.5
participants (1)
-
Anthony Seure