last-svn-commit-652-ga37cf92 src/content_in_doc.cc: New example extracting document content.

--- scribo/ChangeLog | 4 ++ scribo/src/Makefile.am | 15 +++++ .../src/{pbm_text_in_doc.cc => content_in_doc.cc} | 56 +++++++++++++------- 3 files changed, 56 insertions(+), 19 deletions(-) copy scribo/src/{pbm_text_in_doc.cc => content_in_doc.cc} (74%) diff --git a/scribo/ChangeLog b/scribo/ChangeLog index 9400016..5ed918f 100644 --- a/scribo/ChangeLog +++ b/scribo/ChangeLog @@ -1,5 +1,9 @@ 2010-11-15 Guillaume Lazzara <z@lrde.epita.fr> + * src/content_in_doc.cc: New example extracting document content. + +2010-11-15 Guillaume Lazzara <z@lrde.epita.fr> + * scribo/toolchain/text_in_doc.hh: Make use of non visible separators information. diff --git a/scribo/src/Makefile.am b/scribo/src/Makefile.am index a2c72b2..cd7618c 100644 --- a/scribo/src/Makefile.am +++ b/scribo/src/Makefile.am @@ -85,6 +85,21 @@ if HAVE_TESSERACT $(TIFF_LDFLAGS) \ $(MAGICKXX_LDFLAGS) + + utilexec_PROGRAMS += content_in_doc + content_in_doc_SOURCES = content_in_doc.cc + content_in_doc_CPPFLAGS = $(AM_CPPFLAGS) \ + -I/home/lazzara/git/oln/scribo/sandbox/green/ \ + -I/home/lazzara/git/oln/scribo/sandbox/z/ \ + $(TESSERACT_CPPFLAGS) \ + $(TIFF_CPPFLAGS) \ + $(MAGICKXX_CPPFLAGS) + content_in_doc_LDFLAGS = $(AM_LDFLAGS) \ + $(TESSERACT_LDFLAGS) \ + $(TIFF_LDFLAGS) \ + $(MAGICKXX_LDFLAGS) \ + -lpthread + endif HAVE_TESSERACT endif HAVE_MAGICKXX diff --git a/scribo/src/pbm_text_in_doc.cc b/scribo/src/content_in_doc.cc similarity index 74% copy from scribo/src/pbm_text_in_doc.cc copy to scribo/src/content_in_doc.cc index 23ed9e7..fe3eacf 100644 --- a/scribo/src/pbm_text_in_doc.cc +++ b/scribo/src/content_in_doc.cc @@ -1,5 +1,4 @@ -// Copyright (C) 2009, 2010 EPITA Research and Development Laboratory -// (LRDE) +// Copyright (C) 2010 EPITA Research and Development Laboratory (LRDE) // // This file is part of Olena. // @@ -24,9 +23,6 @@ // exception does not however invalidate any other reasons why the // executable file might be covered by the GNU General Public License. -#ifdef HAVE_CONFIG_H -# include <config.h> -#endif #include <libgen.h> #include <fstream> @@ -35,30 +31,42 @@ #include <mln/core/image/image2d.hh> #include <mln/core/alias/neighb2d.hh> -#include <mln/io/pbm/all.hh> +#include <mln/io/pbm/save.hh> +#include <mln/io/magick/load.hh> + +#include <mln/value/label_8.hh> + +#include <mln/core/var.hh> + +#include <mln/accu/count_value.hh> + +#include <mln/draw/box_plain.hh> #include <scribo/toolchain/text_in_doc.hh> +#include <scribo/toolchain/text_in_doc_preprocess.hh> +#include <scribo/core/document.hh> #include <scribo/core/line_set.hh> #include <scribo/debug/usage.hh> #include <scribo/make/debug_filename.hh> +#include <scribo/primitive/extract/elements.hh> + #include <scribo/preprocessing/crop_without_localization.hh> +#include <scribo/preprocessing/crop.hh> #include <scribo/io/xml/save.hh> #include <scribo/io/text_boxes/save.hh> - const char *args_desc[][2] = { - { "input.pbm", "A binary image. 'False' for object, 'True'\ -for the background." }, - { "out.txt", "Text output" }, + { "input.*", "An image." }, + { "out.xml", "Result of the document analysis." }, { "denoise_enabled", "1 enables denoising, 0 disables it. (enabled by default)" }, { "pmin_row", "Row index of the top left corner of the Region of interest." }, { "pmin_col", "Col index of the top left corner of the Region of interest." }, @@ -69,6 +77,7 @@ for the background." }, }; + int main(int argc, char* argv[]) { using namespace scribo; @@ -76,8 +85,8 @@ int main(int argc, char* argv[]) if (argc != 3 && argc != 4 && argc != 5 && argc != 8 && argc != 9) return scribo::debug::usage(argv, - "Find text lines using left/right validation and display x-height in a binarized article.", - "input.pbm out.txt <denoise_enabled> [<pmin_row> <pmin_col> <pmax_row> <pmax_col>] <debug_dir>", + "Find text lines and elements in a document", + "input.* out.xml <denoise_enabled> [<pmin_row> <pmin_col> <pmax_row> <pmax_col>] <debug_dir>", args_desc); bool debug = false; @@ -91,9 +100,12 @@ int main(int argc, char* argv[]) trace::entering("main"); + typedef image2d<scribo::def::lbl_type> L; + scribo::document<L> doc(argv[1]); - image2d<bool> input; - mln::io::pbm::load(input, argv[1]); + // Preprocess document + image2d<bool> + input = toolchain::text_in_doc_preprocess(doc.image(), false); // Optional Cropping @@ -119,15 +131,21 @@ int main(int argc, char* argv[]) // Run document toolchain. - typedef image2d<scribo::def::lbl_type> L; + + // Text + std::cout << "Extracting text" << std::endl; line_set<L> lines = scribo::toolchain::text_in_doc(input, denoise, debug); + doc.set_text(lines); + + // Elements + std::cout << "Extracting Elements" << std::endl; + component_set<L> elements = scribo::primitive::extract::elements(doc, input); + doc.set_elements(elements); - // Saving results - scribo::io::xml::save(argv[1], lines, "out.xml", true); - // Specify shift due to potential previous crop. - scribo::io::text_boxes::save(lines, argv[2], crop_shift); + // Saving results + scribo::io::xml::save(doc, argv[2], true); trace::exiting("main"); } -- 1.5.6.5
participants (1)
-
Guillaume Lazzara