---
scribo/ChangeLog | 4 ++
scribo/src/Makefile.am | 15 +++++
.../src/{pbm_text_in_doc.cc => content_in_doc.cc} | 56 +++++++++++++-------
3 files changed, 56 insertions(+), 19 deletions(-)
copy scribo/src/{pbm_text_in_doc.cc => content_in_doc.cc} (74%)
diff --git a/scribo/ChangeLog b/scribo/ChangeLog
index 07d166b..9eddce6 100644
--- a/scribo/ChangeLog
+++ b/scribo/ChangeLog
@@ -1,5 +1,9 @@
2010-11-15 Guillaume Lazzara <z(a)lrde.epita.fr>
+ * src/content_in_doc.cc: New example extracting document content.
+
+2010-11-15 Guillaume Lazzara <z(a)lrde.epita.fr>
+
* scribo/toolchain/text_in_doc.hh: Make use of non visible
separators information.
diff --git a/scribo/src/Makefile.am b/scribo/src/Makefile.am
index a2c72b2..cd7618c 100644
--- a/scribo/src/Makefile.am
+++ b/scribo/src/Makefile.am
@@ -85,6 +85,21 @@ if HAVE_TESSERACT
$(TIFF_LDFLAGS) \
$(MAGICKXX_LDFLAGS)
+
+ utilexec_PROGRAMS += content_in_doc
+ content_in_doc_SOURCES = content_in_doc.cc
+ content_in_doc_CPPFLAGS = $(AM_CPPFLAGS) \
+ -I$(srcdir)/../sandbox/green/ \
+ -I$(srcdir)/../sandbox/z/ \
+ $(TESSERACT_CPPFLAGS) \
+ $(TIFF_CPPFLAGS) \
+ $(MAGICKXX_CPPFLAGS)
+ content_in_doc_LDFLAGS = $(AM_LDFLAGS) \
+ $(TESSERACT_LDFLAGS) \
+ $(TIFF_LDFLAGS) \
+ $(MAGICKXX_LDFLAGS) \
+ -lpthread
+
endif HAVE_TESSERACT
endif HAVE_MAGICKXX
diff --git a/scribo/src/pbm_text_in_doc.cc b/scribo/src/content_in_doc.cc
similarity index 74%
copy from scribo/src/pbm_text_in_doc.cc
copy to scribo/src/content_in_doc.cc
index 23ed9e7..fe3eacf 100644
--- a/scribo/src/pbm_text_in_doc.cc
+++ b/scribo/src/content_in_doc.cc
@@ -1,5 +1,4 @@
-// Copyright (C) 2009, 2010 EPITA Research and Development Laboratory
-// (LRDE)
+// Copyright (C) 2010 EPITA Research and Development Laboratory (LRDE)
//
// This file is part of Olena.
//
@@ -24,9 +23,6 @@
// exception does not however invalidate any other reasons why the
// executable file might be covered by the GNU General Public License.
-#ifdef HAVE_CONFIG_H
-# include <config.h>
-#endif
#include <libgen.h>
#include <fstream>
@@ -35,30 +31,42 @@
#include <mln/core/image/image2d.hh>
#include <mln/core/alias/neighb2d.hh>
-#include <mln/io/pbm/all.hh>
+#include <mln/io/pbm/save.hh>
+#include <mln/io/magick/load.hh>
+
+#include <mln/value/label_8.hh>
+
+#include <mln/core/var.hh>
+
+#include <mln/accu/count_value.hh>
+
+#include <mln/draw/box_plain.hh>
#include <scribo/toolchain/text_in_doc.hh>
+#include <scribo/toolchain/text_in_doc_preprocess.hh>
+#include <scribo/core/document.hh>
#include <scribo/core/line_set.hh>
#include <scribo/debug/usage.hh>
#include <scribo/make/debug_filename.hh>
+#include <scribo/primitive/extract/elements.hh>
+
#include <scribo/preprocessing/crop_without_localization.hh>
+#include <scribo/preprocessing/crop.hh>
#include <scribo/io/xml/save.hh>
#include <scribo/io/text_boxes/save.hh>
-
const char *args_desc[][2] =
{
- { "input.pbm", "A binary image. 'False' for object,
'True'\
-for the background." },
- { "out.txt", "Text output" },
+ { "input.*", "An image." },
+ { "out.xml", "Result of the document analysis." },
{ "denoise_enabled", "1 enables denoising, 0 disables it. (enabled by
default)" },
{ "pmin_row", "Row index of the top left corner of the Region of
interest." },
{ "pmin_col", "Col index of the top left corner of the Region of
interest." },
@@ -69,6 +77,7 @@ for the background." },
};
+
int main(int argc, char* argv[])
{
using namespace scribo;
@@ -76,8 +85,8 @@ int main(int argc, char* argv[])
if (argc != 3 && argc != 4 && argc != 5 && argc != 8 &&
argc != 9)
return scribo::debug::usage(argv,
- "Find text lines using left/right validation and display x-height in a binarized
article.",
- "input.pbm out.txt <denoise_enabled> [<pmin_row> <pmin_col>
<pmax_row> <pmax_col>] <debug_dir>",
+ "Find text lines and elements in a document",
+ "input.* out.xml <denoise_enabled> [<pmin_row> <pmin_col>
<pmax_row> <pmax_col>] <debug_dir>",
args_desc);
bool debug = false;
@@ -91,9 +100,12 @@ int main(int argc, char* argv[])
trace::entering("main");
+ typedef image2d<scribo::def::lbl_type> L;
+ scribo::document<L> doc(argv[1]);
- image2d<bool> input;
- mln::io::pbm::load(input, argv[1]);
+ // Preprocess document
+ image2d<bool>
+ input = toolchain::text_in_doc_preprocess(doc.image(), false);
// Optional Cropping
@@ -119,15 +131,21 @@ int main(int argc, char* argv[])
// Run document toolchain.
- typedef image2d<scribo::def::lbl_type> L;
+
+ // Text
+ std::cout << "Extracting text" << std::endl;
line_set<L>
lines = scribo::toolchain::text_in_doc(input, denoise, debug);
+ doc.set_text(lines);
+
+ // Elements
+ std::cout << "Extracting Elements" << std::endl;
+ component_set<L> elements = scribo::primitive::extract::elements(doc, input);
+ doc.set_elements(elements);
- // Saving results
- scribo::io::xml::save(argv[1], lines, "out.xml", true);
- // Specify shift due to potential previous crop.
- scribo::io::text_boxes::save(lines, argv[2], crop_shift);
+ // Saving results
+ scribo::io::xml::save(doc, argv[2], true);
trace::exiting("main");
}
--
1.5.6.5