last-svn-commit-135-g40d5180 Add a specific toolchain for Nepomuk integration.

* convert/from_qimage.hh: New. convert an image2d to a QImage. * core/line_info.hh: Add a new member has_text. * text/recognition.hh: Remove an invalid precondition. * toolchain/nepomuk/text_extraction.hh: New. Specific toolchain for Nepomuk. * tests/Makefile.am: Add toolchain/* subdirs. * tests/toolchain/Makefile.am, * tests/toolchain/nepomuk/Makefile.am: New. * tests/img/wildly.pbm: New. New test image. * tests/toolchain/nepomuk/text_extraction.cc: New. New test. --- scribo/ChangeLog | 22 +++ .../crop.hh => convert/from_qimage.hh} | 68 +++++---- scribo/core/line_info.hh | 8 + scribo/tests/Makefile.am | 1 + scribo/tests/img/wildly.pbm | Bin 0 -> 2208 bytes scribo/tests/{text => toolchain}/Makefile.am | 7 +- .../{filter => toolchain/nepomuk}/Makefile.am | 27 +++- .../tests/toolchain/nepomuk/text_extraction.cc | 31 ++--- scribo/text/recognition.hh | 2 - scribo/toolchain/nepomuk/text_extraction.hh | 160 ++++++++++++++++++++ 10 files changed, 265 insertions(+), 61 deletions(-) copy scribo/{preprocessing/crop.hh => convert/from_qimage.hh} (55%) create mode 100644 scribo/tests/img/wildly.pbm copy scribo/tests/{text => toolchain}/Makefile.am (86%) copy scribo/tests/{filter => toolchain/nepomuk}/Makefile.am (56%) copy milena/mln/fun/n2v/all.hh => scribo/tests/toolchain/nepomuk/text_extraction.cc (75%) create mode 100644 scribo/toolchain/nepomuk/text_extraction.hh diff --git a/scribo/ChangeLog b/scribo/ChangeLog index de7acfa..a473c52 100644 --- a/scribo/ChangeLog +++ b/scribo/ChangeLog @@ -1,3 +1,25 @@ +2010-06-03 Guillaume Lazzara <z@lrde.epita.fr> + + Add a specific toolchain for Nepomuk integration. + + * convert/from_qimage.hh: New. convert an image2d to a QImage. + + * core/line_info.hh: Add a new member has_text. + + * text/recognition.hh: Remove an invalid precondition. + + * toolchain/nepomuk/text_extraction.hh: New. Specific toolchain + for Nepomuk. + + * tests/Makefile.am: Add toolchain/* subdirs. + + * tests/toolchain/Makefile.am, + * tests/toolchain/nepomuk/Makefile.am: New. + + * tests/img/wildly.pbm: New. New test image. + + * tests/toolchain/nepomuk/text_extraction.cc: New. New test. + 2010-05-25 Guillaume Lazzara <z@lrde.epita.fr> Cleanup sample tools. diff --git a/scribo/preprocessing/crop.hh b/scribo/convert/from_qimage.hh similarity index 55% copy from scribo/preprocessing/crop.hh copy to scribo/convert/from_qimage.hh index c289f86..b297c2f 100644 --- a/scribo/preprocessing/crop.hh +++ b/scribo/convert/from_qimage.hh @@ -23,62 +23,70 @@ // exception does not however invalidate any other reasons why the // executable file might be covered by the GNU General Public License. -#ifndef SCRIBO_PREPROCESSING_CROP_HH -# define SCRIBO_PREPROCESSING_CROP_HH -# include <mln/core/concept/image.hh> -# include <mln/data/paste.hh> + +#ifndef SCRIBO_CONVERT_FROM_QIMAGE_HH +# define SCRIBO_CONVERT_FROM_QIMAGE_HH /// \file /// -/// \brief Crop an image preserving the localization. +/// Extract text from a document. -namespace scribo -{ +# include <QtGui/QImage> - namespace preprocessing - { +# include <mln/value/qt/rgb32.hh> - using namespace mln; +# if QT_VERSION < 0x040000 +# error "Qt library too old. You need at least Qt 4.x." +# endif // ! QT_VERSION - /*! \brief crop an image preserving the localization. +namespace scribo +{ - \param[in] input An image. - \param[in] domain A region of interest. + namespace convert + { - \return An image defined on the domain \p domain with the - corresponding data copied from \p input. + /*! \brief Convert a QImage to mln::image2d. + \param[in] ima A QImage. Prefer using QImage::Format_RGB32 + image format to avoid conversions. + + \return A RGB8 2D image in Milena's format. */ - template <typename I> - mln_concrete(I) - crop(const Image<I>& input, const mln_box(I)& domain); + mln::image2d<mln::value::qt::rgb32> + from_qimage(const QImage& ima); # ifndef MLN_INCLUDE_ONLY - - template <typename I> - mln_concrete(I) - crop(const Image<I>& input, const mln_box(I)& domain) + mln::image2d<mln::value::qt::rgb32> + from_qimage(const QImage& ima) { - trace::entering("scribo::preprocessing::crop"); - mln_assertion(exact(input).is_valid()); + QImage tmp = ima; + + if (ima.format() != QImage::Format_RGB32) + tmp = ima.convertToFormat(QImage::Format_RGB32); - mln_concrete(I) output(domain); - data::paste(input | domain, output); + const int + nrows = tmp.height(), + ncols = tmp.width(); + + mln::image2d<mln::value::qt::rgb32> output(nrows, ncols, 0); + + QImage qima(ncols, nrows, QImage::Format_RGB32); + std::memcpy(output.buffer(), + tmp.scanLine(0), + output.nelements() * 4); - trace::exiting("scribo::preprocessing::crop"); return output; } - # endif // ! MLN_INCLUDE_ONLY - } // end of namespace scribo::preprocessing + } // end of namespace scribo::convert } // end of namespace scribo -#endif // ! SCRIBO_PREPROCESSING_CROP_HH +#endif // ! SCRIBO_CONVERT_FROM_QIMAGE_HH diff --git a/scribo/core/line_info.hh b/scribo/core/line_info.hh index d0066c0..55d1430 100644 --- a/scribo/core/line_info.hh +++ b/scribo/core/line_info.hh @@ -125,6 +125,7 @@ namespace scribo bool indented() const; + bool has_text() const; const std::string& text() const; void update_text(const std::string& str); @@ -587,6 +588,13 @@ namespace scribo } template <typename L> + bool + line_info<L>::has_text() const + { + return !text_.empty(); + } + + template <typename L> const std::string& line_info<L>::text() const { diff --git a/scribo/tests/Makefile.am b/scribo/tests/Makefile.am index 61570c8..4ef9ca1 100644 --- a/scribo/tests/Makefile.am +++ b/scribo/tests/Makefile.am @@ -25,6 +25,7 @@ SUBDIRS = \ preprocessing \ table \ text \ + toolchain \ unit_test # Regen files recursively. diff --git a/scribo/tests/img/wildly.pbm b/scribo/tests/img/wildly.pbm new file mode 100644 index 0000000000000000000000000000000000000000..518fbec9b0814bb982ac9d901da02e5af1447d6d GIT binary patch literal 2208 zcmeIx-)qxQ6bJAV0|v@iUVSLb1z!}gYt=$G#=>w_go&dJ@v*7RXk|-6R>$^Wl0Pl} z1^x?x{Q>693_`Bw+}mI2Qt&|(!CM&je)hdL$vMgKi`ql>ve!L0dDq)zAKtS|+rJpQ z4tw5cHLa(gKA#WVSL3TEU1!AHQ}(RcY~!zmn_k<F?OPLj$OhN!?fA@gEY@l?85>O| z-tP9cOVT*=hjzpE#uKY?>f^5N@tcza*0V2c%l|lHmNR6B_N9I1dgrz?VMo@$#Y0{H znmsz|9d;krTAeoQv}^Ex{NW1l7{JK2V&$?5xtZ&~>R@7C&6U+W80bn4E3QS=uE)B} z2T&TuZ*qA)<gjwS9?^C0I>0~e9i8sqh~@Jt5rD=co*A6e^YRqk;6SZjawY#lUO-4C zDR8@bLwPOLGYHfSbYv``9xJhlj{iYdGT0;!g&V>#8({B+o9JppZbX<3u-0|T;|a;s zVc9X6Xh3-rfbs=&D1XNT>3D8>U*z{?K3w>waBMl9V}eKL=<!Zg<yx|nZ7k=Fs84w< z@v?f^6LK~6@&_f4W9j|8v+Vj;;YykW&9X<7SF$+pmR-A%emnC}-v89GYi1(yrmUO# zgN^~<(Ta2FA$@Y9p0fV4Pc_5_3!nC7<|d-_Sf%#|8!5&<=+d!2;=P*s7P*NR!S{qm z1>LL|TcCl;GoGu(9{<i0xRPEMe#>)(r~DgFWm^C?;2u6in)dPi6H(DFxu^8EUs^2| z$zocXt@F5)nlF?!u151nX|m3>Qfj`Km3a=+iHeo08d&Sff!|)`Sm$a|8UM@p6Ql7> A$^ZZW literal 0 HcmV?d00001 diff --git a/scribo/tests/text/Makefile.am b/scribo/tests/toolchain/Makefile.am similarity index 86% copy from scribo/tests/text/Makefile.am copy to scribo/tests/toolchain/Makefile.am index f6cb0e5..826fb86 100644 --- a/scribo/tests/text/Makefile.am +++ b/scribo/tests/toolchain/Makefile.am @@ -1,4 +1,4 @@ -# Copyright (C) 2009 EPITA Research and Development Laboratory (LRDE). +# Copyright (C) 2010 EPITA Research and Development Laboratory (LRDE). # # This file is part of Olena. # @@ -19,6 +19,5 @@ include $(top_srcdir)/scribo/tests/tests.mk -check_PROGRAMS = - -TESTS = $(check_PROGRAMS) +SUBDIRS = \ + nepomuk \ No newline at end of file diff --git a/scribo/tests/filter/Makefile.am b/scribo/tests/toolchain/nepomuk/Makefile.am similarity index 56% copy from scribo/tests/filter/Makefile.am copy to scribo/tests/toolchain/nepomuk/Makefile.am index a023e4e..4bce3bd 100644 --- a/scribo/tests/filter/Makefile.am +++ b/scribo/tests/toolchain/nepomuk/Makefile.am @@ -1,4 +1,4 @@ -# Copyright (C) 2009 EPITA Research and Development Laboratory (LRDE). +# Copyright (C) 2010 EPITA Research and Development Laboratory (LRDE). # # This file is part of Olena. # @@ -19,12 +19,27 @@ include $(top_srcdir)/scribo/tests/tests.mk -check_PROGRAMS = \ - objects_with_holes \ - small_and_large_bboxes +check_PROGRAMS = -objects_with_holes_SOURCES = objects_with_holes.cc -small_and_large_bboxes_SOURCES = small_and_large_bboxes.cc + +if HAVE_QT +if HAVE_TESSERACT + +check_PROGRAMS += text_extraction +text_extraction_SOURCES = text_extraction.cc +text_extraction_CXXFLAGS = $(QT_CXXFLAGS) $(AM_CXXFLAGS) +text_extraction_CPPFLAGS = $(QT_CPPFLAGS) $(AM_CPPFLAGS) \ + $(TESSERACT_CPPFLAGS) \ + $(TIFF_CPPFLAGS) +text_extraction_LDFLAGS = $(QT_LDFLAGS) $(LDFLAGS) \ + $(TESSERACT_LDFLAGS) \ + $(TIFF_LDFLAGS) \ + -lpthread +text_extraction_LDADD = $(QT_LIBS) $(LDADD) + +endif HAVE_TESSERACT +endif HAVE_QT + TESTS = $(check_PROGRAMS) diff --git a/milena/mln/fun/n2v/all.hh b/scribo/tests/toolchain/nepomuk/text_extraction.cc similarity index 75% copy from milena/mln/fun/n2v/all.hh copy to scribo/tests/toolchain/nepomuk/text_extraction.cc index 0e0e55c..c1b51ea 100644 --- a/milena/mln/fun/n2v/all.hh +++ b/scribo/tests/toolchain/nepomuk/text_extraction.cc @@ -23,29 +23,22 @@ // exception does not however invalidate any other reasons why the // executable file might be covered by the GNU General Public License. -#ifndef MLN_FUN_N2V_ALL_HH -# define MLN_FUN_N2V_ALL_HH - /// \file /// -/// File that includes all functions from nil to value. - +/// Test of scribo::toolchain::nepomuk::text_extraction -namespace mln -{ +#include <QtGui/QImage> +#include <QtCore> +#include <scribo/toolchain/nepomuk/text_extraction.hh> - namespace fun - { +#include <scribo/tests/data.hh> - /// \brief Namespace of functions from nil to value. - /// - /// \ingroup modfun - namespace n2v {} +int main() +{ + QImage ima(SCRIBO_IMG_DIR "/wildly.pbm"); + QSet<QString> words = scribo::toolchain::nepomuk::text_extraction(ima); - } + mln_assertion(words.size() == 1); + mln_assertion(words.contains("Wildly")); + return 0; } - - -# include <mln/fun/n2v/white_gaussian.hh> - -#endif // ! MLN_FUN_N2V_ALL_HH diff --git a/scribo/text/recognition.hh b/scribo/text/recognition.hh index f8d8f4f..44533e9 100644 --- a/scribo/text/recognition.hh +++ b/scribo/text/recognition.hh @@ -104,8 +104,6 @@ namespace scribo { trace::entering("scribo::text::recognition"); - mln_precondition(lines.is_valid()); - // Initialize Tesseract. TessBaseAPI::InitWithLanguage(NULL, NULL, language, NULL, false, 0, NULL); diff --git a/scribo/toolchain/nepomuk/text_extraction.hh b/scribo/toolchain/nepomuk/text_extraction.hh new file mode 100644 index 0000000..ed486f5 --- /dev/null +++ b/scribo/toolchain/nepomuk/text_extraction.hh @@ -0,0 +1,160 @@ +// Copyright (C) 2010 EPITA Research and Development Laboratory (LRDE) +// +// This file is part of Olena. +// +// Olena is free software: you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free +// Software Foundation, version 2 of the License. +// +// Olena is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Olena. If not, see <http://www.gnu.org/licenses/>. +// +// As a special exception, you may use this file as part of a free +// software project without restriction. Specifically, if other files +// instantiate templates or use macros or inline functions from this +// file, or you compile this file and link it with other files to produce +// an executable, this file does not by itself cause the resulting +// executable to be covered by the GNU General Public License. This +// exception does not however invalidate any other reasons why the +// executable file might be covered by the GNU General Public License. + + +#ifndef SCRIBO_TOOLCHAIN_TEXT_EXTRACTION_HH +# define SCRIBO_TOOLCHAIN_TEXT_EXTRACTION_HH + +/// \file +/// +/// Extract text from a document. + + +# include <QtCore/QString> +# include <QtCore/QStringList> +# include <QtGui/QImage> + +# include <mln/core/image/image2d.hh> +# include <mln/data/transform.hh> +# include <mln/logical/not.hh> +# include <mln/value/qt/rgb32.hh> +# include <mln/fun/v2v/qt_rgb_to_int_u.hh> + +# include <scribo/convert/from_qimage.hh> +# include <scribo/binarization/sauvola_ms.hh> +# include <scribo/preprocessing/deskew.hh> +# include <scribo/toolchain/text_in_doc.hh> + + +namespace scribo +{ + + namespace toolchain + { + + namespace nepomuk + { + + /*! \brief Extract text from a document. + + This is a convenient routine to be used in Nepomuk. + + + + \param[in] ima A document image. The + + \return A set of recognized words. + + */ + QSet<QString> + text_extraction(const QImage& input); + + +# ifndef MLN_INCLUDE_ONLY + + QSet<QString> + text_extraction(const QImage& input) + { + trace::entering("scribo::toolchain::nepomuk::text_extraction"); + + mln_precondition(!input.isNull()); + + typedef image2d<scribo::def::lbl_type> L; + + // Convert image to Milena's format. + mln::image2d<mln::value::qt::rgb32> + input_mln = scribo::convert::from_qimage(input); + + image2d<bool> input_bin; + + + // Preprocess + { + // Convert to Gray level image. + image2d<value::int_u8> + input_gl = data::transform(input_mln, + mln::fun::v2v::qt_rgb_to_int_u<8>()); + + // Deskew if needed. + input_gl = preprocessing::deskew(input_gl); + + // Binarize foreground to use it in the processing chain. + input_bin = scribo::binarization::sauvola_ms(input_gl, 101, 3); + } + + + + + line_set<L> lines_bg, lines_fg; + // Process + { + // Run document toolchain. + lines_bg = scribo::toolchain::text_in_doc(input_bin, false, false); + + // Negate document. + logical::not_inplace(input_bin); + + // Run document toolchain. + lines_fg = scribo::toolchain::text_in_doc(input_bin, false, false); + } + + + QSet<QString> output; + + // Construct output + { + QTextCodec *codec = QTextCodec::codecForName("UTF-8"); + + QString tmp_out; + QTextStream stream(&tmp_out, QIODevice::WriteOnly); + stream.setCodec("UTF-8"); + + for_all_lines(l, lines_bg) + if (lines_bg(l).has_text()) + stream << " " << codec->toUnicode(lines_bg(l).text().c_str()); + + for_all_lines(l, lines_fg) + if (lines_fg(l).has_text()) + stream << " " << codec->toUnicode(lines_fg(l).text().c_str()); + + QStringList list = tmp_out.split(' ', QString::SkipEmptyParts); + + output = QSet<QString>::fromList(list); + } + + trace::exiting("scribo::toolchain::nepomuk::text_extraction"); + return output; + } + +# endif // ! MLN_INCLUDE_ONLY + + } // end of namespace scribo::toolchain::nepomuk + + } // end of namespace scribo::toolchain + +} // end of namespace scribo + + +#endif // ! SCRIBO_TOOLCHAIN_TEXT_EXTRACTION_HH -- 1.5.6.5
participants (1)
-
Guillaume Lazzara