* convert/from_qimage.hh: New. Convert a QImage to an image2d.
* core/line_info.hh: Add a new member has_text.
* text/recognition.hh: Remove an invalid precondition.
* toolchain/nepomuk/text_extraction.hh: New. Specific toolchain
for Nepomuk.
* tests/Makefile.am: Add toolchain/* subdirs.
* tests/toolchain/Makefile.am,
* tests/toolchain/nepomuk/Makefile.am: New.
* tests/img/wildly.pbm: New. New test image.
* tests/toolchain/nepomuk/text_extraction.cc: New. New test.
---
scribo/ChangeLog | 22 +++
.../crop.hh => convert/from_qimage.hh} | 68 +++++----
scribo/core/line_info.hh | 8 +
scribo/tests/Makefile.am | 1 +
scribo/tests/img/wildly.pbm | Bin 0 -> 2208 bytes
scribo/tests/{text => toolchain}/Makefile.am | 7 +-
.../{filter => toolchain/nepomuk}/Makefile.am | 27 +++-
.../tests/toolchain/nepomuk/text_extraction.cc | 31 ++---
scribo/text/recognition.hh | 2 -
scribo/toolchain/nepomuk/text_extraction.hh | 160 ++++++++++++++++++++
10 files changed, 265 insertions(+), 61 deletions(-)
copy scribo/{preprocessing/crop.hh => convert/from_qimage.hh} (55%)
create mode 100644 scribo/tests/img/wildly.pbm
copy scribo/tests/{text => toolchain}/Makefile.am (86%)
copy scribo/tests/{filter => toolchain/nepomuk}/Makefile.am (56%)
copy milena/mln/fun/n2v/all.hh => scribo/tests/toolchain/nepomuk/text_extraction.cc (75%)
create mode 100644 scribo/toolchain/nepomuk/text_extraction.hh
diff --git a/scribo/ChangeLog b/scribo/ChangeLog
index de7acfa..a473c52 100644
--- a/scribo/ChangeLog
+++ b/scribo/ChangeLog
@@ -1,3 +1,25 @@
+2010-06-03 Guillaume Lazzara <z(a)lrde.epita.fr>
+
+ Add a specific toolchain for Nepomuk integration.
+
+ * convert/from_qimage.hh: New. Convert a QImage to an image2d.
+
+ * core/line_info.hh: Add a new member has_text.
+
+ * text/recognition.hh: Remove an invalid precondition.
+
+ * toolchain/nepomuk/text_extraction.hh: New. Specific toolchain
+ for Nepomuk.
+
+ * tests/Makefile.am: Add toolchain/* subdirs.
+
+ * tests/toolchain/Makefile.am,
+ * tests/toolchain/nepomuk/Makefile.am: New.
+
+ * tests/img/wildly.pbm: New. New test image.
+
+ * tests/toolchain/nepomuk/text_extraction.cc: New. New test.
+
2010-05-25 Guillaume Lazzara <z(a)lrde.epita.fr>
Cleanup sample tools.
diff --git a/scribo/preprocessing/crop.hh b/scribo/convert/from_qimage.hh
similarity index 55%
copy from scribo/preprocessing/crop.hh
copy to scribo/convert/from_qimage.hh
index c289f86..b297c2f 100644
--- a/scribo/preprocessing/crop.hh
+++ b/scribo/convert/from_qimage.hh
@@ -23,62 +23,70 @@
// exception does not however invalidate any other reasons why the
// executable file might be covered by the GNU General Public License.
-#ifndef SCRIBO_PREPROCESSING_CROP_HH
-# define SCRIBO_PREPROCESSING_CROP_HH
-# include <mln/core/concept/image.hh>
-# include <mln/data/paste.hh>
+
+#ifndef SCRIBO_CONVERT_FROM_QIMAGE_HH
+# define SCRIBO_CONVERT_FROM_QIMAGE_HH
/// \file
///
-/// \brief Crop an image preserving the localization.
+/// Convert a QImage to a Milena image2d.
-namespace scribo
-{
+# include <QtGui/QImage>
- namespace preprocessing
- {
+# include <mln/value/qt/rgb32.hh>
- using namespace mln;
+# if QT_VERSION < 0x040000
+# error "Qt library too old. You need at least Qt 4.x."
+# endif // ! QT_VERSION
- /*! \brief crop an image preserving the localization.
+namespace scribo
+{
- \param[in] input An image.
- \param[in] domain A region of interest.
+ namespace convert
+ {
- \return An image defined on the domain \p domain with the
- corresponding data copied from \p input.
+ /*! \brief Convert a QImage to mln::image2d.
+ \param[in] ima A QImage. Prefer using QImage::Format_RGB32
+ image format to avoid conversions.
+
+ \return An RGB32 (mln::value::qt::rgb32) 2D image in Milena's format.
*/
- template <typename I>
- mln_concrete(I)
- crop(const Image<I>& input, const mln_box(I)& domain);
+ mln::image2d<mln::value::qt::rgb32>
+ from_qimage(const QImage& ima);
# ifndef MLN_INCLUDE_ONLY
-
- template <typename I>
- mln_concrete(I)
- crop(const Image<I>& input, const mln_box(I)& domain)
+ mln::image2d<mln::value::qt::rgb32>
+ from_qimage(const QImage& ima)
{
- trace::entering("scribo::preprocessing::crop");
- mln_assertion(exact(input).is_valid());
+ QImage tmp = ima;
+
+ if (ima.format() != QImage::Format_RGB32)
+ tmp = ima.convertToFormat(QImage::Format_RGB32);
- mln_concrete(I) output(domain);
- data::paste(input | domain, output);
+ const int
+ nrows = tmp.height(),
+ ncols = tmp.width();
+
+ mln::image2d<mln::value::qt::rgb32> output(nrows, ncols, 0);
+
+ QImage qima(ncols, nrows, QImage::Format_RGB32);
+ std::memcpy(output.buffer(),
+ tmp.scanLine(0),
+ output.nelements() * 4);
- trace::exiting("scribo::preprocessing::crop");
return output;
}
-
# endif // ! MLN_INCLUDE_ONLY
- } // end of namespace scribo::preprocessing
+ } // end of namespace scribo::convert
} // end of namespace scribo
-#endif // ! SCRIBO_PREPROCESSING_CROP_HH
+#endif // ! SCRIBO_CONVERT_FROM_QIMAGE_HH
diff --git a/scribo/core/line_info.hh b/scribo/core/line_info.hh
index d0066c0..55d1430 100644
--- a/scribo/core/line_info.hh
+++ b/scribo/core/line_info.hh
@@ -125,6 +125,7 @@ namespace scribo
bool indented() const;
+ bool has_text() const;
const std::string& text() const;
void update_text(const std::string& str);
@@ -587,6 +588,13 @@ namespace scribo
}
template <typename L>
+ bool
+ line_info<L>::has_text() const
+ {
+ return !text_.empty();
+ }
+
+ template <typename L>
const std::string&
line_info<L>::text() const
{
diff --git a/scribo/tests/Makefile.am b/scribo/tests/Makefile.am
index 61570c8..4ef9ca1 100644
--- a/scribo/tests/Makefile.am
+++ b/scribo/tests/Makefile.am
@@ -25,6 +25,7 @@ SUBDIRS = \
preprocessing \
table \
text \
+ toolchain \
unit_test
# Regen files recursively.
diff --git a/scribo/tests/img/wildly.pbm b/scribo/tests/img/wildly.pbm
new file mode 100644
index 0000000000000000000000000000000000000000..518fbec9b0814bb982ac9d901da02e5af1447d6d
GIT binary patch
literal 2208
zcmeIx-)qxQ6bJAV0|v@iUVSLb1z!}gYt=$G#=>w_go&dJ@v*7RXk|-6R>$^Wl0Pl}
z1^x?x{Q>693_`Bw+}mI2Qt&|(!CM&je)hdL$vMgKi`ql>ve!L0dDq)zAKtS|+rJpQ
z4tw5cHLa(gKA#WVSL3TEU1!AHQ}(RcY~!zmn_k<F?OPLj$OhN!?fA@gEY@l?85>O|
z-tP9cOVT*=hjzpE#uKY?>f^5N@tcza*0V2c%l|lHmNR6B_N9I1dgrz?VMo@$#Y0{H
znmsz|9d;krTAeoQv}^Ex{NW1l7{JK2V&$?5xtZ&~>R@7C&6U+W80bn4E3QS=uE)B}
z2T&TuZ*qA)<gjwS9?^C0I>0~e9i8sqh~@Jt5rD=co*A6e^YRqk;6SZjawY#lUO-4C
zDR8@bLwPOLGYHfSbYv``9xJhlj{iYdGT0;!g&V>#8({B+o9JppZbX<3u-0|T;|a;s
zVc9X6Xh3-rfbs=&D1XNT>3D8>U*z{?K3w>waBMl9V}eKL=<!Zg<yx|nZ7k=Fs84w<
z@v?f^6LK~6@&_f4W9j|8v+Vj;;YykW&9X<7SF$+pmR-A%emnC}-v89GYi1(yrmUO#
zgN^~<(Ta2FA$@Y9p0fV4Pc_5_3!nC7<|d-_Sf%#|8!5&<=+d!2;=P*s7P*NR!S{qm
z1>LL|TcCl;GoGu(9{<i0xRPEMe#>)(r~DgFWm^C?;2u6in)dPi6H(DFxu^8EUs^2|
z$zocXt@F5)nlF?!u151nX|m3>Qfj`Km3a=+iHeo08d&Sff!|)`Sm$a|8UM@p6Ql7>
A$^ZZW
literal 0
HcmV?d00001
diff --git a/scribo/tests/text/Makefile.am b/scribo/tests/toolchain/Makefile.am
similarity index 86%
copy from scribo/tests/text/Makefile.am
copy to scribo/tests/toolchain/Makefile.am
index f6cb0e5..826fb86 100644
--- a/scribo/tests/text/Makefile.am
+++ b/scribo/tests/toolchain/Makefile.am
@@ -1,4 +1,4 @@
-# Copyright (C) 2009 EPITA Research and Development Laboratory (LRDE).
+# Copyright (C) 2010 EPITA Research and Development Laboratory (LRDE).
#
# This file is part of Olena.
#
@@ -19,6 +19,5 @@
include $(top_srcdir)/scribo/tests/tests.mk
-check_PROGRAMS =
-
-TESTS = $(check_PROGRAMS)
+SUBDIRS = \
+ nepomuk
\ No newline at end of file
diff --git a/scribo/tests/filter/Makefile.am b/scribo/tests/toolchain/nepomuk/Makefile.am
similarity index 56%
copy from scribo/tests/filter/Makefile.am
copy to scribo/tests/toolchain/nepomuk/Makefile.am
index a023e4e..4bce3bd 100644
--- a/scribo/tests/filter/Makefile.am
+++ b/scribo/tests/toolchain/nepomuk/Makefile.am
@@ -1,4 +1,4 @@
-# Copyright (C) 2009 EPITA Research and Development Laboratory (LRDE).
+# Copyright (C) 2010 EPITA Research and Development Laboratory (LRDE).
#
# This file is part of Olena.
#
@@ -19,12 +19,27 @@
include $(top_srcdir)/scribo/tests/tests.mk
-check_PROGRAMS = \
- objects_with_holes \
- small_and_large_bboxes
+check_PROGRAMS =
-objects_with_holes_SOURCES = objects_with_holes.cc
-small_and_large_bboxes_SOURCES = small_and_large_bboxes.cc
+
+if HAVE_QT
+if HAVE_TESSERACT
+
+check_PROGRAMS += text_extraction
+text_extraction_SOURCES = text_extraction.cc
+text_extraction_CXXFLAGS = $(QT_CXXFLAGS) $(AM_CXXFLAGS)
+text_extraction_CPPFLAGS = $(QT_CPPFLAGS) $(AM_CPPFLAGS) \
+ $(TESSERACT_CPPFLAGS) \
+ $(TIFF_CPPFLAGS)
+text_extraction_LDFLAGS = $(QT_LDFLAGS) $(LDFLAGS) \
+ $(TESSERACT_LDFLAGS) \
+ $(TIFF_LDFLAGS) \
+ -lpthread
+text_extraction_LDADD = $(QT_LIBS) $(LDADD)
+
+endif HAVE_TESSERACT
+endif HAVE_QT
+
TESTS = $(check_PROGRAMS)
diff --git a/milena/mln/fun/n2v/all.hh b/scribo/tests/toolchain/nepomuk/text_extraction.cc
similarity index 75%
copy from milena/mln/fun/n2v/all.hh
copy to scribo/tests/toolchain/nepomuk/text_extraction.cc
index 0e0e55c..c1b51ea 100644
--- a/milena/mln/fun/n2v/all.hh
+++ b/scribo/tests/toolchain/nepomuk/text_extraction.cc
@@ -23,29 +23,22 @@
// exception does not however invalidate any other reasons why the
// executable file might be covered by the GNU General Public License.
-#ifndef MLN_FUN_N2V_ALL_HH
-# define MLN_FUN_N2V_ALL_HH
-
/// \file
///
-/// File that includes all functions from nil to value.
-
+/// Test of scribo::toolchain::nepomuk::text_extraction
-namespace mln
-{
+#include <QtGui/QImage>
+#include <QtCore>
+#include <scribo/toolchain/nepomuk/text_extraction.hh>
- namespace fun
- {
+#include <scribo/tests/data.hh>
- /// \brief Namespace of functions from nil to value.
- ///
- /// \ingroup modfun
- namespace n2v {}
+int main()
+{
+ QImage ima(SCRIBO_IMG_DIR "/wildly.pbm");
+ QSet<QString> words = scribo::toolchain::nepomuk::text_extraction(ima);
- }
+ mln_assertion(words.size() == 1);
+ mln_assertion(words.contains("Wildly"));
+ return 0;
}
-
-
-# include <mln/fun/n2v/white_gaussian.hh>
-
-#endif // ! MLN_FUN_N2V_ALL_HH
diff --git a/scribo/text/recognition.hh b/scribo/text/recognition.hh
index f8d8f4f..44533e9 100644
--- a/scribo/text/recognition.hh
+++ b/scribo/text/recognition.hh
@@ -104,8 +104,6 @@ namespace scribo
{
trace::entering("scribo::text::recognition");
- mln_precondition(lines.is_valid());
-
// Initialize Tesseract.
TessBaseAPI::InitWithLanguage(NULL, NULL, language, NULL, false, 0, NULL);
diff --git a/scribo/toolchain/nepomuk/text_extraction.hh b/scribo/toolchain/nepomuk/text_extraction.hh
new file mode 100644
index 0000000..ed486f5
--- /dev/null
+++ b/scribo/toolchain/nepomuk/text_extraction.hh
@@ -0,0 +1,160 @@
+// Copyright (C) 2010 EPITA Research and Development Laboratory (LRDE)
+//
+// This file is part of Olena.
+//
+// Olena is free software: you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free
+// Software Foundation, version 2 of the License.
+//
+// Olena is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with Olena. If not, see <http://www.gnu.org/licenses/>.
+//
+// As a special exception, you may use this file as part of a free
+// software project without restriction. Specifically, if other files
+// instantiate templates or use macros or inline functions from this
+// file, or you compile this file and link it with other files to produce
+// an executable, this file does not by itself cause the resulting
+// executable to be covered by the GNU General Public License. This
+// exception does not however invalidate any other reasons why the
+// executable file might be covered by the GNU General Public License.
+
+
+#ifndef SCRIBO_TOOLCHAIN_TEXT_EXTRACTION_HH
+# define SCRIBO_TOOLCHAIN_TEXT_EXTRACTION_HH
+
+/// \file
+///
+/// Extract text from a document.
+
+
+# include <QtCore/QString>
+# include <QtCore/QStringList>
+# include <QtGui/QImage>
+
+# include <mln/core/image/image2d.hh>
+# include <mln/data/transform.hh>
+# include <mln/logical/not.hh>
+# include <mln/value/qt/rgb32.hh>
+# include <mln/fun/v2v/qt_rgb_to_int_u.hh>
+
+# include <scribo/convert/from_qimage.hh>
+# include <scribo/binarization/sauvola_ms.hh>
+# include <scribo/preprocessing/deskew.hh>
+# include <scribo/toolchain/text_in_doc.hh>
+
+
+namespace scribo
+{
+
+ namespace toolchain
+ {
+
+ namespace nepomuk
+ {
+
+ /*! \brief Extract text from a document.
+
+ This is a convenient routine to be used in Nepomuk.
+
+
+
+ \param[in] input A document image.
+
+ \return A set of recognized words.
+
+ */
+ QSet<QString>
+ text_extraction(const QImage& input);
+
+
+# ifndef MLN_INCLUDE_ONLY
+
+ QSet<QString>
+ text_extraction(const QImage& input)
+ {
+ trace::entering("scribo::toolchain::nepomuk::text_extraction");
+
+ mln_precondition(!input.isNull());
+
+ typedef image2d<scribo::def::lbl_type> L;
+
+ // Convert image to Milena's format.
+ mln::image2d<mln::value::qt::rgb32>
+ input_mln = scribo::convert::from_qimage(input);
+
+ image2d<bool> input_bin;
+
+
+ // Preprocess
+ {
+ // Convert to Gray level image.
+ image2d<value::int_u8>
+ input_gl = data::transform(input_mln,
+ mln::fun::v2v::qt_rgb_to_int_u<8>());
+
+ // Deskew if needed.
+ input_gl = preprocessing::deskew(input_gl);
+
+ // Binarize foreground to use it in the processing chain.
+ input_bin = scribo::binarization::sauvola_ms(input_gl, 101, 3);
+ }
+
+
+
+
+ line_set<L> lines_bg, lines_fg;
+ // Process
+ {
+ // Run document toolchain.
+ lines_bg = scribo::toolchain::text_in_doc(input_bin, false, false);
+
+ // Negate document.
+ logical::not_inplace(input_bin);
+
+ // Run document toolchain.
+ lines_fg = scribo::toolchain::text_in_doc(input_bin, false, false);
+ }
+
+
+ QSet<QString> output;
+
+ // Construct output
+ {
+ QTextCodec *codec = QTextCodec::codecForName("UTF-8");
+
+ QString tmp_out;
+ QTextStream stream(&tmp_out, QIODevice::WriteOnly);
+ stream.setCodec("UTF-8");
+
+ for_all_lines(l, lines_bg)
+ if (lines_bg(l).has_text())
+ stream << " " << codec->toUnicode(lines_bg(l).text().c_str());
+
+ for_all_lines(l, lines_fg)
+ if (lines_fg(l).has_text())
+ stream << " " << codec->toUnicode(lines_fg(l).text().c_str());
+
+ QStringList list = tmp_out.split(' ', QString::SkipEmptyParts);
+
+ output = QSet<QString>::fromList(list);
+ }
+
+ trace::exiting("scribo::toolchain::nepomuk::text_extraction");
+ return output;
+ }
+
+# endif // ! MLN_INCLUDE_ONLY
+
+ } // end of namespace scribo::toolchain::nepomuk
+
+ } // end of namespace scribo::toolchain
+
+} // end of namespace scribo
+
+
+#endif // ! SCRIBO_TOOLCHAIN_TEXT_EXTRACTION_HH
--
1.5.6.5