* scribo/core/paragraph_set.hh,
* scribo/core/document.hh: Add new methods.
* scribo/core/macros.hh: Add a missing macro.
* scribo/io/xml/save.hh,
* scribo/primitive/extract/elements.hh: Make use of the methods in
document class.
* src/pbm_text_in_doc.cc: Add recognized language as an option.
* tests/toolchain/nepomuk/text_extraction.cc: Make the test
case-insensitive.
---
scribo/ChangeLog | 18 +++++++++
scribo/scribo/core/document.hh | 41 +++++++++++++++------
scribo/scribo/core/macros.hh | 3 ++
scribo/scribo/core/paragraph_set.hh | 20 ++++++++++
scribo/scribo/io/xml/save.hh | 20 +++++-----
scribo/scribo/primitive/extract/elements.hh | 5 ++-
scribo/src/pbm_text_in_doc.cc | 40 ++++++++++++--------
scribo/tests/toolchain/nepomuk/text_extraction.cc | 4 +-
8 files changed, 110 insertions(+), 41 deletions(-)
diff --git a/scribo/ChangeLog b/scribo/ChangeLog
index 3ec57f0..c947550 100644
--- a/scribo/ChangeLog
+++ b/scribo/ChangeLog
@@ -1,5 +1,23 @@
2011-01-25 Guillaume Lazzara <z(a)lrde.epita.fr>
+ Small fixes in Scribo.
+
+ * scribo/core/paragraph_set.hh,
+ * scribo/core/document.hh: Add new methods.
+
+ * scribo/core/macros.hh: Add a missing macro.
+
+ * scribo/io/xml/save.hh,
+ * scribo/primitive/extract/elements.hh: Make use of the methods in
+ document class.
+
+ * src/pbm_text_in_doc.cc: Add recognized language as an option.
+
+ * tests/toolchain/nepomuk/text_extraction.cc: Make the test
+ case-insensitive.
+
+2011-01-25 Guillaume Lazzara <z(a)lrde.epita.fr>
+
Add an option to choose the recognized language.
* scribo/toolchain/nepomuk/text_extraction.hh,
diff --git a/scribo/scribo/core/document.hh b/scribo/scribo/core/document.hh
index f4a78ff..b547da4 100644
--- a/scribo/scribo/core/document.hh
+++ b/scribo/scribo/core/document.hh
@@ -1,4 +1,5 @@
-// Copyright (C) 2010 EPITA Research and Development Laboratory (LRDE)
+// Copyright (C) 2010, 2011 EPITA Research and Development Laboratory
+// (LRDE)
//
// This file is part of Olena.
//
@@ -59,15 +60,24 @@ namespace scribo
bool is_valid() const;
- const line_set<L>& text() const;
+ /*! \brief Check whether this document contains text.
+
+ If it returns true, that document contains paragraphs, lines and
+ text components.
+
+ */
bool has_text() const;
- void set_text(const line_set<L>& line);
+
+ mln::def::coord height() const;
+ mln::def::coord width() const;
+
+ const line_set<L>& lines() const;
const paragraph_set<L>& paragraphs() const;
void set_paragraphs(const paragraph_set<L>& parset);
- const component_set<L>& elements() const;
bool has_elements() const;
+ const component_set<L>& elements() const;
void set_elements(const component_set<L>& elements);
const mln::image2d<value::rgb8>& image() const;
@@ -150,10 +160,18 @@ namespace scribo
template <typename L>
- const line_set<L>&
- document<L>::text() const
+ mln::def::coord
+ document<L>::width() const
{
- return lines_;
+ return image_.ncols();
+ }
+
+
+ template <typename L>
+ mln::def::coord
+ document<L>::height() const
+ {
+ return image_.nrows();
}
@@ -161,17 +179,18 @@ namespace scribo
bool
document<L>::has_text() const
{
- return lines_.is_valid();
+ return parset_.is_valid();
}
template <typename L>
- void
- document<L>::set_text(const line_set<L>& line)
+ const line_set<L>&
+ document<L>::lines() const
{
- lines_ = line;
+ return parset_.lines();
}
+
template <typename L>
const paragraph_set<L>&
document<L>::paragraphs() const
diff --git a/scribo/scribo/core/macros.hh b/scribo/scribo/core/macros.hh
index 1060358..887539f 100644
--- a/scribo/scribo/core/macros.hh
+++ b/scribo/scribo/core/macros.hh
@@ -62,4 +62,7 @@
# define for_all_anchors(E, S) \
for_all_elements(E, S)
+# define for_all_paragraph_lines(E, S) \
+ for_all_elements(E, S)
+
#endif // ! SCRIBO_CORE_MACROS_HH
diff --git a/scribo/scribo/core/paragraph_set.hh b/scribo/scribo/core/paragraph_set.hh
index afb59c5..355eaa9 100644
--- a/scribo/scribo/core/paragraph_set.hh
+++ b/scribo/scribo/core/paragraph_set.hh
@@ -53,9 +53,13 @@ namespace scribo
paragraph_info<L>& operator()(unsigned i);
const paragraph_info<L>& operator()(unsigned i) const;
+ bool is_valid() const;
+
+ const line_set<L>& lines() const;
private:
mln::util::array<paragraph_info<L> > pars_;
+ line_set<L> lines_;
};
@@ -82,6 +86,7 @@ namespace scribo
paragraph_set<L>::paragraph_set(const line_links<L>& llinks, unsigned npars)
: pars_(npars + 1, paragraph_info<L>(llinks))
{
+ lines_ = llinks.lines();
}
template <typename L>
@@ -106,6 +111,21 @@ namespace scribo
}
+ template <typename L>
+ bool
+ paragraph_set<L>::is_valid() const
+ {
+ return !pars_.is_empty();
+ }
+
+
+ template <typename L>
+ const line_set<L>&
+ paragraph_set<L>::lines() const
+ {
+ return lines_;
+ }
+
namespace make
{
diff --git a/scribo/scribo/io/xml/save.hh b/scribo/scribo/io/xml/save.hh
index 1bcdd6f..41d4fef 100644
--- a/scribo/scribo/io/xml/save.hh
+++ b/scribo/scribo/io/xml/save.hh
@@ -130,9 +130,6 @@ namespace scribo
abort();
}
- const line_set<L>& lines = doc.text();
- const paragraph_set<L>& parset = doc.paragraphs();
-
std::map<char, std::string> html_map;
html_map['\"'] = "&quot;";
html_map['<'] = "&lt;";
@@ -150,13 +147,16 @@ namespace scribo
file << " </pcMetadata>" << std::endl;
file << " <page image_filename=\"" << doc.filename()
- << "\" image_width=\"" << lines.components().labeled_image().ncols()
- << "\" image_height=\"" << lines.components().labeled_image().nrows()
+ << "\" image_width=\"" << doc.width()
+ << "\" image_height=\"" << doc.height()
<< "\">" << std::endl;
// Text
if (doc.has_text())
{
+ const line_set<L>& lines = doc.lines();
+ const paragraph_set<L>& parset = doc.paragraphs();
+
for_all_paragraphs(p, parset)
{
const mln::util::array<line_id_t>& line_ids = parset(p).line_ids();
@@ -224,9 +224,6 @@ namespace scribo
abort();
}
- const line_set<L>& lines = doc.text();
- const paragraph_set<L>& parset = doc.paragraphs();
-
std::map<char, std::string> html_map;
html_map['\"'] = "&quot;";
html_map['<'] = "&lt;";
@@ -244,13 +241,16 @@ namespace scribo
file << " </pcMetadata>" << std::endl;
file << " <page image_filename=\"" << doc.filename()
- << "\" image_width=\"" << lines.components().labeled_image().ncols()
- << "\" image_height=\"" << lines.components().labeled_image().nrows()
+ << "\" image_width=\"" << doc.width()
+ << "\" image_height=\"" << doc.height()
<< "\">" << std::endl;
// Text
if (doc.has_text())
{
+ const line_set<L>& lines = doc.lines();
+ const paragraph_set<L>& parset = doc.paragraphs();
+
for_all_paragraphs(p, parset)
{
const mln::util::array<line_id_t>& line_ids = parset(p).line_ids();
diff --git a/scribo/scribo/primitive/extract/elements.hh b/scribo/scribo/primitive/extract/elements.hh
index 2e6a0cb..ddf2c92 100644
--- a/scribo/scribo/primitive/extract/elements.hh
+++ b/scribo/scribo/primitive/extract/elements.hh
@@ -1,4 +1,5 @@
-// Copyright (C) 2010 EPITA Research and Development Laboratory (LRDE)
+// Copyright (C) 2010, 2011 EPITA Research and Development Laboratory
+// (LRDE)
//
// This file is part of Olena.
//
@@ -114,7 +115,7 @@ namespace scribo
mln_precondition(doc.is_valid());
mln_precondition(input.is_valid());
- const line_set<L>& lines = doc.text();
+ const line_set<L>& lines = doc.lines();
// Element extraction
diff --git a/scribo/src/pbm_text_in_doc.cc b/scribo/src/pbm_text_in_doc.cc
index 2240f42..721ff47 100644
--- a/scribo/src/pbm_text_in_doc.cc
+++ b/scribo/src/pbm_text_in_doc.cc
@@ -1,5 +1,5 @@
-// Copyright (C) 2009, 2010 EPITA Research and Development Laboratory
-// (LRDE)
+// Copyright (C) 2009, 2010, 2011 EPITA Research and Development
+// Laboratory (LRDE)
//
// This file is part of Olena.
//
@@ -49,7 +49,6 @@
#include <scribo/preprocessing/crop_without_localization.hh>
-#include <scribo/io/xml/save.hh>
#include <scribo/io/text_boxes/save.hh>
@@ -65,6 +64,7 @@ for the background." },
{ "pmin_col", "Col index of the top left corner of the Region of interest." },
{ "pmax_row", "Row index of the bottom right corner of the Region of interest." },
{ "pmax_col", "Col index of the bottom right corner of the Region of interest." },
+ { "language", "Language to be used for the text recognition. [eng|fra] (Default: eng)" },
{ "find_lines", "Find vertical lines. (Default 1)" },
{ "find_whitespaces", "Find whitespaces separators. (Default 1)" },
{ "debug_dir", "Output directory for debug image" },
@@ -77,16 +77,16 @@ int main(int argc, char* argv[])
using namespace scribo;
using namespace mln;
- if (argc != 3 && argc != 4 && argc != 5 && argc != 8 && argc != 9)
+ if (argc < 3 || argc > 12)
return scribo::debug::usage(argv,
"Find text lines using left/right validation and display x-height in a binarized article.",
- "input.pbm out.txt <denoise_enabled> [<pmin_row> <pmin_col> <pmax_row> <pmax_col>] <find_lines> <find_whitespaces> <debug_dir>",
+ "input.pbm out.txt <denoise_enabled> [<pmin_row> <pmin_col> <pmax_row> <pmax_col>] <language> <find_lines> <find_whitespaces> <debug_dir>",
args_desc);
bool debug = false;
// Enable debug output.
- if (argc == 7 || argc == 11)
+ if (argc == 8 || argc == 12)
{
scribo::make::internal::debug_filename_prefix = argv[argc - 1];
debug = true;
@@ -101,7 +101,7 @@ int main(int argc, char* argv[])
// Optional Cropping
point2d crop_shift = literal::origin;
- if (argc >= 11)
+ if (argc >= 12)
{
mln::def::coord
minr = atoi(argv[4]),
@@ -120,13 +120,24 @@ int main(int argc, char* argv[])
bool denoise = (argc > 3 && atoi(argv[3]) != 0);
+ std::string language = "eng";
+ if (argc > 4 && argc < 12)
+ language = argv[4];
+ else if (argc == 12)
+ language = argv[8];
+
bool find_line_seps = true;
- if (argc >= 4 && argc < 11)
- find_line_seps = (atoi(argv[3]) != 0);
+ if (argc > 5 && argc < 12)
+ find_line_seps = (atoi(argv[5]) != 0);
+ else if (argc == 12)
+ find_line_seps = (atoi(argv[9]) != 0);
bool find_whitespace_seps = true;
- if (argc >= 5 && argc < 11)
- find_line_seps = (atoi(argv[4]) != 0);
+ if (argc > 6 && argc < 12)
+ find_whitespace_seps = (atoi(argv[6]) != 0);
+ else if (argc == 12)
+ find_whitespace_seps = (atoi(argv[10]) != 0);
+
std::cout << "Running with the following options :"
<< "find_lines_seps = " << find_line_seps
@@ -136,15 +147,12 @@ int main(int argc, char* argv[])
// Run document toolchain.
line_set<L>
- lines = scribo::toolchain::text_in_doc(input, denoise, find_line_seps,
+ lines = scribo::toolchain::text_in_doc(input, denoise,
+ language, find_line_seps,
find_whitespace_seps, debug);
scribo::document<L> doc;
doc.set_filename(argv[1]);
- doc.set_text(lines);
-
- // Saving results
- scribo::io::xml::save(doc, "out.xml", true);
// Specify shift due to potential previous crop.
scribo::io::text_boxes::save(lines, argv[2], crop_shift);
diff --git a/scribo/tests/toolchain/nepomuk/text_extraction.cc b/scribo/tests/toolchain/nepomuk/text_extraction.cc
index 7191650..eeafd6b 100644
--- a/scribo/tests/toolchain/nepomuk/text_extraction.cc
+++ b/scribo/tests/toolchain/nepomuk/text_extraction.cc
@@ -43,9 +43,9 @@ int main()
QImage ima(SCRIBO_IMG_DIR "/wildly.pbm");
QSet<QString> words = scribo::toolchain::nepomuk::text_extraction(ima);
- words = words.toLower();
mln_assertion(words.size() == 1);
- mln_assertion(words.contains("wildly"));
+ QString word = words.toList().at(0).toLower();
+ mln_assertion(word == "wildly");
return 0;
}
--
1.5.6.5