* demo/xml2doc/templates/pdf/line.xsl: Do not print lines with
confidence <= 60%.
* scribo/core/line_info.hh: Add access to text confidence.
* scribo/io/xml/internal/extended_page_xml_visitor.hh,
* scribo/io/xml/internal/full_xml_visitor.hh: Save text confidence.
* scribo/io/xml/load.hh: Load text confidence.
---
scribo/ChangeLog | 14 ++
scribo/demo/xml2doc/templates/pdf/line.xsl | 157 ++++++++++---------
scribo/scribo/core/line_info.hh | 12 ++-
.../io/xml/internal/extended_page_xml_visitor.hh | 1 +
scribo/scribo/io/xml/internal/full_xml_visitor.hh | 1 +
scribo/scribo/io/xml/load.hh | 1 +
6 files changed, 110 insertions(+), 76 deletions(-)
diff --git a/scribo/ChangeLog b/scribo/ChangeLog
index 5561f60..dbad5ca 100644
--- a/scribo/ChangeLog
+++ b/scribo/ChangeLog
@@ -1,3 +1,17 @@
+2012-09-07 Guillaume Lazzara <z(a)lrde.epita.fr>
+
+ Take OCR confidence into account in document reconstruction.
+
+ * demo/xml2doc/templates/pdf/line.xsl: Do not print lines with
+ confidence <= 60%.
+
+ * scribo/core/line_info.hh: Add access to text confidence.
+
+ * scribo/io/xml/internal/extended_page_xml_visitor.hh,
+ * scribo/io/xml/internal/full_xml_visitor.hh: Save text confidence.
+
+ * scribo/io/xml/load.hh: Load text confidence.
+
2012-05-11 Guillaume Lazzara <z(a)lrde.epita.fr>
Rename point_at_index, delta_index and index_of_point.
diff --git a/scribo/demo/xml2doc/templates/pdf/line.xsl
b/scribo/demo/xml2doc/templates/pdf/line.xsl
index 0c1b9d7..8ce024b 100644
--- a/scribo/demo/xml2doc/templates/pdf/line.xsl
+++ b/scribo/demo/xml2doc/templates/pdf/line.xsl
@@ -124,93 +124,102 @@
</xsl:variable>
<!-- END OF lines coordinates -->
- <fo:block-container position="absolute"
border-width="5mm">
-
- <xsl:attribute name="left">
- <xsl:value-of select="$x1" />px
- </xsl:attribute>
- <xsl:attribute name="top">
- <xsl:value-of select="$y1" />px
- </xsl:attribute>
-
- <xsl:attribute name="right">
- <xsl:value-of select="$xmax" />px
- </xsl:attribute>
- <xsl:attribute name="bottom">
- <xsl:value-of select="$ymax" />px
- </xsl:attribute>
-
-
- <xsl:attribute name="width">
- <xsl:value-of select="$xmax - $x1" />px
- </xsl:attribute>
-
- <xsl:attribute name="color">
- <xsl:value-of select="$colour" />
- </xsl:attribute>
-
- <!-- Adjusting height if font is different from Times. -->
- <xsl:variable name="fsize">
- <xsl:choose>
- <xsl:when test="($a + $d) > (1.16 * (37 * $char_width) div
17)">
- <xsl:value-of select="((37 * $char_width) div 17)" />
- </xsl:when>
- <xsl:otherwise>
- <xsl:value-of select="($a + $d)" />
- </xsl:otherwise>
- </xsl:choose>
- </xsl:variable>
-
- <!-- if necessary, put letter-spacing="-Npt" ~ -3 <= N <= -1
- in fo:block-->
-
- <!-- text-align-last="justify" will help justifying and using a
- uniform font size (it stretchs the text, at least for PDF)
- but it relies on several lines... We need paragraph
- information.
- -->
-
- <!-- FIXME: using a table allows to justify a single line of
- text. This is an UGLY HACK. Font size is also tweaked for
- now but it should not since we have font information.
- -->
- <fo:table table-layout="fixed">
-
- <xsl:attribute name="width">
- <xsl:value-of select="$xmax -$x1" />px
- </xsl:attribute>
-
- <fo:table-column column-number="1">
- <xsl:attribute name="column-width">
- <xsl:value-of select="$xmax -$x1" />px
+
+ <!-- Display lines with high enough OCR confidence -->
+ <xsl:choose>
+ <xsl:when test="@textConfidence > 60">
+
+
+ <fo:block-container position="absolute" border-width="5mm">
+
+ <xsl:attribute name="left">
+ <xsl:value-of select="$x1" />px
+ </xsl:attribute>
+ <xsl:attribute name="top">
+ <xsl:value-of select="$y1" />px
+ </xsl:attribute>
+
+ <xsl:attribute name="right">
+ <xsl:value-of select="$xmax" />px
+ </xsl:attribute>
+ <xsl:attribute name="bottom">
+ <xsl:value-of select="$ymax" />px
+ </xsl:attribute>
+
+
+ <xsl:attribute name="width">
+ <xsl:value-of select="$xmax - $x1" />px
+ </xsl:attribute>
+
+ <xsl:attribute name="color">
+ <xsl:value-of select="$colour" />
</xsl:attribute>
- </fo:table-column>
- <fo:table-body start-indent="0pt" text-align="justify"
text-align-last="justify">
+ <!-- Adjusting height if font is different from Times. -->
+ <xsl:variable name="fsize">
+ <xsl:choose>
+ <xsl:when test="($a + $d) > (1.16 * (37 * $char_width) div
17)">
+ <xsl:value-of select="((37 * $char_width) div 17)" />
+ </xsl:when>
+ <xsl:otherwise>
+ <xsl:value-of select="($a + $d)" />
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:variable>
+
+ <!-- if necessary, put letter-spacing="-Npt" ~ -3 <= N <= -1
+ in fo:block-->
+
+ <!-- text-align-last="justify" will help justifying and using a
+ uniform font size (it stretchs the text, at least for PDF)
+ but it relies on several lines... We need paragraph
+ information.
+ -->
+
+ <!-- FIXME: using a table allows to justify a single line of
+ text. This is an UGLY HACK. Font size is also tweaked for
+ now but it should not since we have font information.
+ -->
+ <fo:table table-layout="fixed">
+
+ <xsl:attribute name="width">
+ <xsl:value-of select="$xmax -$x1" />px
+ </xsl:attribute>
+
+ <fo:table-column column-number="1">
+ <xsl:attribute name="column-width">
+ <xsl:value-of select="$xmax -$x1" />px
+ </xsl:attribute>
+ </fo:table-column>
+
+ <fo:table-body start-indent="0pt" text-align="justify"
text-align-last="justify">
+
+ <fo:table-row>
- <fo:table-row>
+ <fo:table-cell>
- <fo:table-cell>
+ <fo:block font-family="Times" wrap-option="no-wrap"
white-space-collapse="true" text-align-last="justify"
text-align="justify">
- <fo:block font-family="Times" wrap-option="no-wrap"
white-space-collapse="true" text-align-last="justify"
text-align="justify">
+ <xsl:attribute name="font-size">
+ <xsl:value-of select="$fsize" />px
+ </xsl:attribute>
- <xsl:attribute name="font-size">
- <xsl:value-of select="$fsize" />px
- </xsl:attribute>
+ <xsl:value-of select="@text"/>
+ </fo:block>
- <xsl:value-of select="@text"/>
- </fo:block>
+ </fo:table-cell>
- </fo:table-cell>
+ </fo:table-row>
- </fo:table-row>
+ </fo:table-body>
- </fo:table-body>
+ </fo:table>
- </fo:table>
+ </fo:block-container>
+ </xsl:when>
- </fo:block-container>
+ </xsl:choose>
</xsl:for-each>
diff --git a/scribo/scribo/core/line_info.hh b/scribo/scribo/core/line_info.hh
index b121205..59ce1d3 100644
--- a/scribo/scribo/core/line_info.hh
+++ b/scribo/scribo/core/line_info.hh
@@ -1,4 +1,4 @@
-// Copyright (C) 2009, 2010, 2011 EPITA Research and Development
+// Copyright (C) 2009, 2010, 2011, 2012 EPITA Research and Development
// Laboratory (LRDE)
//
// This file is part of Olena.
@@ -758,6 +758,14 @@ namespace scribo
template <typename L>
+ float
+ line_info<L>::text_confidence() const
+ {
+ return data_->text_confidence_; // Field written by update_text(); NOTE(review): looks like a 0-100 scale -- confirm.
+ }
+
+
+ template <typename L>
const std::string&
line_info<L>::text() const
{
@@ -775,7 +783,7 @@ namespace scribo
template <typename L>
void
- line_info<L>::update_text(const std::string& str, float confidence = 1.0)
+ line_info<L>::update_text(const std::string& str, float confidence = 100.0)
{
data_->text_confidence_ = confidence;
data_->text_ = str;
diff --git a/scribo/scribo/io/xml/internal/extended_page_xml_visitor.hh
b/scribo/scribo/io/xml/internal/extended_page_xml_visitor.hh
index c766515..8c83ffa 100644
--- a/scribo/scribo/io/xml/internal/extended_page_xml_visitor.hh
+++ b/scribo/scribo/io/xml/internal/extended_page_xml_visitor.hh
@@ -337,6 +337,7 @@ namespace scribo
<< "\" dHeight=\"" << line.d_height()
<< "\" aHeight=\"" << line.a_height()
<< "\" charWidth=\"" << line.char_width()
+ << "\" textConfidence=\"" << line.text_confidence()
<< "\">" << std::endl;
internal::print_box_coords(output, line.bbox(), " ");
diff --git a/scribo/scribo/io/xml/internal/full_xml_visitor.hh
b/scribo/scribo/io/xml/internal/full_xml_visitor.hh
index e398389..23b48a8 100644
--- a/scribo/scribo/io/xml/internal/full_xml_visitor.hh
+++ b/scribo/scribo/io/xml/internal/full_xml_visitor.hh
@@ -518,6 +518,7 @@ namespace scribo
<< "\" dHeight=\"" << line.d_height()
<< "\" aHeight=\"" << line.a_height()
<< "\" charWidth=\"" << line.char_width()
+ << "\" textConfidence=\"" << line.text_confidence()
<< "\">" << std::endl;
internal::print_box_coords(output, line.bbox(), " ");
diff --git a/scribo/scribo/io/xml/load.hh b/scribo/scribo/io/xml/load.hh
index 6b570f8..62ad4a2 100644
--- a/scribo/scribo/io/xml/load.hh
+++ b/scribo/scribo/io/xml/load.hh
@@ -284,6 +284,7 @@ namespace scribo
line_data = new scribo::internal::line_info_data<L>(lines,
mln::util::array<component_id_t>());
line_data->text_ = atts.value("text").toUtf8().constData();
+ line_data->text_confidence_ =
atts.value("textConfidence").toFloat(); // numeric field: parse the attribute, do not copy its chars
line_data->hidden_ = false;
line_data->tag_ =
static_cast<line::Tag>(atts.value("tag").toInt());
--
1.7.2.5