olena-2.0-74-g031310f Take OCR confidence into account in document reconstruction.

* demo/xml2doc/templates/pdf/line.xsl: Do not print lines with confidence <= 60%. * scribo/core/line_info.hh: Add access to text confidence. * scribo/io/xml/internal/extended_page_xml_visitor.hh, * scribo/io/xml/internal/full_xml_visitor.hh: Save text confidence. * scribo/io/xml/load.hh: Load text confidence. --- scribo/ChangeLog | 14 ++ scribo/demo/xml2doc/templates/pdf/line.xsl | 157 ++++++++++--------- scribo/scribo/core/line_info.hh | 12 ++- .../io/xml/internal/extended_page_xml_visitor.hh | 1 + scribo/scribo/io/xml/internal/full_xml_visitor.hh | 1 + scribo/scribo/io/xml/load.hh | 1 + 6 files changed, 110 insertions(+), 76 deletions(-) diff --git a/scribo/ChangeLog b/scribo/ChangeLog index 5561f60..dbad5ca 100644 --- a/scribo/ChangeLog +++ b/scribo/ChangeLog @@ -1,3 +1,17 @@ +2012-09-07 Guillaume Lazzara <z@lrde.epita.fr> + + Take OCR confidence into account in document reconstruction. + + * demo/xml2doc/templates/pdf/line.xsl: Do not print lines with + confidence <= 60%. + + * scribo/core/line_info.hh: Add access to text confidence. + + * scribo/io/xml/internal/extended_page_xml_visitor.hh, + * scribo/io/xml/internal/full_xml_visitor.hh: Save text confidence. + + * scribo/io/xml/load.hh: Load text confidence. + 2012-05-11 Guillaume Lazzara <z@lrde.epita.fr> Rename point_at_index, delta_index and index_of_point. 
diff --git a/scribo/demo/xml2doc/templates/pdf/line.xsl b/scribo/demo/xml2doc/templates/pdf/line.xsl index 0c1b9d7..8ce024b 100644 --- a/scribo/demo/xml2doc/templates/pdf/line.xsl +++ b/scribo/demo/xml2doc/templates/pdf/line.xsl @@ -124,93 +124,102 @@ </xsl:variable> <!-- END OF lines coordinates --> - <fo:block-container position="absolute" border-width="5mm"> - - <xsl:attribute name="left"> - <xsl:value-of select="$x1" />px - </xsl:attribute> - <xsl:attribute name="top"> - <xsl:value-of select="$y1" />px - </xsl:attribute> - - <xsl:attribute name="right"> - <xsl:value-of select="$xmax" />px - </xsl:attribute> - <xsl:attribute name="bottom"> - <xsl:value-of select="$ymax" />px - </xsl:attribute> - - - <xsl:attribute name="width"> - <xsl:value-of select="$xmax - $x1" />px - </xsl:attribute> - - <xsl:attribute name="color"> - <xsl:value-of select="$colour" /> - </xsl:attribute> - - <!-- Adjusting height if font is different from Times. --> - <xsl:variable name="fsize"> - <xsl:choose> - <xsl:when test="($a + $d) > (1.16 * (37 * $char_width) div 17)"> - <xsl:value-of select="((37 * $char_width) div 17)" /> - </xsl:when> - <xsl:otherwise> - <xsl:value-of select="($a + $d)" /> - </xsl:otherwise> - </xsl:choose> - </xsl:variable> - - <!-- if necessary, put letter-spacing="-Npt" ~ -3 <= N <= -1 - in fo:block--> - - <!-- text-align-last="justify" will help justifying and using a - uniform font size (it stretchs the text, at least for PDF) - but it relies on several lines... We need paragraph - information. - --> - - <!-- FIXME: using a table allows to justify a single line of - text. This is an UGLY HACK. Font size is also tweaked for - now but it should not since we have font information. 
- --> - <fo:table table-layout="fixed"> - - <xsl:attribute name="width"> - <xsl:value-of select="$xmax -$x1" />px - </xsl:attribute> - - <fo:table-column column-number="1"> - <xsl:attribute name="column-width"> - <xsl:value-of select="$xmax -$x1" />px + + <!-- Display lines with high enough OCR confidence --> + <xsl:choose> + <xsl:when test="@textConfidence > 60"> + + + <fo:block-container position="absolute" border-width="5mm"> + + <xsl:attribute name="left"> + <xsl:value-of select="$x1" />px + </xsl:attribute> + <xsl:attribute name="top"> + <xsl:value-of select="$y1" />px + </xsl:attribute> + + <xsl:attribute name="right"> + <xsl:value-of select="$xmax" />px + </xsl:attribute> + <xsl:attribute name="bottom"> + <xsl:value-of select="$ymax" />px + </xsl:attribute> + + + <xsl:attribute name="width"> + <xsl:value-of select="$xmax - $x1" />px + </xsl:attribute> + + <xsl:attribute name="color"> + <xsl:value-of select="$colour" /> </xsl:attribute> - </fo:table-column> - <fo:table-body start-indent="0pt" text-align="justify" text-align-last="justify"> + <!-- Adjusting height if font is different from Times. --> + <xsl:variable name="fsize"> + <xsl:choose> + <xsl:when test="($a + $d) > (1.16 * (37 * $char_width) div 17)"> + <xsl:value-of select="((37 * $char_width) div 17)" /> + </xsl:when> + <xsl:otherwise> + <xsl:value-of select="($a + $d)" /> + </xsl:otherwise> + </xsl:choose> + </xsl:variable> + + <!-- if necessary, put letter-spacing="-Npt" ~ -3 <= N <= -1 + in fo:block--> + + <!-- text-align-last="justify" will help justifying and using a + uniform font size (it stretchs the text, at least for PDF) + but it relies on several lines... We need paragraph + information. + --> + + <!-- FIXME: using a table allows to justify a single line of + text. This is an UGLY HACK. Font size is also tweaked for + now but it should not since we have font information. 
+ --> + <fo:table table-layout="fixed"> + + <xsl:attribute name="width"> + <xsl:value-of select="$xmax -$x1" />px + </xsl:attribute> + + <fo:table-column column-number="1"> + <xsl:attribute name="column-width"> + <xsl:value-of select="$xmax -$x1" />px + </xsl:attribute> + </fo:table-column> + + <fo:table-body start-indent="0pt" text-align="justify" text-align-last="justify"> + + <fo:table-row> - <fo:table-row> + <fo:table-cell> - <fo:table-cell> + <fo:block font-family="Times" wrap-option="no-wrap" white-space-collapse="true" text-align-last="justify" text-align="justify"> - <fo:block font-family="Times" wrap-option="no-wrap" white-space-collapse="true" text-align-last="justify" text-align="justify"> + <xsl:attribute name="font-size"> + <xsl:value-of select="$fsize" />px + </xsl:attribute> - <xsl:attribute name="font-size"> - <xsl:value-of select="$fsize" />px - </xsl:attribute> + <xsl:value-of select="@text"/> + </fo:block> - <xsl:value-of select="@text"/> - </fo:block> + </fo:table-cell> - </fo:table-cell> + </fo:table-row> - </fo:table-row> + </fo:table-body> - </fo:table-body> + </fo:table> - </fo:table> + </fo:block-container> + </xsl:when> - </fo:block-container> + </xsl:choose> </xsl:for-each> diff --git a/scribo/scribo/core/line_info.hh b/scribo/scribo/core/line_info.hh index b121205..59ce1d3 100644 --- a/scribo/scribo/core/line_info.hh +++ b/scribo/scribo/core/line_info.hh @@ -1,4 +1,4 @@ -// Copyright (C) 2009, 2010, 2011 EPITA Research and Development +// Copyright (C) 2009, 2010, 2011, 2012 EPITA Research and Development // Laboratory (LRDE) // // This file is part of Olena. 
@@ -758,6 +758,14 @@ namespace scribo template <typename L> + float + line_info<L>::text_confidence() const + { + return data_->text_confidence_; + } + + + template <typename L> const std::string& line_info<L>::text() const { @@ -775,7 +783,7 @@ namespace scribo template <typename L> void - line_info<L>::update_text(const std::string& str, float confidence = 1.0) + line_info<L>::update_text(const std::string& str, float confidence = 100.0) { data_->text_confidence_ = confidence; data_->text_ = str; diff --git a/scribo/scribo/io/xml/internal/extended_page_xml_visitor.hh b/scribo/scribo/io/xml/internal/extended_page_xml_visitor.hh index c766515..8c83ffa 100644 --- a/scribo/scribo/io/xml/internal/extended_page_xml_visitor.hh +++ b/scribo/scribo/io/xml/internal/extended_page_xml_visitor.hh @@ -337,6 +337,7 @@ namespace scribo << "\" dHeight=\"" << line.d_height() << "\" aHeight=\"" << line.a_height() << "\" charWidth=\"" << line.char_width() + << "\" textConfidence=\"" << line.text_confidence() << "\">" << std::endl; internal::print_box_coords(output, line.bbox(), " "); diff --git a/scribo/scribo/io/xml/internal/full_xml_visitor.hh b/scribo/scribo/io/xml/internal/full_xml_visitor.hh index e398389..23b48a8 100644 --- a/scribo/scribo/io/xml/internal/full_xml_visitor.hh +++ b/scribo/scribo/io/xml/internal/full_xml_visitor.hh @@ -518,6 +518,7 @@ namespace scribo << "\" dHeight=\"" << line.d_height() << "\" aHeight=\"" << line.a_height() << "\" charWidth=\"" << line.char_width() + << "\" textConfidence=\"" << line.text_confidence() << "\">" << std::endl; internal::print_box_coords(output, line.bbox(), " "); diff --git a/scribo/scribo/io/xml/load.hh b/scribo/scribo/io/xml/load.hh index 6b570f8..62ad4a2 100644 --- a/scribo/scribo/io/xml/load.hh +++ b/scribo/scribo/io/xml/load.hh @@ -284,6 +284,7 @@ namespace scribo line_data = new scribo::internal::line_info_data<L>(lines, mln::util::array<component_id_t>()); line_data->text_ = atts.value("text").toUtf8().constData(); + 
line_data->text_confidence_ = atts.value("textConfidence").toFloat(); /* Parse the attribute as a number: text_confidence_ is a float, so assigning the UTF-8 C string would not compile. QString::toFloat() yields 0 when the conversion fails. */ line_data->hidden_ = false; line_data->tag_ = static_cast<line::Tag>(atts.value("tag").toInt()); -- 1.7.2.5
participants (1)
-
Guillaume Lazzara