Olena-patches
Threads by month
- ----- 2025 -----
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2024 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2023 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2022 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2021 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2020 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2019 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2018 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2017 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2016 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2015 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2014 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2013 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2012 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2011 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2010 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2009 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2008 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2007 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2006 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2005 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2004 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- 9625 discussions
26 Jan '11
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "Olena, a generic and efficient image processing platform".
The branch next-build-test has been updated
via a4ce6e74f91ddafc9206ee6289043897bfbeb837 (commit)
via 9461bef78f5065bf96e329a98484f2694fed3299 (commit)
via 9549f83db64189294dedc6cfc460da80796b8c02 (commit)
via 2601ca6dddafc044217403feb716183f72f3d99d (commit)
via 6ba320bc12c5d2bcf2a4a7e77f258f401c5465d2 (commit)
via 0fd6cdf6681ad40dc77042b424b99cf57abb6dd0 (commit)
via ec99c0a83a4b58104a0520a43f948d8601e8bdab (commit)
via 6ef044f11cc61ccb83edf88f1be2c2edbb98228c (commit)
via 6e9c3078721f87637656ef7562a6c9f806585729 (commit)
via 0462459c44f0702e88b39427094ee3c93d2fd01b (commit)
via 332c502cd90032e35c38e2058edbb7c00fa6f6ce (commit)
via e56d207759316a8a0bf41147ef4941dba941e62d (commit)
from 50ac33a582bb71a54b2b7f2944113d1a144e3fb7 (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
- Log -----------------------------------------------------------------
a4ce6e7 Make use of paragraph information in HTML output.
9461bef Regen generated files.
9549f83 Add paragraph processing in content_in_doc toolchain.
2601ca6 Small fixes in Scribo.
6ba320b Add an option to choose the recognized language.
0fd6cdf scribo/text/recognition.hh: Make use of TessBaseAPI::setImage.
ec99c0a scribo/text/recognition.hh: Fix invalid free().
6ef044f scribo/io/xml/save.hh: Handle paragraphs and separators correctly in XML output.
6e9c307 Identify separators among non-text components.
0462459 Handle paragraphs correctly in scribo-viewer.
332c502 Add support for paragraphs.
e56d207 Fix test for nepomuk toolchain.
-----------------------------------------------------------------------
Summary of changes:
scribo/ChangeLog | 105 ++++
scribo/demo/viewer/common.hh | 7 +-
scribo/demo/viewer/image_region.cc | 11 +-
scribo/demo/viewer/key_widget.cc | 38 +-
scribo/demo/viewer/step_widget.cc | 4 +
scribo/demo/viewer/viewer.cc | 60 +--
scribo/demo/viewer/viewer.hh | 2 +-
scribo/demo/viewer/xml_widget.cc | 29 +-
scribo/demo/xml2doc/templates/html/main.xsl | 398 ++++++++-----
scribo/demo/xml2doc/templates/html/main_base64.xsl | 621 +++++++++++++-------
scribo/headers.mk | 7 +
scribo/scribo/core/document.hh | 59 ++-
scribo/scribo/core/line_links.hh | 263 +++++++++
scribo/scribo/core/macros.hh | 6 +
scribo/scribo/core/paragraph_info.hh | 120 ++++
scribo/scribo/core/paragraph_set.hh | 183 ++++++
scribo/scribo/core/tag/component.hh | 6 +-
.../crop.hh => filter/line_links_x_height.hh} | 69 ++--
scribo/scribo/io/xml/save.hh | 335 ++++++++----
scribo/scribo/primitive/extract/elements.hh | 5 +-
.../{link/internal/dmax_default.hh => identify.hh} | 58 +-
scribo/scribo/text/link_lines.hh | 165 ++++++
scribo/scribo/text/recognition.hh | 26 +-
.../{text_in_doc.hh => content_in_doc.hh} | 51 +-
.../toolchain/internal/content_in_doc_functor.hh | 106 ++++-
scribo/scribo/toolchain/nepomuk/text_extraction.hh | 20 +-
scribo/scribo/toolchain/text_in_doc.hh | 3 +
scribo/src/content_in_doc.cc | 78 +--
scribo/src/pbm_text_in_doc.cc | 40 +-
scribo/tests/toolchain/nepomuk/text_extraction.cc | 7 +-
scribo/tests/unit_test/unit-tests.mk | 14 +
31 files changed, 2148 insertions(+), 748 deletions(-)
create mode 100644 scribo/scribo/core/line_links.hh
create mode 100644 scribo/scribo/core/paragraph_info.hh
create mode 100644 scribo/scribo/core/paragraph_set.hh
copy scribo/scribo/{preprocessing/crop.hh => filter/line_links_x_height.hh} (53%)
copy scribo/scribo/primitive/{link/internal/dmax_default.hh => identify.hh} (60%)
create mode 100644 scribo/scribo/text/link_lines.hh
copy scribo/scribo/toolchain/{text_in_doc.hh => content_in_doc.hh} (56%)
hooks/post-receive
--
Olena, a generic and efficient image processing platform
1
0
last-svn-commit-748-ga4ce6e7 Make use of paragraph information in HTML output.
by Guillaume Lazzara 26 Jan '11
by Guillaume Lazzara 26 Jan '11
26 Jan '11
* demo/xml2doc/templates/html/main.xsl,
* demo/xml2doc/templates/html/main_base64.xsl: Here.
---
scribo/ChangeLog | 7 +
scribo/demo/xml2doc/templates/html/main.xsl | 398 ++++++++-----
scribo/demo/xml2doc/templates/html/main_base64.xsl | 621 +++++++++++++-------
3 files changed, 654 insertions(+), 372 deletions(-)
diff --git a/scribo/ChangeLog b/scribo/ChangeLog
index 43577d5..8683016 100644
--- a/scribo/ChangeLog
+++ b/scribo/ChangeLog
@@ -1,5 +1,12 @@
2011-01-26 Guillaume Lazzara <z(a)lrde.epita.fr>
+ Make use of paragraph information in HTML output.
+
+ * demo/xml2doc/templates/html/main.xsl,
+ * demo/xml2doc/templates/html/main_base64.xsl: Here.
+
+2011-01-26 Guillaume Lazzara <z(a)lrde.epita.fr>
+
Regen generated files.
* headers.mk,
diff --git a/scribo/demo/xml2doc/templates/html/main.xsl b/scribo/demo/xml2doc/templates/html/main.xsl
index 9c90970..8159e98 100644
--- a/scribo/demo/xml2doc/templates/html/main.xsl
+++ b/scribo/demo/xml2doc/templates/html/main.xsl
@@ -21,15 +21,10 @@
margin:0px;
font-family:"Times New Roman", Times, serif;
}
- .para
- {
- position:absolute;
- z-index:6;
- }
.region
{
position:absolute;
- z-index:5;
+ z-index:6;
}
.image
{
@@ -40,14 +35,74 @@
</head>
<body>
- <xsl:for-each select="pcGts/page/graphic_region|
- pcGts/page/image_region|
- pcGts/page/chart_region|
- pcGts/page/table_region|
- pcGts/page/separator_region|
- pcGts/page/text_region|
- pcGts/page/text_region/paragraph|
- pcGts/page/text_region/paragraph/line">
+ <xsl:apply-templates select="pcGts/page/text_region"/>
+ <xsl:apply-templates select="pcGts/page/graphic_region|
+ pcGts/page/image_region|
+ pcGts/page/chart_region|
+ pcGts/page/table_region|
+ pcGts/page/separator_region"/>
+
+ </body>
+ </html>
+ </xsl:template>
+
+
+ <xsl:template match="pcGts/page/text_region">
+
+ <!-- Regions Coordinates -->
+
+ <!-- y1 -->
+ <xsl:variable name="y1">
+ <xsl:for-each select="coords/point">
+ <xsl:sort select="@y" order="ascending" data-type="number"/>
+ <xsl:if test="position() = 1">
+ <xsl:value-of select="@y" />
+ </xsl:if>
+ </xsl:for-each>
+ </xsl:variable>
+
+ <!-- x2 -->
+ <xsl:variable name="x2">
+ <xsl:for-each select="coords/point">
+ <xsl:sort select="@x" order="descending" data-type="number"/>
+ <xsl:if test="position() = 1">
+ <xsl:value-of select="@x" />
+ </xsl:if>
+ </xsl:for-each>
+ </xsl:variable>
+
+ <!-- y2 -->
+ <xsl:variable name="y2">
+ <xsl:for-each select="coords/point">
+ <xsl:sort select="@y" order="descending" data-type="number"/>
+ <xsl:if test="position() = 1">
+ <xsl:value-of select="@y" />
+ </xsl:if>
+ </xsl:for-each>
+ </xsl:variable>
+
+ <!-- x1 -->
+ <xsl:variable name="x1">
+ <xsl:for-each select="coords/point">
+ <xsl:sort select="@x" order="ascending" data-type="number"/>
+ <xsl:if test="position() = 1">
+ <xsl:value-of select="@x" />
+ </xsl:if>
+ </xsl:for-each>
+ </xsl:variable>
+
+ <!-- END OF Regions Coordinates -->
+
+ <xsl:apply-templates select="line"/>
+
+ </xsl:template>
+
+
+
+
+
+
+ <xsl:template match="line">
<!-- Regions Coordinates -->
@@ -94,158 +149,179 @@
<!-- END OF Regions Coordinates -->
<!-- TEXT LINE-->
- <xsl:if test="name() = 'line'">
-
- <xsl:variable name="colour">
- <xsl:choose>
- <xsl:when test="../../@txt_colour != ''">
- <xsl:value-of select="../../@txt_colour" />
- </xsl:when>
- <xsl:otherwise>
- Black
- </xsl:otherwise>
- </xsl:choose>
- </xsl:variable>
-
- <!-- x_height -->
- <xsl:variable name="x_height">
- <xsl:value-of select="../../@x_height" />
- </xsl:variable>
-
- <!-- a_height -->
- <xsl:variable name="a_height">
- <xsl:value-of select="../../@a_height" />
- </xsl:variable>
-
- <!-- ABS(d_height) -->
- <xsl:variable name="d_height_abs">
- <xsl:choose>
- <xsl:when test="../../@d_height < 0">
- <xsl:value-of select="-../../@d_height" />
- </xsl:when>
- <xsl:otherwise>
- <xsl:value-of select="../../@d_height" />
- </xsl:otherwise>
- </xsl:choose>
- </xsl:variable>
-
- <!-- a_height computed, i.e>
- if (A - X < |D|)
- A = X + |D|
- -->
- <xsl:variable name="a">
- <xsl:choose>
- <xsl:when test="($a_height - $x_height) < $d_height_abs">
- <xsl:value-of select="$x_height + $d_height_abs" />
- </xsl:when>
- <xsl:otherwise>
- <xsl:value-of select="$a_height" />
- </xsl:otherwise>
- </xsl:choose>
- </xsl:variable>
-
- <!-- d_height computed, i.e>
- if (A - A > |D|)
- |D| = A - X
- -->
- <xsl:variable name="d">
- <xsl:choose>
- <xsl:when test="($a_height - $x_height) > $d_height_abs">
- <xsl:value-of select="$a_height - $x_height" />
- </xsl:when>
- <xsl:otherwise>
- <xsl:value-of select="$d_height_abs" />
- </xsl:otherwise>
- </xsl:choose>
- </xsl:variable>
-
- <div class="line">
- <xsl:attribute name="style">
- height:auto;
- font-size:<xsl:value-of select="$a+$d" />px;
- width:<xsl:value-of select="$x2 - $x1" />px;
- left:<xsl:value-of select="$x1 " />px;
- top:<xsl:value-of select="$y1 " />px;
- color:<xsl:value-of select="$colour" />;
- </xsl:attribute>
- <xsl:value-of select="@text"/>
- </div>
- </xsl:if>
+ <xsl:variable name="colour">
+ <xsl:choose>
+ <xsl:when test="@txt_colour != ''">
+ <xsl:value-of select="@txt_colour" />
+ </xsl:when>
+ <xsl:otherwise>
+ Black
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:variable>
+
+ <!-- x_height -->
+ <xsl:variable name="x_height">
+ <xsl:value-of select="@x_height" />
+ </xsl:variable>
+
+ <!-- a_height -->
+ <xsl:variable name="a_height">
+ <xsl:value-of select="@a_height" />
+ </xsl:variable>
+
+ <!-- ABS(d_height) -->
+ <xsl:variable name="d_height_abs">
+ <xsl:choose>
+ <xsl:when test="@d_height &lt; 0">
+ <xsl:value-of select="-@d_height" />
+ </xsl:when>
+ <xsl:otherwise>
+ <xsl:value-of select="@d_height" />
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:variable>
+
+ <!-- a_height computed, i.e>
+ if (A - X < |D|)
+ A = X + |D|
+ -->
+ <xsl:variable name="a">
+ <xsl:choose>
+ <xsl:when test="($a_height - $x_height) &lt; $d_height_abs">
+ <xsl:value-of select="$x_height + $d_height_abs" />
+ </xsl:when>
+ <xsl:otherwise>
+ <xsl:value-of select="$a_height" />
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:variable>
+
+ <!-- d_height computed, i.e>
+ if (A - X > |D|)
+ |D| = A - X
+ -->
+ <xsl:variable name="d">
+ <xsl:choose>
+ <xsl:when test="($a_height - $x_height) > $d_height_abs">
+ <xsl:value-of select="$a_height - $x_height" />
+ </xsl:when>
+ <xsl:otherwise>
+ <xsl:value-of select="$d_height_abs" />
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:variable>
+
+ <span class="line">
+ <xsl:attribute name="style">
+ height:auto;
+ font-size:<xsl:value-of select="$a + $d" />px;
+ width:<xsl:value-of select="$x2 - $x1" />px;
+ left:<xsl:value-of select="$x1 " />px;
+ top:<xsl:value-of select="$y1 " />px;
+ color:<xsl:value-of select="$colour" />;
+ </xsl:attribute>
+ <xsl:value-of select="@text"/>
+ </span>
<!-- ENF OF TEXT LINE -->
+ </xsl:template>
- <!-- TEXT REGION -->
-<!-- <xsl:if test="name() = 'text_region'">
- <div class="region">
- <xsl:attribute name="style">
- height:<xsl:value-of select="$y2 - $y1" />px;
- width:<xsl:value-of select="$x2 - $x1" />px;
- left:<xsl:value-of select="$x1" />px;
- top:<xsl:value-of select="$y1" />px;
- </xsl:attribute>
- </div>
- </xsl:if>-->
- <!-- ENF OF TEXT REGION -->
-
- <!-- PARAGRAPH -->
-<!-- <xsl:if test="name() = 'paragraph'">
- <div class="para">
- <xsl:attribute name="style">
- height:<xsl:value-of select="$y2 - $y1" />px;
- width:<xsl:value-of select="$x2 - $x1" />px;
- left:<xsl:value-of select="$x1" />px;
- top:<xsl:value-of select="$y1" />px;
- </xsl:attribute>
- </div>
- </xsl:if>-->
- <!-- ENF OF PARAGRAPH -->
+
+
+
+
+ <xsl:template match="pcGts/page/graphic_region|
+ pcGts/page/image_region|
+ pcGts/page/chart_region|
+ pcGts/page/table_region|
+ pcGts/page/separator_region">
+
+ <!-- Regions Coordinates -->
+
+ <!-- y1 -->
+ <xsl:variable name="y1">
+ <xsl:for-each select="coords/point">
+ <xsl:sort select="@y" order="ascending" data-type="number"/>
+ <xsl:if test="position() = 1">
+ <xsl:value-of select="@y" />
+ </xsl:if>
+ </xsl:for-each>
+ </xsl:variable>
+
+ <!-- x2 -->
+ <xsl:variable name="x2">
+ <xsl:for-each select="coords/point">
+ <xsl:sort select="@x" order="descending" data-type="number"/>
+ <xsl:if test="position() = 1">
+ <xsl:value-of select="@x" />
+ </xsl:if>
+ </xsl:for-each>
+ </xsl:variable>
+
+ <!-- y2 -->
+ <xsl:variable name="y2">
+ <xsl:for-each select="coords/point">
+ <xsl:sort select="@y" order="descending" data-type="number"/>
+ <xsl:if test="position() = 1">
+ <xsl:value-of select="@y" />
+ </xsl:if>
+ </xsl:for-each>
+ </xsl:variable>
+
+ <!-- x1 -->
+ <xsl:variable name="x1">
+ <xsl:for-each select="coords/point">
+ <xsl:sort select="@x" order="ascending" data-type="number"/>
+ <xsl:if test="position() = 1">
+ <xsl:value-of select="@x" />
+ </xsl:if>
+ </xsl:for-each>
+ </xsl:variable>
+
+ <!-- END OF Regions Coordinates -->
<!-- NON-TEXT REGIONS -->
- <xsl:if test="name() = 'image_region' or name() = 'separator_region' or name() = 'graphic_region' or name() = 'chart_region' or name() = 'table_region'">
-
- <!-- id -->
- <xsl:variable name="id">
- <xsl:value-of select="@id" />
- </xsl:variable>
-
- <!-- depth -->
- <xsl:variable name="depth">
- <xsl:choose>
- <xsl:when test="name() = 'separator_region'">
- 1
- </xsl:when>
- <xsl:otherwise>
- 4
- </xsl:otherwise>
- </xsl:choose>
- </xsl:variable>
-
- <div class="image">
- <xsl:attribute name="style">
- top:<xsl:value-of select="$y1" />px;
- left:<xsl:value-of select="$x1" />px;
- width:<xsl:value-of select="$x2 - $x1"/>px;
- height:<xsl:value-of select="$y2 - $y1"/>px;
- z-index:<xsl:value-of select="$depth"/>;
+ <!-- id -->
+ <xsl:variable name="id">
+ <xsl:value-of select="@id" />
+ </xsl:variable>
+
+ <!-- depth -->
+ <xsl:variable name="depth">
+ <xsl:choose>
+ <xsl:when test="name() = 'separator_region'">
+ 1
+ </xsl:when>
+ <xsl:otherwise>
+ 4
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:variable>
+
+ <div class="image">
+ <xsl:attribute name="style">
+ top:<xsl:value-of select="$y1" />px;
+ left:<xsl:value-of select="$x1" />px;
+ width:<xsl:value-of select="$x2 - $x1"/>px;
+ height:<xsl:value-of select="$y2 - $y1"/>px;
+ z-index:<xsl:value-of select="$depth"/>;
+ </xsl:attribute>
+ <img>
+ <xsl:attribute name="alt">
+ <xsl:value-of select="name()" />
+ </xsl:attribute>
+ <xsl:attribute name="width">
+ <xsl:value-of select="$x2 - $x1" />
</xsl:attribute>
- <img>
- <xsl:attribute name="alt">
- <xsl:value-of select="name()" />
- </xsl:attribute>
- <xsl:attribute name="width">
- <xsl:value-of select="$x2 - $x1" />
- </xsl:attribute>
- <xsl:attribute name="height">
- <xsl:value-of select="$y2 - $y1" />
- </xsl:attribute>
- <xsl:attribute name="src"><xsl:value-of select="$id"/>.png</xsl:attribute>
- </img>
- </div>
-
- </xsl:if>
+ <xsl:attribute name="height">
+ <xsl:value-of select="$y2 - $y1" />
+ </xsl:attribute>
+ <xsl:attribute name="src"><xsl:value-of select="$id"/>.png</xsl:attribute>
+ </img>
+ </div>
+
<!-- END NON-TEXT REGIONS -->
- </xsl:for-each>
- </body>
- </html>
- </xsl:template>
+ </xsl:template>
+
</xsl:stylesheet>
diff --git a/scribo/demo/xml2doc/templates/html/main_base64.xsl b/scribo/demo/xml2doc/templates/html/main_base64.xsl
index b24f02d..fe3f1db 100644
--- a/scribo/demo/xml2doc/templates/html/main_base64.xsl
+++ b/scribo/demo/xml2doc/templates/html/main_base64.xsl
@@ -39,222 +39,421 @@
border:0;
}
</style>
-<!-- <link rel="stylesheet" type="text/css" href="css.css" /> -->
+ <!-- <link rel="stylesheet" type="text/css" href="css.css" /> -->
</head>
<body>
- <xsl:for-each select="pcGts/page/graphic_region|
- pcGts/page/image_region|
- pcGts/page/chart_region|
- pcGts/page/table_region|
- pcGts/page/separator_region|
- pcGts/page/text_region|
- pcGts/page/text_region/paragraph|
- pcGts/page/text_region/paragraph/line">
-
- <!-- Regions Coordinates -->
-
- <!-- y1 -->
- <xsl:variable name="y1">
- <xsl:for-each select="coords/point">
- <xsl:sort select="@y" order="ascending" data-type="number"/>
- <xsl:if test="position() = 1">
- <xsl:value-of select="@y" />
- </xsl:if>
- </xsl:for-each>
- </xsl:variable>
-
- <!-- x2 -->
- <xsl:variable name="x2">
- <xsl:for-each select="coords/point">
- <xsl:sort select="@x" order="descending" data-type="number"/>
- <xsl:if test="position() = 1">
- <xsl:value-of select="@x" />
- </xsl:if>
- </xsl:for-each>
- </xsl:variable>
-
- <!-- y2 -->
- <xsl:variable name="y2">
- <xsl:for-each select="coords/point">
- <xsl:sort select="@y" order="descending" data-type="number"/>
- <xsl:if test="position() = 1">
- <xsl:value-of select="@y" />
- </xsl:if>
- </xsl:for-each>
- </xsl:variable>
-
- <!-- x1 -->
- <xsl:variable name="x1">
- <xsl:for-each select="coords/point">
- <xsl:sort select="@x" order="ascending" data-type="number"/>
- <xsl:if test="position() = 1">
- <xsl:value-of select="@x" />
- </xsl:if>
- </xsl:for-each>
- </xsl:variable>
-
- <!-- END OF Regions Coordinates -->
-
- <!-- TEXT LINE-->
- <xsl:if test="name() = 'line'">
-
- <xsl:variable name="colour">
- <xsl:choose>
- <xsl:when test="../../@txt_colour != ''">
- <xsl:value-of select="../../@txt_colour" />
- </xsl:when>
- <xsl:otherwise>
- Black
- </xsl:otherwise>
- </xsl:choose>
- </xsl:variable>
-
- <!-- x_height -->
- <xsl:variable name="x_height">
- <xsl:value-of select="../../@x_height" />
- </xsl:variable>
-
- <!-- a_height -->
- <xsl:variable name="a_height">
- <xsl:value-of select="../../@a_height" />
- </xsl:variable>
-
- <!-- ABS(d_height) -->
- <xsl:variable name="d_height_abs">
- <xsl:choose>
- <xsl:when test="../../@d_height < 0">
- <xsl:value-of select="-../../@d_height" />
- </xsl:when>
- <xsl:otherwise>
- <xsl:value-of select="../../@d_height" />
- </xsl:otherwise>
- </xsl:choose>
- </xsl:variable>
-
- <!-- a_height computed, i.e>
- if (A - X < |D|)
- A = X + |D|
- -->
- <xsl:variable name="a">
- <xsl:choose>
- <xsl:when test="($a_height - $x_height) < $d_height_abs">
- <xsl:value-of select="$x_height + $d_height_abs" />
- </xsl:when>
- <xsl:otherwise>
- <xsl:value-of select="$a_height" />
- </xsl:otherwise>
- </xsl:choose>
- </xsl:variable>
-
- <!-- d_height computed, i.e>
- if (A - A > |D|)
- |D| = A - X
- -->
- <xsl:variable name="d">
- <xsl:choose>
- <xsl:when test="($a_height - $x_height) > $d_height_abs">
- <xsl:value-of select="$a_height - $x_height" />
- </xsl:when>
- <xsl:otherwise>
- <xsl:value-of select="$d_height_abs" />
- </xsl:otherwise>
- </xsl:choose>
- </xsl:variable>
-
- <div class="line" onmouseover="this.style.opacity=0.2;this.filters.alpha.opacity=20"
- onmouseout="this.style.opacity=1;this.filters.alpha.opacity=100">
- <xsl:attribute name="style">
- opacity:1;
- height:auto;
- font-size:<xsl:value-of select="$a+$d" />px;
- width:<xsl:value-of select="$x2 - $x1" />px;
- left:<xsl:value-of select="$x1 " />px;
- top:<xsl:value-of select="$y1 " />px;
- color:<xsl:value-of select="$colour" />;
- </xsl:attribute>
- <xsl:value-of select="@text"/>
- </div>
- </xsl:if>
- <!-- ENF OF TEXT LINE -->
-
- <!-- TEXT REGION -->
- <xsl:if test="name() = 'text_region'">
- <div class="region">
- <xsl:attribute name="style">
- height:<xsl:value-of select="$y2 - $y1" />px;
- width:<xsl:value-of select="$x2 - $x1" />px;
- left:<xsl:value-of select="$x1" />px;
- top:<xsl:value-of select="$y1" />px;
- </xsl:attribute>
- </div>
- </xsl:if>
- <!-- ENF OF TEXT REGION -->
-
- <!-- PARAGRAPH -->
- <xsl:if test="name() = 'paragraph'">
- <div class="para">
- <xsl:attribute name="style">
- height:<xsl:value-of select="$y2 - $y1" />px;
- width:<xsl:value-of select="$x2 - $x1" />px;
- left:<xsl:value-of select="$x1" />px;
- top:<xsl:value-of select="$y1" />px;
- </xsl:attribute>
- </div>
- </xsl:if>
- <!-- ENF OF PARAGRAPH -->
-
- <!-- NON-TEXT REGIONS -->
- <xsl:if test="name() = 'image_region' or name() = 'separator_region' or name() = 'graphic_region' or name() = 'chart_region' or name() = 'table_region'">
-
- <xsl:if test="container">
- <!-- data -->
- <xsl:variable name="data">
- <xsl:value-of select="container/data" />
- </xsl:variable>
-
- <!-- depth -->
- <xsl:variable name="depth">
- <xsl:choose>
- <xsl:when test="name() = 'separator_region'">
- 1
- </xsl:when>
- <xsl:otherwise>
- 4
- </xsl:otherwise>
- </xsl:choose>
- </xsl:variable>
-
- <div class="image">
- <xsl:attribute name="style">
- top:<xsl:value-of select="$y1" />px;
- left:<xsl:value-of select="$x1" />px;
- width:<xsl:value-of select="$x2 - $x1"/>px;
- height:<xsl:value-of select="$y2 - $y1"/>px;
- z-index:<xsl:value-of select="$depth"/>;
- </xsl:attribute>
- <img>
- <xsl:attribute name="alt">
- <xsl:value-of select="name()" />
- </xsl:attribute>
- <xsl:attribute name="width">
- <xsl:value-of select="$x2 - $x1" />
- </xsl:attribute>
- <xsl:attribute name="height">
- <xsl:value-of select="$y2 - $y1" />
- </xsl:attribute>
- <xsl:attribute name="src">
- data:image/png;base64,<xsl:value-of select="$data"/>
- </xsl:attribute>
- </img>
- </div>
- </xsl:if>
-
- </xsl:if>
- <!-- END NON-TEXT REGIONS -->
- </xsl:for-each>
+ <xsl:apply-templates select="pcGts/page/text_region"/>
+ <xsl:apply-templates select="pcGts/page/graphic_region|
+ pcGts/page/image_region|
+ pcGts/page/chart_region|
+ pcGts/page/table_region|
+ pcGts/page/separator_region"/>
</body>
</html>
</xsl:template>
+
+
+ <!--
+ TEXT REGION
+ ===========
+ -->
+
+ <xsl:template match="pcGts/page/text_region">
+
+ <!-- Regions Coordinates -->
+
+ <!-- y1 -->
+ <xsl:variable name="y1">
+ <xsl:for-each select="coords/point">
+ <xsl:sort select="@y" order="ascending" data-type="number"/>
+ <xsl:if test="position() = 1">
+ <xsl:value-of select="@y" />
+ </xsl:if>
+ </xsl:for-each>
+ </xsl:variable>
+
+ <!-- x2 -->
+ <xsl:variable name="x2">
+ <xsl:for-each select="coords/point">
+ <xsl:sort select="@x" order="descending" data-type="number"/>
+ <xsl:if test="position() = 1">
+ <xsl:value-of select="@x" />
+ </xsl:if>
+ </xsl:for-each>
+ </xsl:variable>
+
+ <!-- y2 -->
+ <xsl:variable name="y2">
+ <xsl:for-each select="coords/point">
+ <xsl:sort select="@y" order="descending" data-type="number"/>
+ <xsl:if test="position() = 1">
+ <xsl:value-of select="@y" />
+ </xsl:if>
+ </xsl:for-each>
+ </xsl:variable>
+
+ <!-- x1 -->
+ <xsl:variable name="x1">
+ <xsl:for-each select="coords/point">
+ <xsl:sort select="@x" order="ascending" data-type="number"/>
+ <xsl:if test="position() = 1">
+ <xsl:value-of select="@x" />
+ </xsl:if>
+ </xsl:for-each>
+ </xsl:variable>
+
+ <!-- END OF Regions Coordinates -->
+
+ <xsl:apply-templates select="line"/>
+
+ </xsl:template>
+
+
+
+
+ <!--
+ TEXT LINES
+ ==========
+ -->
+ <xsl:template match="line">
+
+ <!-- Regions Coordinates -->
+
+ <!-- y1 -->
+ <xsl:variable name="y1">
+ <xsl:for-each select="coords/point">
+ <xsl:sort select="@y" order="ascending" data-type="number"/>
+ <xsl:if test="position() = 1">
+ <xsl:value-of select="@y" />
+ </xsl:if>
+ </xsl:for-each>
+ </xsl:variable>
+
+ <!-- x2 -->
+ <xsl:variable name="x2">
+ <xsl:for-each select="coords/point">
+ <xsl:sort select="@x" order="descending" data-type="number"/>
+ <xsl:if test="position() = 1">
+ <xsl:value-of select="@x" />
+ </xsl:if>
+ </xsl:for-each>
+ </xsl:variable>
+
+ <!-- y2 -->
+ <xsl:variable name="y2">
+ <xsl:for-each select="coords/point">
+ <xsl:sort select="@y" order="descending" data-type="number"/>
+ <xsl:if test="position() = 1">
+ <xsl:value-of select="@y" />
+ </xsl:if>
+ </xsl:for-each>
+ </xsl:variable>
+
+ <!-- x1 -->
+ <xsl:variable name="x1">
+ <xsl:for-each select="coords/point">
+ <xsl:sort select="@x" order="ascending" data-type="number"/>
+ <xsl:if test="position() = 1">
+ <xsl:value-of select="@x" />
+ </xsl:if>
+ </xsl:for-each>
+ </xsl:variable>
+
+ <!-- END OF Regions Coordinates -->
+
+ <!-- TEXT LINE-->
+ <xsl:variable name="colour">
+ <xsl:choose>
+ <xsl:when test="@txt_colour != ''">
+ <xsl:value-of select="@txt_colour" />
+ </xsl:when>
+ <xsl:otherwise>
+ Black
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:variable>
+
+ <!-- x_height -->
+ <xsl:variable name="x_height">
+ <xsl:value-of select="@x_height" />
+ </xsl:variable>
+
+ <!-- a_height -->
+ <xsl:variable name="a_height">
+ <xsl:value-of select="@a_height" />
+ </xsl:variable>
+
+ <!-- ABS(d_height) -->
+ <xsl:variable name="d_height_abs">
+ <xsl:choose>
+ <xsl:when test="@d_height &lt; 0">
+ <xsl:value-of select="-@d_height" />
+ </xsl:when>
+ <xsl:otherwise>
+ <xsl:value-of select="@d_height" />
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:variable>
+
+ <!-- a_height computed, i.e>
+ if (A - X < |D|)
+ A = X + |D|
+ -->
+ <xsl:variable name="a">
+ <xsl:choose>
+ <xsl:when test="($a_height - $x_height) &lt; $d_height_abs">
+ <xsl:value-of select="$x_height + $d_height_abs" />
+ </xsl:when>
+ <xsl:otherwise>
+ <xsl:value-of select="$a_height" />
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:variable>
+
+ <!-- d_height computed, i.e>
+ if (A - X > |D|)
+ |D| = A - X
+ -->
+ <xsl:variable name="d">
+ <xsl:choose>
+ <xsl:when test="($a_height - $x_height) > $d_height_abs">
+ <xsl:value-of select="$a_height - $x_height" />
+ </xsl:when>
+ <xsl:otherwise>
+ <xsl:value-of select="$d_height_abs" />
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:variable>
+
+ <span class="line">
+ <xsl:attribute name="style">
+ height:auto;
+ font-size:<xsl:value-of select="$a + $d" />px;
+ width:<xsl:value-of select="$x2 - $x1" />px;
+ left:<xsl:value-of select="$x1 " />px;
+ top:<xsl:value-of select="$y1 " />px;
+ color:<xsl:value-of select="$colour" />;
+ </xsl:attribute>
+ <xsl:value-of select="@text"/>
+ </span>
+ <!-- ENF OF TEXT LINE -->
+ </xsl:template>
+
+
+
+
+ <!--
+ MISC. REGIONS
+ =============
+ -->
+ <xsl:template match="pcGts/page/graphic_region|
+ pcGts/page/image_region|
+ pcGts/page/chart_region|
+ pcGts/page/table_region|
+ pcGts/page/separator_region">
+
+ <!-- Regions Coordinates -->
+
+ <!-- y1 -->
+ <xsl:variable name="y1">
+ <xsl:for-each select="coords/point">
+ <xsl:sort select="@y" order="ascending" data-type="number"/>
+ <xsl:if test="position() = 1">
+ <xsl:value-of select="@y" />
+ </xsl:if>
+ </xsl:for-each>
+ </xsl:variable>
+
+ <!-- x2 -->
+ <xsl:variable name="x2">
+ <xsl:for-each select="coords/point">
+ <xsl:sort select="@x" order="descending" data-type="number"/>
+ <xsl:if test="position() = 1">
+ <xsl:value-of select="@x" />
+ </xsl:if>
+ </xsl:for-each>
+ </xsl:variable>
+
+ <!-- y2 -->
+ <xsl:variable name="y2">
+ <xsl:for-each select="coords/point">
+ <xsl:sort select="@y" order="descending" data-type="number"/>
+ <xsl:if test="position() = 1">
+ <xsl:value-of select="@y" />
+ </xsl:if>
+ </xsl:for-each>
+ </xsl:variable>
+
+ <!-- x1 -->
+ <xsl:variable name="x1">
+ <xsl:for-each select="coords/point">
+ <xsl:sort select="@x" order="ascending" data-type="number"/>
+ <xsl:if test="position() = 1">
+ <xsl:value-of select="@x" />
+ </xsl:if>
+ </xsl:for-each>
+ </xsl:variable>
+
+ <!-- END OF Regions Coordinates -->
+
+ <!-- TEXT LINE-->
+ <xsl:if test="name() = 'line'">
+
+ <xsl:variable name="colour">
+ <xsl:choose>
+ <xsl:when test="../../@txt_colour != ''">
+ <xsl:value-of select="../../@txt_colour" />
+ </xsl:when>
+ <xsl:otherwise>
+ Black
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:variable>
+
+ <!-- x_height -->
+ <xsl:variable name="x_height">
+ <xsl:value-of select="../../@x_height" />
+ </xsl:variable>
+
+ <!-- a_height -->
+ <xsl:variable name="a_height">
+ <xsl:value-of select="../../@a_height" />
+ </xsl:variable>
+
+ <!-- ABS(d_height) -->
+ <xsl:variable name="d_height_abs">
+ <xsl:choose>
+ <xsl:when test="../../@d_height &lt; 0">
+ <xsl:value-of select="-../../@d_height" />
+ </xsl:when>
+ <xsl:otherwise>
+ <xsl:value-of select="../../@d_height" />
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:variable>
+
+ <!-- a_height computed, i.e>
+ if (A - X < |D|)
+ A = X + |D|
+ -->
+ <xsl:variable name="a">
+ <xsl:choose>
+ <xsl:when test="($a_height - $x_height) &lt; $d_height_abs">
+ <xsl:value-of select="$x_height + $d_height_abs" />
+ </xsl:when>
+ <xsl:otherwise>
+ <xsl:value-of select="$a_height" />
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:variable>
+
+ <!-- d_height computed, i.e>
+ if (A - X > |D|)
+ |D| = A - X
+ -->
+ <xsl:variable name="d">
+ <xsl:choose>
+ <xsl:when test="($a_height - $x_height) > $d_height_abs">
+ <xsl:value-of select="$a_height - $x_height" />
+ </xsl:when>
+ <xsl:otherwise>
+ <xsl:value-of select="$d_height_abs" />
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:variable>
+
+ <div class="line" onmouseover="this.style.opacity=0.2;this.filters.alpha.opacity=20"
+ onmouseout="this.style.opacity=1;this.filters.alpha.opacity=100">
+ <xsl:attribute name="style">
+ opacity:1;
+ height:auto;
+ font-size:<xsl:value-of select="$a+$d" />px;
+ width:<xsl:value-of select="$x2 - $x1" />px;
+ left:<xsl:value-of select="$x1 " />px;
+ top:<xsl:value-of select="$y1 " />px;
+ color:<xsl:value-of select="$colour" />;
+ </xsl:attribute>
+ <xsl:value-of select="@text"/>
+ </div>
+ </xsl:if>
+ <!-- ENF OF TEXT LINE -->
+
+ <!-- TEXT REGION -->
+ <xsl:if test="name() = 'text_region'">
+ <div class="region">
+ <xsl:attribute name="style">
+ height:<xsl:value-of select="$y2 - $y1" />px;
+ width:<xsl:value-of select="$x2 - $x1" />px;
+ left:<xsl:value-of select="$x1" />px;
+ top:<xsl:value-of select="$y1" />px;
+ </xsl:attribute>
+ </div>
+ </xsl:if>
+ <!-- ENF OF TEXT REGION -->
+
+ <!-- PARAGRAPH -->
+ <xsl:if test="name() = 'paragraph'">
+ <div class="para">
+ <xsl:attribute name="style">
+ height:<xsl:value-of select="$y2 - $y1" />px;
+ width:<xsl:value-of select="$x2 - $x1" />px;
+ left:<xsl:value-of select="$x1" />px;
+ top:<xsl:value-of select="$y1" />px;
+ </xsl:attribute>
+ </div>
+ </xsl:if>
+ <!-- ENF OF PARAGRAPH -->
+
+ <!-- NON-TEXT REGIONS -->
+ <xsl:if test="name() = 'image_region' or name() = 'separator_region' or name() = 'graphic_region' or name() = 'chart_region' or name() = 'table_region'">
+
+ <xsl:if test="container">
+ <!-- data -->
+ <xsl:variable name="data">
+ <xsl:value-of select="container/data" />
+ </xsl:variable>
+
+ <!-- depth -->
+ <xsl:variable name="depth">
+ <xsl:choose>
+ <xsl:when test="name() = 'separator_region'">
+ 1
+ </xsl:when>
+ <xsl:otherwise>
+ 4
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:variable>
+
+ <div class="image">
+ <xsl:attribute name="style">
+ top:<xsl:value-of select="$y1" />px;
+ left:<xsl:value-of select="$x1" />px;
+ width:<xsl:value-of select="$x2 - $x1"/>px;
+ height:<xsl:value-of select="$y2 - $y1"/>px;
+ z-index:<xsl:value-of select="$depth"/>;
+ </xsl:attribute>
+ <img>
+ <xsl:attribute name="alt">
+ <xsl:value-of select="name()" />
+ </xsl:attribute>
+ <xsl:attribute name="width">
+ <xsl:value-of select="$x2 - $x1" />
+ </xsl:attribute>
+ <xsl:attribute name="height">
+ <xsl:value-of select="$y2 - $y1" />
+ </xsl:attribute>
+ <xsl:attribute name="src">
+ data:image/png;base64,<xsl:value-of select="$data"/>
+ </xsl:attribute>
+ </img>
+ </div>
+ </xsl:if>
+
+ </xsl:if>
+ <!-- END NON-TEXT REGIONS -->
+ </xsl:template>
+
</xsl:stylesheet>
--
1.5.6.5
1
0
* headers.mk,
* tests/unit_test/unit-tests.mk: Regen.
---
scribo/ChangeLog | 7 +++++++
scribo/headers.mk | 7 +++++++
scribo/tests/unit_test/unit-tests.mk | 14 ++++++++++++++
3 files changed, 28 insertions(+), 0 deletions(-)
diff --git a/scribo/ChangeLog b/scribo/ChangeLog
index 809ce7d..43577d5 100644
--- a/scribo/ChangeLog
+++ b/scribo/ChangeLog
@@ -1,5 +1,12 @@
2011-01-26 Guillaume Lazzara <z(a)lrde.epita.fr>
+ Regen generated files.
+
+ * headers.mk,
+ * tests/unit_test/unit-tests.mk: Regen.
+
+2011-01-26 Guillaume Lazzara <z(a)lrde.epita.fr>
+
Add paragraph processing in content_in_doc toolchain.
* scribo/toolchain/content_in_doc.hh: New.
diff --git a/scribo/headers.mk b/scribo/headers.mk
index a388d28..7603f8a 100644
--- a/scribo/headers.mk
+++ b/scribo/headers.mk
@@ -26,10 +26,13 @@ scribo/core/document.hh \
scribo/core/erase_objects.hh \
scribo/core/init_integral_image.hh \
scribo/core/line_info.hh \
+scribo/core/line_links.hh \
scribo/core/line_set.hh \
scribo/core/macros.hh \
scribo/core/object_groups.hh \
scribo/core/object_links.hh \
+scribo/core/paragraph_info.hh \
+scribo/core/paragraph_set.hh \
scribo/core/tag/anchor.hh \
scribo/core/tag/component.hh \
scribo/core/tag/line.hh \
@@ -61,6 +64,7 @@ scribo/filter/common/objects_photo.hh \
scribo/filter/internal/alignment_angle.hh \
scribo/filter/internal/component_aligned.hh \
scribo/filter/internal/compute.hh \
+scribo/filter/line_links_x_height.hh \
scribo/filter/object_groups_size_ratio.hh \
scribo/filter/object_groups_small.hh \
scribo/filter/object_groups_v_thickness.hh \
@@ -135,6 +139,7 @@ scribo/primitive/group/from_double_link.hh \
scribo/primitive/group/from_double_link_any.hh \
scribo/primitive/group/from_graph.hh \
scribo/primitive/group/from_single_link.hh \
+scribo/primitive/identify.hh \
scribo/primitive/internal/all.hh \
scribo/primitive/internal/find_graph_link.hh \
scribo/primitive/internal/find_root.hh \
@@ -198,9 +203,11 @@ scribo/text/all.hh \
scribo/text/clean.hh \
scribo/text/clean_inplace.hh \
scribo/text/extract_lines.hh \
+scribo/text/link_lines.hh \
scribo/text/look_like_text_lines.hh \
scribo/text/merging.hh \
scribo/text/recognition.hh \
+scribo/toolchain/content_in_doc.hh \
scribo/toolchain/internal/content_in_doc_functor.hh \
scribo/toolchain/internal/text_in_doc_functor.hh \
scribo/toolchain/internal/text_in_doc_preprocess_functor.hh \
diff --git a/scribo/tests/unit_test/unit-tests.mk b/scribo/tests/unit_test/unit-tests.mk
index 7ddd085..7da6e5c 100644
--- a/scribo/tests/unit_test/unit-tests.mk
+++ b/scribo/tests/unit_test/unit-tests.mk
@@ -76,10 +76,13 @@ scribo_core_document \
scribo_core_erase_objects \
scribo_core_init_integral_image \
scribo_core_line_info \
+scribo_core_line_links \
scribo_core_line_set \
scribo_core_macros \
scribo_core_object_groups \
scribo_core_object_links \
+scribo_core_paragraph_info \
+scribo_core_paragraph_set \
scribo_core_tag_anchor \
scribo_core_tag_component \
scribo_core_tag_line \
@@ -111,6 +114,7 @@ scribo_filter_common_objects_photo \
scribo_filter_internal_alignment_angle \
scribo_filter_internal_component_aligned \
scribo_filter_internal_compute \
+scribo_filter_line_links_x_height \
scribo_filter_object_groups_size_ratio \
scribo_filter_object_groups_small \
scribo_filter_object_groups_v_thickness \
@@ -185,6 +189,7 @@ scribo_primitive_group_from_double_link \
scribo_primitive_group_from_double_link_any \
scribo_primitive_group_from_graph \
scribo_primitive_group_from_single_link \
+scribo_primitive_identify \
scribo_primitive_internal_all \
scribo_primitive_internal_find_graph_link \
scribo_primitive_internal_find_root \
@@ -244,8 +249,10 @@ scribo_text_all \
scribo_text_clean \
scribo_text_clean_inplace \
scribo_text_extract_lines \
+scribo_text_link_lines \
scribo_text_look_like_text_lines \
scribo_text_merging \
+scribo_toolchain_content_in_doc \
scribo_toolchain_internal_text_in_doc_preprocess_functor \
scribo_toolchain_internal_toolchain_functor \
scribo_toolchain_text_in_doc_preprocess \
@@ -277,10 +284,13 @@ scribo_core_document_SOURCES = scribo_core_document.cc
scribo_core_erase_objects_SOURCES = scribo_core_erase_objects.cc
scribo_core_init_integral_image_SOURCES = scribo_core_init_integral_image.cc
scribo_core_line_info_SOURCES = scribo_core_line_info.cc
+scribo_core_line_links_SOURCES = scribo_core_line_links.cc
scribo_core_line_set_SOURCES = scribo_core_line_set.cc
scribo_core_macros_SOURCES = scribo_core_macros.cc
scribo_core_object_groups_SOURCES = scribo_core_object_groups.cc
scribo_core_object_links_SOURCES = scribo_core_object_links.cc
+scribo_core_paragraph_info_SOURCES = scribo_core_paragraph_info.cc
+scribo_core_paragraph_set_SOURCES = scribo_core_paragraph_set.cc
scribo_core_tag_anchor_SOURCES = scribo_core_tag_anchor.cc
scribo_core_tag_component_SOURCES = scribo_core_tag_component.cc
scribo_core_tag_line_SOURCES = scribo_core_tag_line.cc
@@ -312,6 +322,7 @@ scribo_filter_common_objects_photo_SOURCES = scribo_filter_common_objects_photo.
scribo_filter_internal_alignment_angle_SOURCES = scribo_filter_internal_alignment_angle.cc
scribo_filter_internal_component_aligned_SOURCES = scribo_filter_internal_component_aligned.cc
scribo_filter_internal_compute_SOURCES = scribo_filter_internal_compute.cc
+scribo_filter_line_links_x_height_SOURCES = scribo_filter_line_links_x_height.cc
scribo_filter_object_groups_size_ratio_SOURCES = scribo_filter_object_groups_size_ratio.cc
scribo_filter_object_groups_small_SOURCES = scribo_filter_object_groups_small.cc
scribo_filter_object_groups_v_thickness_SOURCES = scribo_filter_object_groups_v_thickness.cc
@@ -386,6 +397,7 @@ scribo_primitive_group_from_double_link_SOURCES = scribo_primitive_group_from_do
scribo_primitive_group_from_double_link_any_SOURCES = scribo_primitive_group_from_double_link_any.cc
scribo_primitive_group_from_graph_SOURCES = scribo_primitive_group_from_graph.cc
scribo_primitive_group_from_single_link_SOURCES = scribo_primitive_group_from_single_link.cc
+scribo_primitive_identify_SOURCES = scribo_primitive_identify.cc
scribo_primitive_internal_all_SOURCES = scribo_primitive_internal_all.cc
scribo_primitive_internal_find_graph_link_SOURCES = scribo_primitive_internal_find_graph_link.cc
scribo_primitive_internal_find_root_SOURCES = scribo_primitive_internal_find_root.cc
@@ -445,8 +457,10 @@ scribo_text_all_SOURCES = scribo_text_all.cc
scribo_text_clean_SOURCES = scribo_text_clean.cc
scribo_text_clean_inplace_SOURCES = scribo_text_clean_inplace.cc
scribo_text_extract_lines_SOURCES = scribo_text_extract_lines.cc
+scribo_text_link_lines_SOURCES = scribo_text_link_lines.cc
scribo_text_look_like_text_lines_SOURCES = scribo_text_look_like_text_lines.cc
scribo_text_merging_SOURCES = scribo_text_merging.cc
+scribo_toolchain_content_in_doc_SOURCES = scribo_toolchain_content_in_doc.cc
scribo_toolchain_internal_text_in_doc_preprocess_functor_SOURCES = scribo_toolchain_internal_text_in_doc_preprocess_functor.cc
scribo_toolchain_internal_toolchain_functor_SOURCES = scribo_toolchain_internal_toolchain_functor.cc
scribo_toolchain_text_in_doc_preprocess_SOURCES = scribo_toolchain_text_in_doc_preprocess.cc
--
1.5.6.5
1
0
last-svn-commit-746-g9549f83 Add paragraph processing in content_in_doc toolchain.
by Guillaume Lazzara 26 Jan '11
by Guillaume Lazzara 26 Jan '11
26 Jan '11
* scribo/toolchain/content_in_doc.hh: New.
* scribo/toolchain/internal/content_in_doc_functor.hh: Use
paragraph related routines.
* src/content_in_doc.cc: Update use of content_in_doc_functor.
---
scribo/ChangeLog | 11 ++
.../{text_in_doc.hh => content_in_doc.hh} | 52 +++++-----
.../toolchain/internal/content_in_doc_functor.hh | 106 +++++++++++++++++++-
scribo/src/content_in_doc.cc | 48 +++-------
4 files changed, 155 insertions(+), 62 deletions(-)
copy scribo/scribo/toolchain/{text_in_doc.hh => content_in_doc.hh} (57%)
diff --git a/scribo/ChangeLog b/scribo/ChangeLog
index c947550..809ce7d 100644
--- a/scribo/ChangeLog
+++ b/scribo/ChangeLog
@@ -1,3 +1,14 @@
+2011-01-26 Guillaume Lazzara <z(a)lrde.epita.fr>
+
+ Add paragraph processing in content_in_doc toolchain.
+
+ * scribo/toolchain/content_in_doc.hh: New.
+
+ * scribo/toolchain/internal/content_in_doc_functor.hh: Use
+ paragraph related routines.
+
+ * src/content_in_doc.cc: Update use of content_in_doc_functor.
+
2011-01-25 Guillaume Lazzara <z(a)lrde.epita.fr>
Small fixes in Scribo.
diff --git a/scribo/scribo/toolchain/text_in_doc.hh b/scribo/scribo/toolchain/content_in_doc.hh
similarity index 57%
copy from scribo/scribo/toolchain/text_in_doc.hh
copy to scribo/scribo/toolchain/content_in_doc.hh
index e6ba69e..f2938d9 100644
--- a/scribo/scribo/toolchain/text_in_doc.hh
+++ b/scribo/scribo/toolchain/content_in_doc.hh
@@ -1,5 +1,4 @@
-// Copyright (C) 2009, 2010 EPITA Research and Development Laboratory
-// (LRDE)
+// Copyright (C) 2011 EPITA Research and Development Laboratory (LRDE)
//
// This file is part of Olena.
//
@@ -24,14 +23,14 @@
// exception does not however invalidate any other reasons why the
// executable file might be covered by the GNU General Public License.
-#ifndef SCRIBO_TOOLCHAIN_TEXT_IN_DOC_HH
-# define SCRIBO_TOOLCHAIN_TEXT_IN_DOC_HH
+#ifndef SCRIBO_TOOLCHAIN_CONTENT_IN_DOC_HH
+# define SCRIBO_TOOLCHAIN_CONTENT_IN_DOC_HH
/// \file
///
-/// Extract text from a document.
+/// Analyse a document.
-# include <scribo/toolchain/internal/text_in_doc_functor.hh>
+# include <scribo/toolchain/internal/content_in_doc_functor.hh>
namespace scribo
{
@@ -42,36 +41,41 @@ namespace scribo
using namespace mln;
- template <typename I>
- line_set<mln_ch_value(I, def::lbl_type)>
- text_in_doc(const Image<I>& input, bool denoise,
- const std::string& language = std::string("eng"),
- bool find_line_seps = true,
- bool find_whitespace_seps = true,
- bool debug = false);
+ template <typename I, typename J>
+ document<mln_ch_value(I, def::lbl_type)>
+ content_in_doc(const Image<I>& input, const Image<J>& input_preproc,
+ bool denoise,
+ const std::string& language = std::string("eng"),
+ bool find_line_seps = true,
+ bool find_whitespace_seps = true,
+ bool debug = false);
# ifndef MLN_INCLUDE_ONLY
- template <typename I>
- line_set<mln_ch_value(I, def::lbl_type)>
- text_in_doc(const Image<I>& input, bool denoise,
- const std::string& language = std::string("eng"),
- bool find_line_seps = true,
- bool find_whitespace_seps = true,
- bool debug = false)
+ template <typename I, typename J>
+ document<mln_ch_value(I, def::lbl_type)>
+ content_in_doc(const Image<I>& input, const Image<J>& input_preproc,
+ bool denoise,
+ const std::string& language = std::string("eng"),
+ bool find_line_seps = true,
+ bool find_whitespace_seps = true,
+ bool debug = false)
{
- internal::text_in_doc_functor<I> f;
+ mln_precondition(input.is_valid());
+ mln_precondition(input_preproc.is_valid());
+
+ internal::content_in_doc_functor<J> f("noname");
f.enable_denoising = denoise;
f.enable_line_seps = find_line_seps;
f.enable_whitespace_seps = find_whitespace_seps;
f.enable_debug = debug;
f.ocr_language = language;
- line_set<mln_ch_value(I, def::lbl_type)> lines = f(input);
+ document<mln_ch_value(I, def::lbl_type)> doc = f(input, input_preproc);
- return lines;
+ return doc;
}
@@ -83,5 +87,5 @@ namespace scribo
} // end of namespace scribo
-#endif // SCRIBO_TOOLCHAIN_TEXT_IN_DOC_HH
+#endif // SCRIBO_TOOLCHAIN_CONTENT_IN_DOC_HH
diff --git a/scribo/scribo/toolchain/internal/content_in_doc_functor.hh b/scribo/scribo/toolchain/internal/content_in_doc_functor.hh
index 12e5137..7c665e5 100644
--- a/scribo/scribo/toolchain/internal/content_in_doc_functor.hh
+++ b/scribo/scribo/toolchain/internal/content_in_doc_functor.hh
@@ -30,14 +30,20 @@
# include <scribo/core/def/lbl_type.hh>
# include <scribo/core/document.hh>
+# include <scribo/core/line_set.hh>
+# include <scribo/core/paragraph_set.hh>
# include <scribo/primitive/extract/elements.hh>
# include <scribo/primitive/extract/components.hh>
# include <scribo/primitive/extract/vertical_separators.hh>
# include <scribo/primitive/extract/separators_nonvisible.hh>
+# include <scribo/primitive/extract/elements.hh>
+
+# include <scribo/primitive/identify.hh>
# include <scribo/primitive/remove/separators.hh>
+# include <scribo/filter/line_links_x_height.hh>
# include <scribo/filter/object_links_bbox_h_ratio.hh>
# include <scribo/filter/objects_small.hh>
@@ -52,6 +58,7 @@
# include <scribo/text/recognition.hh>
# include <scribo/text/merging.hh>
+# include <scribo/text/link_lines.hh>
# include <scribo/make/debug_filename.hh>
@@ -336,6 +343,7 @@ namespace scribo
lines = scribo::text::merging(lines);
+ //===== DEBUG =====
if (enable_debug)
{
@@ -353,24 +361,116 @@ namespace scribo
}
+ //===== END OF DEBUG =====
on_progress();
- on_new_progress_label("Recognizing text");
+ // Text recognition
+ on_new_progress_label("Recognizing text");
scribo::text::recognition(lines, ocr_language.c_str());
- doc.set_text(lines);
on_progress();
+
+ // Link text lines
+ on_new_progress_label("Linking text lines");
+ line_links<L> llinks = scribo::text::link_lines(lines);
+
+
+ //===== DEBUG =====
+ if (enable_debug)
+ {
+ image2d<value::rgb8> debug = data::convert(value::rgb8(), original_image);
+ for_all_lines(l, lines)
+ {
+ if (! lines(l).is_valid() || lines(l).is_hidden() || lines(l).type() != line::Text)
+ continue;
+
+ mln::draw::box(debug, lines(l).bbox(), literal::blue);
+ mln::draw::line(debug, lines(l).bbox().pcenter(), lines(llinks(l)).bbox().pcenter(), literal::green);
+ }
+
+ mln::io::ppm::save(debug, scribo::make::debug_filename("links_raw.ppm"));
+ }
+ //===== END OF DEBUG =====
+
+ on_progress();
+
+
+ // Filter line links.
+ on_new_progress_label("Filter line links");
+ llinks = scribo::filter::line_links_x_height(llinks);
+
+ //===== DEBUG =====
+ if (enable_debug)
+ {
+ image2d<value::rgb8> debug = data::convert(value::rgb8(), original_image);
+ for_all_links(i, llinks)
+ if (llinks(i) && llinks(i) != i)
+ mln::draw::line(debug, lines(i).bbox().pcenter(),
+ lines(llinks(i)).bbox().pcenter(), literal::red);
+
+ mln::io::ppm::save(debug, scribo::make::debug_filename("links.ppm"));
+
+
+ for (unsigned i = 1; i < llinks.nelements(); ++i)
+ llinks(i) = scribo::make::internal::find_root(llinks, i);
+
+ debug = data::convert(value::rgb8(), original_image);
+ mln::util::array<accu::shape::bbox<point2d> > nbbox(llinks.nelements());
+ for_all_lines(i, lines)
+ {
+ if (! lines(i).is_valid() || lines(i).is_hidden() || lines(i).type() != line::Text)
+ continue;
+
+ mln::draw::box(debug, lines(i).bbox(), literal::red);
+ nbbox(llinks(i)).take(lines(i).bbox());
+ }
+
+ for (unsigned i = 1; i < nbbox.nelements(); ++i)
+ if (nbbox(i).is_valid())
+ {
+ box2d b = nbbox(i).to_result();
+ mln::draw::box(debug, b, literal::green);
+ b.enlarge(1);
+ mln::draw::box(debug, b, literal::green);
+ b.enlarge(1);
+ mln::draw::box(debug, b, literal::green);
+ }
+
+ mln::io::ppm::save(debug, scribo::make::debug_filename("par.ppm"));
+ }
+ //===== END OF DEBUG =====
+
+ on_progress();
+
+
+ // Construct paragraphs
+ on_new_progress_label("Constructing paragraphs");
+ scribo::paragraph_set<L> parset = scribo::make::paragraph(llinks);
+ doc.set_paragraphs(parset);
+
+ on_progress();
+
+
// Extract other Elements
on_new_progress_label("Extracting Elements");
component_set<L>
elements = scribo::primitive::extract::elements(doc, original_image);
+
+ on_progress();
+
+
+ // Identify other Elements
+ on_new_progress_label("Identifying Elements");
+ elements = scribo::primitive::identify(elements);
doc.set_elements(elements);
on_progress();
+
+
// Saving results
if (save_doc_as_xml)
{
@@ -391,7 +491,7 @@ namespace scribo
int
content_in_doc_functor<I>::nsteps() const
{
- return 7 + enable_denoising + enable_line_seps
+ return 11 + enable_denoising + enable_line_seps
+ enable_whitespace_seps + save_doc_as_xml;
}
diff --git a/scribo/src/content_in_doc.cc b/scribo/src/content_in_doc.cc
index f453f08..8cd262b 100644
--- a/scribo/src/content_in_doc.cc
+++ b/scribo/src/content_in_doc.cc
@@ -30,38 +30,23 @@
#include <iostream>
#include <mln/core/image/image2d.hh>
-#include <mln/core/alias/neighb2d.hh>
#include <mln/io/pbm/save.hh>
#include <mln/io/magick/load.hh>
-#include <mln/value/label_8.hh>
-
-#include <mln/core/var.hh>
-
-#include <mln/accu/count_value.hh>
-
-#include <mln/draw/box_plain.hh>
-
-
-#include <scribo/toolchain/text_in_doc.hh>
+#include <scribo/toolchain/content_in_doc.hh>
#include <scribo/toolchain/text_in_doc_preprocess.hh>
#include <scribo/core/document.hh>
-#include <scribo/core/line_set.hh>
#include <scribo/debug/usage.hh>
#include <scribo/make/debug_filename.hh>
-#include <scribo/primitive/extract/elements.hh>
-
#include <scribo/preprocessing/crop_without_localization.hh>
#include <scribo/preprocessing/crop.hh>
#include <scribo/io/xml/save.hh>
-#include <scribo/io/text_boxes/save.hh>
-
const char *args_desc[][2] =
@@ -108,11 +93,11 @@ int main(int argc, char* argv[])
Magick::InitializeMagick(*argv);
typedef image2d<scribo::def::lbl_type> L;
- scribo::document<L> doc(argv[1]);
- doc.open();
+ image2d<value::rgb8> input;
+ mln::io::magick::load(input, argv[1]);
// Preprocess document
- image2d<bool> input;
+ image2d<bool> input_preproc;
{
double K = 0.34;
if (argc == 8 || argc == 12 || argc >= 12)
@@ -125,7 +110,7 @@ int main(int argc, char* argv[])
}
image2d<bool> tmp_fg;
- input = toolchain::text_in_doc_preprocess(doc.image(), false, K);
+ input_preproc = toolchain::text_in_doc_preprocess(input, false, K);
}
// Optional Cropping
@@ -142,12 +127,12 @@ int main(int argc, char* argv[])
<< " to (" << maxr << "," << maxc << ")" << std::endl;
box2d roi = mln::make::box2d(minr, minc, maxr, maxc);
- input = preprocessing::crop_without_localization(input, roi);
+ input_preproc = preprocessing::crop_without_localization(input_preproc, roi);
crop_shift = point2d(minr, minc);
if (debug)
- mln::io::pbm::save(input,
- scribo::make::debug_filename("input_cropped.pbm"));
+ mln::io::pbm::save(input_preproc,
+ scribo::make::debug_filename("input_preproc_cropped.pbm"));
}
bool denoise = (argc > 3 && atoi(argv[3]) != 0);
@@ -174,18 +159,11 @@ int main(int argc, char* argv[])
// Run document toolchain.
// Text
- std::cout << "Extracting text" << std::endl;
- line_set<L>
- lines = scribo::toolchain::text_in_doc(input, denoise, language,
- find_line_seps, find_whitespace_seps,
- debug);
- doc.set_text(lines);
-
- // Elements
- std::cout << "Extracting Elements" << std::endl;
- component_set<L> elements = scribo::primitive::extract::elements(doc, input);
- doc.set_elements(elements);
-
+ std::cout << "Analysing document..." << std::endl;
+ document<L>
+ doc = scribo::toolchain::content_in_doc(input, input_preproc, denoise, language,
+ find_line_seps, find_whitespace_seps,
+ debug);
// Saving results
scribo::io::xml::save(doc, argv[2], true);
--
1.5.6.5
1
0
* scribo/core/paragraph_set.hh,
* scribo/core/document.hh: Add new methods.
* scribo/core/macros.hh: Add a missing macro.
* scribo/io/xml/save.hh,
* scribo/primitive/extract/elements.hh: Make use of the methods in
document class.
* src/pbm_text_in_doc.cc: Add recognized language as an option.
* tests/toolchain/nepomuk/text_extraction.cc: Make the test not
case dependent.
---
scribo/ChangeLog | 18 +++++++++
scribo/scribo/core/document.hh | 41 +++++++++++++++------
scribo/scribo/core/macros.hh | 3 ++
scribo/scribo/core/paragraph_set.hh | 20 ++++++++++
scribo/scribo/io/xml/save.hh | 20 +++++-----
scribo/scribo/primitive/extract/elements.hh | 5 ++-
scribo/src/pbm_text_in_doc.cc | 40 ++++++++++++--------
scribo/tests/toolchain/nepomuk/text_extraction.cc | 4 +-
8 files changed, 110 insertions(+), 41 deletions(-)
diff --git a/scribo/ChangeLog b/scribo/ChangeLog
index 3ec57f0..c947550 100644
--- a/scribo/ChangeLog
+++ b/scribo/ChangeLog
@@ -1,5 +1,23 @@
2011-01-25 Guillaume Lazzara <z(a)lrde.epita.fr>
+ Small fixes in Scribo.
+
+ * scribo/core/paragraph_set.hh,
+ * scribo/core/document.hh: Add new methods.
+
+ * scribo/core/macros.hh: Add a missing macro.
+
+ * scribo/io/xml/save.hh,
+ * scribo/primitive/extract/elements.hh: Make use of the methods in
+ document class.
+
+ * src/pbm_text_in_doc.cc: Add recognized language as an option.
+
+ * tests/toolchain/nepomuk/text_extraction.cc: Make the test not
+ case dependent.
+
+2011-01-25 Guillaume Lazzara <z(a)lrde.epita.fr>
+
Add an option to choose the recognized language.
* scribo/toolchain/nepomuk/text_extraction.hh,
diff --git a/scribo/scribo/core/document.hh b/scribo/scribo/core/document.hh
index f4a78ff..b547da4 100644
--- a/scribo/scribo/core/document.hh
+++ b/scribo/scribo/core/document.hh
@@ -1,4 +1,5 @@
-// Copyright (C) 2010 EPITA Research and Development Laboratory (LRDE)
+// Copyright (C) 2010, 2011 EPITA Research and Development Laboratory
+// (LRDE)
//
// This file is part of Olena.
//
@@ -59,15 +60,24 @@ namespace scribo
bool is_valid() const;
- const line_set<L>& text() const;
+ /*! \brief Check whether this document contains text.
+
+ If it returns true, that document contains paragraphs, lines and
+ text components.
+
+ */
bool has_text() const;
- void set_text(const line_set<L>& line);
+
+ mln::def::coord height() const;
+ mln::def::coord width() const;
+
+ const line_set<L>& lines() const;
const paragraph_set<L>& paragraphs() const;
void set_paragraphs(const paragraph_set<L>& parset);
- const component_set<L>& elements() const;
bool has_elements() const;
+ const component_set<L>& elements() const;
void set_elements(const component_set<L>& elements);
const mln::image2d<value::rgb8>& image() const;
@@ -150,10 +160,18 @@ namespace scribo
template <typename L>
- const line_set<L>&
- document<L>::text() const
+ mln::def::coord
+ document<L>::width() const
{
- return lines_;
+ return image_.ncols();
+ }
+
+
+ template <typename L>
+ mln::def::coord
+ document<L>::height() const
+ {
+ return image_.nrows();
}
@@ -161,17 +179,18 @@ namespace scribo
bool
document<L>::has_text() const
{
- return lines_.is_valid();
+ return parset_.is_valid();
}
template <typename L>
- void
- document<L>::set_text(const line_set<L>& line)
+ const line_set<L>&
+ document<L>::lines() const
{
- lines_ = line;
+ return parset_.lines();
}
+
template <typename L>
const paragraph_set<L>&
document<L>::paragraphs() const
diff --git a/scribo/scribo/core/macros.hh b/scribo/scribo/core/macros.hh
index 1060358..887539f 100644
--- a/scribo/scribo/core/macros.hh
+++ b/scribo/scribo/core/macros.hh
@@ -62,4 +62,7 @@
# define for_all_anchors(E, S) \
for_all_elements(E, S)
+# define for_all_paragraph_lines(E, S) \
+ for_all_elements(E, S)
+
#endif // ! SCRIBO_CORE_MACROS_HH
diff --git a/scribo/scribo/core/paragraph_set.hh b/scribo/scribo/core/paragraph_set.hh
index afb59c5..355eaa9 100644
--- a/scribo/scribo/core/paragraph_set.hh
+++ b/scribo/scribo/core/paragraph_set.hh
@@ -53,9 +53,13 @@ namespace scribo
paragraph_info<L>& operator()(unsigned i);
const paragraph_info<L>& operator()(unsigned i) const;
+ bool is_valid() const;
+
+ const line_set<L>& lines() const;
private:
mln::util::array<paragraph_info<L> > pars_;
+ line_set<L> lines_;
};
@@ -82,6 +86,7 @@ namespace scribo
paragraph_set<L>::paragraph_set(const line_links<L>& llinks, unsigned npars)
: pars_(npars + 1, paragraph_info<L>(llinks))
{
+ lines_ = llinks.lines();
}
template <typename L>
@@ -106,6 +111,21 @@ namespace scribo
}
+ template <typename L>
+ bool
+ paragraph_set<L>::is_valid() const
+ {
+ return !pars_.is_empty();
+ }
+
+
+ template <typename L>
+ const line_set<L>&
+ paragraph_set<L>::lines() const
+ {
+ return lines_;
+ }
+
namespace make
{
diff --git a/scribo/scribo/io/xml/save.hh b/scribo/scribo/io/xml/save.hh
index 1bcdd6f..41d4fef 100644
--- a/scribo/scribo/io/xml/save.hh
+++ b/scribo/scribo/io/xml/save.hh
@@ -130,9 +130,6 @@ namespace scribo
abort();
}
- const line_set<L>& lines = doc.text();
- const paragraph_set<L>& parset = doc.paragraphs();
-
std::map<char, std::string> html_map;
html_map['\"'] = "&quot;";
html_map['<'] = "&lt;";
@@ -150,13 +147,16 @@ namespace scribo
file << " </pcMetadata>" << std::endl;
file << " <page image_filename=\"" << doc.filename()
- << "\" image_width=\"" << lines.components().labeled_image().ncols()
- << "\" image_height=\"" << lines.components().labeled_image().nrows()
+ << "\" image_width=\"" << doc.width()
+ << "\" image_height=\"" << doc.height()
<< "\">" << std::endl;
// Text
if (doc.has_text())
{
+ const line_set<L>& lines = doc.lines();
+ const paragraph_set<L>& parset = doc.paragraphs();
+
for_all_paragraphs(p, parset)
{
const mln::util::array<line_id_t>& line_ids = parset(p).line_ids();
@@ -224,9 +224,6 @@ namespace scribo
abort();
}
- const line_set<L>& lines = doc.text();
- const paragraph_set<L>& parset = doc.paragraphs();
-
std::map<char, std::string> html_map;
html_map['\"'] = "&quot;";
html_map['<'] = "&lt;";
@@ -244,13 +241,16 @@ namespace scribo
file << " </pcMetadata>" << std::endl;
file << " <page image_filename=\"" << doc.filename()
- << "\" image_width=\"" << lines.components().labeled_image().ncols()
- << "\" image_height=\"" << lines.components().labeled_image().nrows()
+ << "\" image_width=\"" << doc.width()
+ << "\" image_height=\"" << doc.height()
<< "\">" << std::endl;
// Text
if (doc.has_text())
{
+ const line_set<L>& lines = doc.lines();
+ const paragraph_set<L>& parset = doc.paragraphs();
+
for_all_paragraphs(p, parset)
{
const mln::util::array<line_id_t>& line_ids = parset(p).line_ids();
diff --git a/scribo/scribo/primitive/extract/elements.hh b/scribo/scribo/primitive/extract/elements.hh
index 2e6a0cb..ddf2c92 100644
--- a/scribo/scribo/primitive/extract/elements.hh
+++ b/scribo/scribo/primitive/extract/elements.hh
@@ -1,4 +1,5 @@
-// Copyright (C) 2010 EPITA Research and Development Laboratory (LRDE)
+// Copyright (C) 2010, 2011 EPITA Research and Development Laboratory
+// (LRDE)
//
// This file is part of Olena.
//
@@ -114,7 +115,7 @@ namespace scribo
mln_precondition(doc.is_valid());
mln_precondition(input.is_valid());
- const line_set<L>& lines = doc.text();
+ const line_set<L>& lines = doc.lines();
// Element extraction
diff --git a/scribo/src/pbm_text_in_doc.cc b/scribo/src/pbm_text_in_doc.cc
index 2240f42..721ff47 100644
--- a/scribo/src/pbm_text_in_doc.cc
+++ b/scribo/src/pbm_text_in_doc.cc
@@ -1,5 +1,5 @@
-// Copyright (C) 2009, 2010 EPITA Research and Development Laboratory
-// (LRDE)
+// Copyright (C) 2009, 2010, 2011 EPITA Research and Development
+// Laboratory (LRDE)
//
// This file is part of Olena.
//
@@ -49,7 +49,6 @@
#include <scribo/preprocessing/crop_without_localization.hh>
-#include <scribo/io/xml/save.hh>
#include <scribo/io/text_boxes/save.hh>
@@ -65,6 +64,7 @@ for the background." },
{ "pmin_col", "Col index of the top left corner of the Region of interest." },
{ "pmax_row", "Row index of the bottom right corner of the Region of interest." },
{ "pmax_col", "Col index of the bottom right corner of the Region of interest." },
+ { "language", "Language to be used for the text recognition. [eng|fra] (Default: eng)" },
{ "find_lines", "Find vertical lines. (Default 1)" },
{ "find_whitespaces", "Find whitespaces separators. (Default 1)" },
{ "debug_dir", "Output directory for debug image" },
@@ -77,16 +77,16 @@ int main(int argc, char* argv[])
using namespace scribo;
using namespace mln;
- if (argc != 3 && argc != 4 && argc != 5 && argc != 8 && argc != 9)
+ if (argc < 3 || argc > 12)
return scribo::debug::usage(argv,
"Find text lines using left/right validation and display x-height in a binarized article.",
- "input.pbm out.txt <denoise_enabled> [<pmin_row> <pmin_col> <pmax_row> <pmax_col>] <find_lines> <find_whitespaces> <debug_dir>",
+ "input.pbm out.txt <denoise_enabled> [<pmin_row> <pmin_col> <pmax_row> <pmax_col>] <language> <find_lines> <find_whitespaces> <debug_dir>",
args_desc);
bool debug = false;
// Enable debug output.
- if (argc == 7 || argc == 11)
+ if (argc == 8 || argc == 12)
{
scribo::make::internal::debug_filename_prefix = argv[argc - 1];
debug = true;
@@ -101,7 +101,7 @@ int main(int argc, char* argv[])
// Optional Cropping
point2d crop_shift = literal::origin;
- if (argc >= 11)
+ if (argc >= 12)
{
mln::def::coord
minr = atoi(argv[4]),
@@ -120,13 +120,24 @@ int main(int argc, char* argv[])
bool denoise = (argc > 3 && atoi(argv[3]) != 0);
+ std::string language = "eng";
+ if (argc > 4 && argc < 12)
+ language = argv[4];
+ else if (argc == 12)
+ language = argv[8];
+
bool find_line_seps = true;
- if (argc >= 4 && argc < 11)
- find_line_seps = (atoi(argv[3]) != 0);
+ if (argc > 5 && argc < 12)
+ find_line_seps = (atoi(argv[5]) != 0);
+ else if (argc == 12)
+ find_line_seps = (atoi(argv[9]) != 0);
bool find_whitespace_seps = true;
- if (argc >= 5 && argc < 11)
- find_line_seps = (atoi(argv[4]) != 0);
+ if (argc > 6 && argc < 12)
+ find_whitespace_seps = (atoi(argv[6]) != 0);
+ else if (argc == 12)
+ find_whitespace_seps = (atoi(argv[10]) != 0);
+
std::cout << "Running with the following options :"
<< "find_lines_seps = " << find_line_seps
@@ -136,15 +147,12 @@ int main(int argc, char* argv[])
// Run document toolchain.
line_set<L>
- lines = scribo::toolchain::text_in_doc(input, denoise, find_line_seps,
+ lines = scribo::toolchain::text_in_doc(input, denoise,
+ language, find_line_seps,
find_whitespace_seps, debug);
scribo::document<L> doc;
doc.set_filename(argv[1]);
- doc.set_text(lines);
-
- // Saving results
- scribo::io::xml::save(doc, "out.xml", true);
// Specify shift due to potential previous crop.
scribo::io::text_boxes::save(lines, argv[2], crop_shift);
diff --git a/scribo/tests/toolchain/nepomuk/text_extraction.cc b/scribo/tests/toolchain/nepomuk/text_extraction.cc
index 7191650..eeafd6b 100644
--- a/scribo/tests/toolchain/nepomuk/text_extraction.cc
+++ b/scribo/tests/toolchain/nepomuk/text_extraction.cc
@@ -43,9 +43,9 @@ int main()
QImage ima(SCRIBO_IMG_DIR "/wildly.pbm");
QSet<QString> words = scribo::toolchain::nepomuk::text_extraction(ima);
- words = words.toLower();
mln_assertion(words.size() == 1);
- mln_assertion(words.contains("wildly"));
+ QString word = words.toList().at(0).toLower();
+ mln_assertion(word == "wildly");
return 0;
}
--
1.5.6.5
1
0
last-svn-commit-744-g6ba320b Add an option to choose the recognized language.
by Guillaume Lazzara 25 Jan '11
by Guillaume Lazzara 25 Jan '11
25 Jan '11
* scribo/toolchain/nepomuk/text_extraction.hh,
* scribo/toolchain/text_in_doc.hh,
* src/content_in_doc.cc: Here.
---
scribo/ChangeLog | 8 ++++
scribo/scribo/toolchain/nepomuk/text_extraction.hh | 20 +++++++----
scribo/scribo/toolchain/text_in_doc.hh | 3 ++
scribo/src/content_in_doc.cc | 36 ++++++++++++--------
4 files changed, 46 insertions(+), 21 deletions(-)
diff --git a/scribo/ChangeLog b/scribo/ChangeLog
index 360f65d..3ec57f0 100644
--- a/scribo/ChangeLog
+++ b/scribo/ChangeLog
@@ -1,5 +1,13 @@
2011-01-25 Guillaume Lazzara <z(a)lrde.epita.fr>
+ Add an option to choose the recognized language.
+
+ * scribo/toolchain/nepomuk/text_extraction.hh,
+ * scribo/toolchain/text_in_doc.hh,
+ * src/content_in_doc.cc: Here.
+
+2011-01-25 Guillaume Lazzara <z(a)lrde.epita.fr>
+
* scribo/text/recognition.hh: Make use of TessBaseAPI::setImage.
2011-01-25 Guillaume Lazzara <z(a)lrde.epita.fr>
diff --git a/scribo/scribo/toolchain/nepomuk/text_extraction.hh b/scribo/scribo/toolchain/nepomuk/text_extraction.hh
index effb13f..6def090 100644
--- a/scribo/scribo/toolchain/nepomuk/text_extraction.hh
+++ b/scribo/scribo/toolchain/nepomuk/text_extraction.hh
@@ -24,8 +24,8 @@
// executable file might be covered by the GNU General Public License.
-#ifndef SCRIBO_TOOLCHAIN_TEXT_EXTRACTION_HH
-# define SCRIBO_TOOLCHAIN_TEXT_EXTRACTION_HH
+#ifndef SCRIBO_TOOLCHAIN_NEPOMUK_TEXT_EXTRACTION_HH
+# define SCRIBO_TOOLCHAIN_NEPOMUK_TEXT_EXTRACTION_HH
/// \file
///
@@ -77,13 +77,13 @@ namespace scribo
*/
QSet<QString>
- text_extraction(const QImage& input);
+ text_extraction(const QImage& input, const QString& language);
# ifndef MLN_INCLUDE_ONLY
QSet<QString>
- text_extraction(const QImage& input)
+ text_extraction(const QImage& input, const QString& language = QString("eng"))
{
trace::entering("scribo::toolchain::nepomuk::text_extraction");
@@ -119,13 +119,19 @@ namespace scribo
// Process
{
// Run document toolchain.
- lines_bg = scribo::toolchain::text_in_doc(input_bin, true, false);
+ lines_bg = scribo::toolchain::text_in_doc(input_bin,
+ true,
+ language.toUtf8().data(),
+ false);
// Negate document.
logical::not_inplace(input_bin);
// Run document toolchain.
- lines_fg = scribo::toolchain::text_in_doc(input_bin, true, false);
+ lines_fg = scribo::toolchain::text_in_doc(input_bin,
+ true,
+ language.toUtf8().data(),
+ false);
}
@@ -165,4 +171,4 @@ namespace scribo
} // end of namespace scribo
-#endif // ! SCRIBO_TOOLCHAIN_TEXT_EXTRACTION_HH
+#endif // ! SCRIBO_TOOLCHAIN_NEPOMUK_TEXT_EXTRACTION_HH
diff --git a/scribo/scribo/toolchain/text_in_doc.hh b/scribo/scribo/toolchain/text_in_doc.hh
index 0ad6cf3..e6ba69e 100644
--- a/scribo/scribo/toolchain/text_in_doc.hh
+++ b/scribo/scribo/toolchain/text_in_doc.hh
@@ -45,6 +45,7 @@ namespace scribo
template <typename I>
line_set<mln_ch_value(I, def::lbl_type)>
text_in_doc(const Image<I>& input, bool denoise,
+ const std::string& language = std::string("eng"),
bool find_line_seps = true,
bool find_whitespace_seps = true,
bool debug = false);
@@ -56,6 +57,7 @@ namespace scribo
template <typename I>
line_set<mln_ch_value(I, def::lbl_type)>
text_in_doc(const Image<I>& input, bool denoise,
+ const std::string& language = std::string("eng"),
bool find_line_seps = true,
bool find_whitespace_seps = true,
bool debug = false)
@@ -65,6 +67,7 @@ namespace scribo
f.enable_line_seps = find_line_seps;
f.enable_whitespace_seps = find_whitespace_seps;
f.enable_debug = debug;
+ f.ocr_language = language;
line_set<mln_ch_value(I, def::lbl_type)> lines = f(input);
diff --git a/scribo/src/content_in_doc.cc b/scribo/src/content_in_doc.cc
index 2c31d90..f453f08 100644
--- a/scribo/src/content_in_doc.cc
+++ b/scribo/src/content_in_doc.cc
@@ -1,4 +1,5 @@
-// Copyright (C) 2010 EPITA Research and Development Laboratory (LRDE)
+// Copyright (C) 2010, 2011 EPITA Research and Development Laboratory
+// (LRDE)
//
// This file is part of Olena.
//
@@ -72,6 +73,7 @@ const char *args_desc[][2] =
{ "pmin_col", "Col index of the top left corner of the Region of interest." },
{ "pmax_row", "Row index of the bottom right corner of the Region of interest." },
{ "pmax_col", "Col index of the bottom right corner of the Region of interest." },
+ { "language", "Language to be used for the text recognition. [eng|fra] (Default: eng)" },
{ "find_lines", "Find vertical lines. (Default 1)" },
{ "find_whitespaces", "Find whitespaces separators. (Default 1)" },
{ "K", "Sauvola's binarization threshold parameter. (Default: 0.34)" },
@@ -86,16 +88,16 @@ int main(int argc, char* argv[])
using namespace scribo;
using namespace mln;
- if (argc < 3 || (argc > 8 && argc != 12))
+ if (argc < 3 || argc > 14)
return scribo::debug::usage(argv,
"Find text lines and elements in a document",
- "input.* out.xml <denoise_enabled> [<pmin_row> <pmin_col> <pmax_row> <pmax_col>] <find_lines> <find_whitespaces> <K> <debug_dir>",
+ "input.* out.xml <denoise_enabled> [<pmin_row> <pmin_col> <pmax_row> <pmax_col>] [language] [find_lines] [find_whitespaces] [K] [debug_dir]",
args_desc);
bool debug = false;
// Enable debug output.
- if (argc == 8 || argc == 12)
+ if (argc == 9 || argc == 13)
{
scribo::make::internal::debug_filename_prefix = argv[argc - 1];
debug = true;
@@ -113,10 +115,10 @@ int main(int argc, char* argv[])
image2d<bool> input;
{
double K = 0.34;
- if (argc == 7 || argc == 8 || argc == 11)
+ if (argc == 8 || argc == 12 || argc >= 12)
{
- if (argc == 7)
- K = atof(argv[6]);
+ if (argc == 8)
+ K = atof(argv[7]);
else
K = atof(argv[argc - 2]);
std::cout << "Using K = " << K << std::endl;
@@ -150,16 +152,21 @@ int main(int argc, char* argv[])
bool denoise = (argc > 3 && atoi(argv[3]) != 0);
+ std::string language = "eng";
+ if (argc >= 5 && argc < 13)
+ language = argv[4];
+
bool find_line_seps = true;
- if (argc >= 5 && argc < 12)
- find_line_seps = (atoi(argv[4]) != 0);
+ if (argc >= 6 && argc < 13)
+ find_line_seps = (atoi(argv[5]) != 0);
bool find_whitespace_seps = true;
- if (argc >= 6 && argc < 12)
- find_line_seps = (atoi(argv[5]) != 0);
+ if (argc >= 7 && argc < 13)
+ find_line_seps = (atoi(argv[6]) != 0);
std::cout << "Running with the following options :"
- << "find_lines_seps = " << find_line_seps
+ << " ocr_language = " << language
+ << " | find_lines_seps = " << find_line_seps
<< " | find_whitespace_seps = " << find_whitespace_seps
<< " | debug = " << debug
<< std::endl;
@@ -169,8 +176,9 @@ int main(int argc, char* argv[])
// Text
std::cout << "Extracting text" << std::endl;
line_set<L>
- lines = scribo::toolchain::text_in_doc(input, denoise, find_line_seps,
- find_whitespace_seps, debug);
+ lines = scribo::toolchain::text_in_doc(input, denoise, language,
+ find_line_seps, find_whitespace_seps,
+ debug);
doc.set_text(lines);
// Elements
--
1.5.6.5
1
0
last-svn-commit-743-g0fd6cdf scribo/text/recognition.hh: Make use of TessBaseAPI::setImage.
by Guillaume Lazzara 25 Jan '11
by Guillaume Lazzara 25 Jan '11
25 Jan '11
---
scribo/ChangeLog | 4 ++++
scribo/scribo/text/recognition.hh | 11 +++++------
2 files changed, 9 insertions(+), 6 deletions(-)
diff --git a/scribo/ChangeLog b/scribo/ChangeLog
index 4574778..360f65d 100644
--- a/scribo/ChangeLog
+++ b/scribo/ChangeLog
@@ -1,5 +1,9 @@
2011-01-25 Guillaume Lazzara <z(a)lrde.epita.fr>
+ * scribo/text/recognition.hh: Make use of TessBaseAPI::setImage.
+
+2011-01-25 Guillaume Lazzara <z(a)lrde.epita.fr>
+
* scribo/text/recognition.hh: Fix invalid free().
2011-01-25 Guillaume Lazzara <z(a)lrde.epita.fr>
diff --git a/scribo/scribo/text/recognition.hh b/scribo/scribo/text/recognition.hh
index 7a72ee4..59f269e 100644
--- a/scribo/scribo/text/recognition.hh
+++ b/scribo/scribo/text/recognition.hh
@@ -180,14 +180,13 @@ namespace scribo
line_image.ncols(), // n cols
line_image.nrows()); // n rows
# else // HAVE_TESSERACT_3
- char* s = tess.TesseractRect(
+ tess.SetImage(
(unsigned char*) line_image.buffer(),
- sizeof (bool), // Pixel size.
- line_image.ncols() * sizeof (bool), // Row_offset
- 0, // Left
- 0, // Top
line_image.ncols(), // n cols
- line_image.nrows()); // n rows
+ line_image.nrows(), // n rows
+ sizeof (bool), // Pixel size.
+ line_image.ncols() * sizeof (bool)); // Row_offset
+ char* s = tess.GetUTF8Text();
# endif // ! HAVE_TESSERACT_2
if (s != 0)
--
1.5.6.5
1
0
last-svn-commit-742-gec99c0a scribo/text/recognition.hh: Fix invalid free().
by Guillaume Lazzara 25 Jan '11
by Guillaume Lazzara 25 Jan '11
25 Jan '11
---
scribo/ChangeLog | 4 ++++
scribo/scribo/text/recognition.hh | 15 +++++----------
2 files changed, 9 insertions(+), 10 deletions(-)
diff --git a/scribo/ChangeLog b/scribo/ChangeLog
index cfd40df..4574778 100644
--- a/scribo/ChangeLog
+++ b/scribo/ChangeLog
@@ -1,5 +1,9 @@
2011-01-25 Guillaume Lazzara <z(a)lrde.epita.fr>
+ * scribo/text/recognition.hh: Fix invalid free().
+
+2011-01-25 Guillaume Lazzara <z(a)lrde.epita.fr>
+
* scribo/io/xml/save.hh: Handle paragraphs and separators correctly
in XML output.
diff --git a/scribo/scribo/text/recognition.hh b/scribo/scribo/text/recognition.hh
index 8b243bd..7a72ee4 100644
--- a/scribo/scribo/text/recognition.hh
+++ b/scribo/scribo/text/recognition.hh
@@ -1,5 +1,5 @@
-// Copyright (C) 2009, 2010 EPITA Research and Development Laboratory
-// (LRDE)
+// Copyright (C) 2009, 2010, 2011 EPITA Research and Development
+// Laboratory (LRDE)
//
// This file is part of Olena.
//
@@ -118,15 +118,10 @@ namespace scribo
abort();
}
tess.SetPageSegMode(tesseract::PSM_SINGLE_LINE);
+
# endif // HAVE_TESSERACT_2
typedef mln_ch_value(L,bool) I;
- int vals[] = { 0, 9, 0, 9, 0,
- 9, 6, 4, 6, 9,
- 0, 4, 0, 4, 0,
- 9, 6, 4, 6, 9,
- 0, 9, 0, 9, 0 };
- w_window2d_int dmap_win = mln::make::w_window2d_int(vals);
/// Use text bboxes with Tesseract
@@ -203,7 +198,7 @@ namespace scribo
}
// The string has been allocated by Tesseract. It must be released.
- free(s);
+ delete [] s;
}
trace::exiting("scribo::text::recognition");
@@ -289,7 +284,7 @@ namespace scribo
}
// The string has been allocated by Tesseract. We must free it.
- free(s);
+ delete [] s;
if (!output_file.empty())
file.close();
--
1.5.6.5
1
0
last-svn-commit-741-g6ef044f scribo/io/xml/save.hh: Handle paragraphs and separators correctly in XML output.
by Guillaume Lazzara 25 Jan '11
by Guillaume Lazzara 25 Jan '11
25 Jan '11
---
scribo/ChangeLog | 5 +
scribo/scribo/io/xml/save.hh | 335 +++++++++++++++++++++++++++++-------------
2 files changed, 241 insertions(+), 99 deletions(-)
diff --git a/scribo/ChangeLog b/scribo/ChangeLog
index 2ba7d58..cfd40df 100644
--- a/scribo/ChangeLog
+++ b/scribo/ChangeLog
@@ -1,5 +1,10 @@
2011-01-25 Guillaume Lazzara <z(a)lrde.epita.fr>
+ * scribo/io/xml/save.hh: Handle paragraphs and separators correctly
+ in XML output.
+
+2011-01-25 Guillaume Lazzara <z(a)lrde.epita.fr>
+
Identify separators among non-text components.
* scribo/core/tag/component.hh: New Separator type.
diff --git a/scribo/scribo/io/xml/save.hh b/scribo/scribo/io/xml/save.hh
index d0c72e9..1bcdd6f 100644
--- a/scribo/scribo/io/xml/save.hh
+++ b/scribo/scribo/io/xml/save.hh
@@ -1,4 +1,5 @@
-// Copyright (C) 2010 EPITA Research and Development Laboratory (LRDE)
+// Copyright (C) 2010, 2011 EPITA Research and Development Laboratory
+// (LRDE)
//
// This file is part of Olena.
//
@@ -30,6 +31,7 @@
///
/// \brief Save document information as XML.
+# include <libgen.h>
# include <fstream>
# include <sstream>
@@ -58,12 +60,12 @@ namespace scribo
Its XSD file is located here:
http://schema.primaresearch.org/PAGE/gts/pagecontent/2009-03-16/pagecontent.xsd
- */
+ */
template <typename L>
void
save(const document<L>& doc,
const std::string& output_name,
- bool extended_format);
+ bool allow_extensions);
# ifndef MLN_INCLUDE_ONLY
@@ -77,14 +79,14 @@ namespace scribo
std::map<char, std::string>& map)
{
for (unsigned i = 0; i < input.size(); ++i)
+ {
+ std::map<char, std::string>::iterator it = map.find(input.at(i));
+ if (it != map.end())
{
- std::map<char, std::string>::iterator it = map.find(input.at(i));
- if (it != map.end())
- {
- input.replace(i, 1, it->second);
- i += it->second.size() - 1;
- }
+ input.replace(i, 1, it->second);
+ i += it->second.size() - 1;
}
+ }
return input;
}
@@ -112,142 +114,277 @@ namespace scribo
}
- } // end of namespace scribo::io::xml::internal
- template <typename L>
- void
- save(const document<L>& doc,
- const std::string& output_name,
- bool extended_format)
- {
- trace::entering("scribo::io::xml:save_text_lines");
- std::ofstream file(output_name.c_str());
- if (! file)
+ template <typename L>
+ void
+ save(const document<L>& doc,
+ const std::string& output_name)
{
- std::cerr << "error: cannot open file '" << doc.filename() << "'!";
- abort();
- }
+ trace::entering("scribo::io::xml:save_text_lines");
- const line_set<L>& lines = doc.text();
+ std::ofstream file(output_name.c_str());
+ if (! file)
+ {
+ std::cerr << "error: cannot open file '" << doc.filename() << "'!";
+ abort();
+ }
- std::map<char, std::string> html_map;
- html_map['\"'] = "&quot;";
- html_map['<'] = "&lt;";
- html_map['>'] = "&gt;";
- html_map['&'] = "&amp;";
+ const line_set<L>& lines = doc.text();
+ const paragraph_set<L>& parset = doc.paragraphs();
- file << "<?xml version=\"1.0\"?>" << std::endl;
- if (extended_format)
- {
- file << "<pcGts>" << std::endl;
- }
- else
- {
+ std::map<char, std::string> html_map;
+ html_map['\"'] = "&quot;";
+ html_map['<'] = "&lt;";
+ html_map['>'] = "&gt;";
+ html_map['&'] = "&amp;";
+
+ file << "<?xml version=\"1.0\"?>" << std::endl;
file << "<pcGts xmlns=\"http://schema.primaresearch.org/PAGE/gts/pagecontent/2009-03-16\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:schemaLocation=\"http://schema.primaresearch.org/PAGE/gts/pagecontent/2009-03-16 http://schema.primaresearch.org/PAGE/gts/pagecontent/2009-03-16/pagecontent.xsd\" pcGtsId=\"" << doc.filename() << "\">" << std::endl;
+
+ file << " <pcMetadata>" << std::endl;
+ file << " <pcCreator>LRDE</pcCreator>" << std::endl;
+ file << " <pcCreated/>" << std::endl;
+ file << " <pcLastChange/>" << std::endl;
+ file << " <pcComments>Generated by Scribo from Olena.</pcComments>" << std::endl;
+ file << " </pcMetadata>" << std::endl;
+
+ file << " <page image_filename=\"" << doc.filename()
+ << "\" image_width=\"" << lines.components().labeled_image().ncols()
+ << "\" image_height=\"" << lines.components().labeled_image().nrows()
+ << "\">" << std::endl;
+
+ // Text
+ if (doc.has_text())
+ {
+ for_all_paragraphs(p, parset)
+ {
+ const mln::util::array<line_id_t>& line_ids = parset(p).line_ids();
+
+ // FIXME: compute that information on the whole paragraph
+ // and use them here.
+ line_id_t fid = line_ids(0);
+ file << " <text_region id=\"" << p
+ << "\" txt_orientation=\"" << lines(fid).orientation()
+ << "\" txt_reading_orientation=\"" << lines(fid).reading_orientation()
+ << "\" txt_reading_direction=\"" << lines(fid).reading_direction()
+ << "\" txt_text_type=\"" << lines(fid).type()
+ << "\" txt_reverse_video=\"" << (lines(fid).reverse_video() ? "true" : "false")
+ << "\" txt_indented=\"" << (lines(fid).indented() ? "true" : "false")
+ << "\" kerning=\"" << lines(fid).char_space()
+ << "\">"
+ << std::endl;
+
+ internal::print_box_coords(file, parset(p).bbox(), " ");
+
+ file << " </text_region>" << std::endl;
+ }
+ }
+
+ // Page elements (Pictures, ...)
+ if (doc.has_elements())
+ {
+ const component_set<L>& elts = doc.elements();
+ for_all_comps(e, elts)
+ if (elts(e).is_valid())
+ {
+ file << " <image_region id=\"ir" << elts(e).id()
+ << "\" img_colour_type=\"24_Bit_Colour\""
+ << " img_orientation=\"0.000000\" "
+ << " img_emb_text=\"No\" "
+ << " img_bgcolour=\"White\">" << std::endl;
+
+ internal::print_box_coords(file, elts(e).bbox(), " ");
+
+ file << " </image_region>" << std::endl;
+ }
+ }
+
+
+ file << " </page>" << std::endl;
+ file << "</pcGts>" << std::endl;
+
+ trace::exiting("scribo::io::xml::save_text_lines");
}
- file << " <PcMetadata>" << std::endl;
- file << " <PcCreator>LRDE</PcCreator>" << std::endl;
- file << " <PcCreated/>" << std::endl;
- file << " <PcLastChange/>" << std::endl;
- file << " <PcComments>Generated by Scribo from Olena.</PcComments>" << std::endl;
- file << " </PcMetadata>" << std::endl;
- file << " <page image_filename=\"" << doc.filename()
- << "\" image_width=\"" << lines.components().labeled_image().ncols()
- << "\" image_height=\"" << lines.components().labeled_image().nrows()
- << "\">" << std::endl;
- // Text
- if (doc.has_text())
+
+ template <typename L>
+ void
+ save_extended(const document<L>& doc,
+ const std::string& output_name)
{
- for_all_lines(l, lines)
+ trace::entering("scribo::io::xml:save_text_lines");
+
+ std::ofstream file(output_name.c_str());
+ if (! file)
{
- if (! lines(l).is_valid()
- || lines(l).tag() != line::None
- || lines(l).type() != line::Text) // Is NOT a text line.
- continue;
+ std::cerr << "error: cannot open file '" << doc.filename() << "'!";
+ abort();
+ }
+
+ const line_set<L>& lines = doc.text();
+ const paragraph_set<L>& parset = doc.paragraphs();
+
+ std::map<char, std::string> html_map;
+ html_map['\"'] = "&quot;";
+ html_map['<'] = "&lt;";
+ html_map['>'] = "&gt;";
+ html_map['&'] = "&amp;";
+
+ file << "<?xml version=\"1.0\"?>" << std::endl;
+ file << "<pcGts>" << std::endl;
+
+ file << " <pcMetadata>" << std::endl;
+ file << " <pcCreator>LRDE</pcCreator>" << std::endl;
+ file << " <pcCreated/>" << std::endl;
+ file << " <pcLastChange/>" << std::endl;
+ file << " <pcComments>Generated by Scribo from Olena.</pcComments>" << std::endl;
+ file << " </pcMetadata>" << std::endl;
+
+ file << " <page image_filename=\"" << doc.filename()
+ << "\" image_width=\"" << lines.components().labeled_image().ncols()
+ << "\" image_height=\"" << lines.components().labeled_image().nrows()
+ << "\">" << std::endl;
+
+ // Text
+ if (doc.has_text())
+ {
+ for_all_paragraphs(p, parset)
{
- file << " <text_region id=\"" << lines(l).id()
- << "\" txt_orientation=\"" << lines(l).orientation()
- << "\" txt_reading_orientation=\"" << lines(l).reading_orientation()
- << "\" txt_reading_direction=\"" << lines(l).reading_direction()
- << "\" txt_text_type=\"" << lines(l).type()
- << "\" txt_reverse_video=\"" << (lines(l).reverse_video() ? "true" : "false")
- << "\" txt_indented=\"" << (lines(l).indented() ? "true" : "false")
- << "\" kerning=\"" << lines(l).char_space();
+ const mln::util::array<line_id_t>& line_ids = parset(p).line_ids();
+
+ // FIXME: compute that information on the whole paragraph
+ // and use them here.
+ line_id_t fid = line_ids(0);
+ file << " <text_region id=\"" << p
+ << "\" txt_orientation=\"" << lines(fid).orientation()
+ << "\" txt_reading_orientation=\"" << lines(fid).reading_orientation()
+ << "\" txt_reading_direction=\"" << lines(fid).reading_direction()
+ << "\" txt_text_type=\"" << lines(fid).type()
+ << "\" txt_reverse_video=\"" << (lines(fid).reverse_video() ? "true" : "false")
+ << "\" txt_indented=\"" << (lines(fid).indented() ? "true" : "false")
+ << "\" kerning=\"" << lines(fid).char_space();
// EXTENSIONS - Not officially supported
- if (extended_format)
- {
- file << "\" baseline=\"" << lines(l).baseline()
- << "\" meanline=\"" << lines(l).meanline()
- << "\" x_height=\"" << lines(l).x_height()
- << "\" d_height=\"" << lines(l).d_height()
- << "\" a_height=\"" << lines(l).a_height()
- << "\" char_width=\"" << lines(l).char_width();
- }
+ file << "\" baseline=\"" << lines(fid).baseline()
+ << "\" meanline=\"" << lines(fid).meanline()
+ << "\" x_height=\"" << lines(fid).x_height()
+ << "\" d_height=\"" << lines(fid).d_height()
+ << "\" a_height=\"" << lines(fid).a_height()
+ << "\" char_width=\"" << lines(fid).char_width();
// End of EXTENSIONS
file << "\">"
<< std::endl;
- internal::print_box_coords(file, lines(l).bbox(), " ");
+ internal::print_box_coords(file, parset(p).bbox(), " ");
- if (extended_format)
- {
- file << " <paragraph>" << std::endl;
- internal::print_box_coords(file, lines(l).bbox(), " ");
+ // EXTENSIONS - Not officially supported
+ for_all_paragraph_lines(lid, line_ids)
+ {
+ line_id_t l = line_ids(lid);
if (lines(l).has_text())
{
std::string tmp = lines(l).text();
tmp = internal::html_markups_replace(tmp, html_map);
- file << " <line text=\""
- << tmp
- << "\">" << std::endl;
+ file << " <line text=\"" << tmp << "\" ";
}
else
- file << " <line>" << std::endl;
+ file << " <line " << std::endl;
+
+ file << "id=\"" << lines(l).id()
+ << "\" txt_orientation=\"" << lines(l).orientation()
+ << "\" txt_reading_orientation=\"" << lines(l).reading_orientation()
+ << "\" txt_reading_direction=\"" << lines(l).reading_direction()
+ << "\" txt_text_type=\"" << lines(l).type()
+ << "\" txt_reverse_video=\"" << (lines(l).reverse_video() ? "true" : "false")
+ << "\" txt_indented=\"" << (lines(l).indented() ? "true" : "false")
+ << "\" kerning=\"" << lines(l).char_space()
+ << "\" baseline=\"" << lines(l).baseline()
+ << "\" meanline=\"" << lines(l).meanline()
+ << "\" x_height=\"" << lines(l).x_height()
+ << "\" d_height=\"" << lines(l).d_height()
+ << "\" a_height=\"" << lines(l).a_height()
+ << "\" char_width=\"" << lines(l).char_width()
+ << "\">" << std::endl;
internal::print_box_coords(file, lines(l).bbox(), " ");
file << " </line>" << std::endl;
-
- file << " </paragraph>" << std::endl;
}
file << " </text_region>" << std::endl;
}
}
- }
+ // End of EXTENSIONS
- // Page elements (Pictures, ...)
- if (doc.has_elements())
- {
- const component_set<L>& elts = doc.elements();
- for_all_comps(e, elts)
- if (elts(e).is_valid())
- {
- file << " <image_region id=\"ir" << elts(e).id()
- << "\" img_colour_type=\"24_Bit_Colour\""
- << " img_orientation=\"0.000000\" "
- << " img_emb_text=\"No\" "
- << " img_bgcolour=\"White\">" << std::endl;
+ // Page elements (Pictures, ...)
+ if (doc.has_elements())
+ {
+ const component_set<L>& elts = doc.elements();
+ for_all_comps(e, elts)
+ if (elts(e).is_valid())
+ {
+ switch (elts(e).type())
+ {
+ case component::Separator:
+ {
+ file << " <separator_region id=\"sr" << elts(e).id()
+ << "\" sep_orientation=\"0.000000\" "
+ << " sep_colour=\"Black\" "
+ << " sep_bgcolour=\"White\">" << std::endl;
+
+ internal::print_box_coords(file, elts(e).bbox(), " ");
+
+ file << " </separator_region>" << std::endl;
+ break;
+ break;
+ }
+
+ default:
+ case component::Image:
+ {
+ file << " <image_region id=\"ir" << elts(e).id()
+ << "\" img_colour_type=\"24_Bit_Colour\""
+ << " img_orientation=\"0.000000\" "
+ << " img_emb_text=\"No\" "
+ << " img_bgcolour=\"White\">" << std::endl;
+
+ internal::print_box_coords(file, elts(e).bbox(), " ");
+
+ file << " </image_region>" << std::endl;
+ break;
+ }
+ }
+ }
+ }
- internal::print_box_coords(file, elts(e).bbox(), " ");
- file << " </image_region>" << std::endl;
- }
+ file << " </page>" << std::endl;
+ file << "</pcGts>" << std::endl;
+
+ trace::exiting("scribo::io::xml::save_text_lines");
}
+ } // end of namespace scribo::io::xml::internal
- file << " </page>" << std::endl;
- file << "</pcGts>" << std::endl;
- trace::exiting("scribo::io::xml::save_text_lines");
+ // FACADE
+
+ template <typename L>
+ void
+ save(const document<L>& doc,
+ const std::string& output_name,
+ bool allow_extensions)
+ {
+ if (allow_extensions)
+ internal::save_extended(doc, output_name);
+ else
+ internal::save(doc, output_name);
}
--
1.5.6.5
1
0
last-svn-commit-740-g6e9c307 Identify separators among non-text components.
by Guillaume Lazzara 25 Jan '11
by Guillaume Lazzara 25 Jan '11
25 Jan '11
* scribo/core/tag/component.hh: New Separator type.
* scribo/primitive/identify.hh: identify non-text components
type according criterions.
---
scribo/ChangeLog | 9 +++
scribo/scribo/core/tag/component.hh | 6 ++-
.../{link/internal/dmax_default.hh => identify.hh} | 58 +++++++++-----------
3 files changed, 40 insertions(+), 33 deletions(-)
copy scribo/scribo/primitive/{link/internal/dmax_default.hh => identify.hh} (60%)
diff --git a/scribo/ChangeLog b/scribo/ChangeLog
index 37d0ea8..2ba7d58 100644
--- a/scribo/ChangeLog
+++ b/scribo/ChangeLog
@@ -1,5 +1,14 @@
2011-01-25 Guillaume Lazzara <z(a)lrde.epita.fr>
+ Identify separators among non-text components.
+
+ * scribo/core/tag/component.hh: New Separator type.
+
+ * scribo/primitive/identify.hh: identify non-text components
+ type according criterions.
+
+2011-01-25 Guillaume Lazzara <z(a)lrde.epita.fr>
+
Handle paragraphs correctly in scribo-viewer.
* demo/viewer/common.hh,
diff --git a/scribo/scribo/core/tag/component.hh b/scribo/scribo/core/tag/component.hh
index f2fb059..f773932 100644
--- a/scribo/scribo/core/tag/component.hh
+++ b/scribo/scribo/core/tag/component.hh
@@ -57,7 +57,8 @@ namespace scribo
Character,
Separator,
Noise,
- Punctuation
+ Punctuation,
+ Image
};
# ifndef MLN_INCLUDE_ONLY
@@ -106,6 +107,9 @@ namespace scribo
case Punctuation:
str = "Punctuation";
break;
+ case Image:
+ str = "Image";
+ break;
}
return ostr << str;
diff --git a/scribo/scribo/primitive/link/internal/dmax_default.hh b/scribo/scribo/primitive/identify.hh
similarity index 60%
copy from scribo/scribo/primitive/link/internal/dmax_default.hh
copy to scribo/scribo/primitive/identify.hh
index b4106a9..7d0c3e6 100644
--- a/scribo/scribo/primitive/link/internal/dmax_default.hh
+++ b/scribo/scribo/primitive/identify.hh
@@ -1,4 +1,4 @@
-// Copyright (C) 2010 EPITA Research and Development Laboratory (LRDE)
+// Copyright (C) 2011 EPITA Research and Development Laboratory (LRDE)
//
// This file is part of Olena.
//
@@ -23,15 +23,13 @@
// exception does not however invalidate any other reasons why the
// executable file might be covered by the GNU General Public License.
-#ifndef SCRIBO_PRIMITIVE_LINK_INTERNAL_DMAX_DEFAULT_HH
-# define SCRIBO_PRIMITIVE_LINK_INTERNAL_DMAX_DEFAULT_HH
+#ifndef SCRIBO_PRIMITIVE_IDENTIFY_HH
+# define SCRIBO_PRIMITIVE_IDENTIFY_HH
-/// \file
-///
-/// Default class for dmax functors.
-
-# include <scribo/primitive/link/internal/dmax_functor_base.hh>
+/*! \brief try to determine the type of a component.
+\fixme Add support for more component type (graphic, images, ...)
+ */
namespace scribo
{
@@ -39,46 +37,42 @@ namespace scribo
namespace primitive
{
- namespace link
+ template <typename L>
+ component_set<L>
+ identify(const component_set<L> comps)
{
+ trace::entering("scribo::primitive::identify");
- namespace internal
- {
+ mln_assertion(comps.is_valid());
+ component_set<L> output = comps.duplicate();
- /// \brief Base class for dmax functors.
- class dmax_default : public dmax_functor_base<dmax_default>
+ for_all_comps(c, comps)
+ if (comps(c).is_valid())
{
- typedef dmax_functor_base<dmax_default> super_;
+ float
+ min = comps(c).bbox().height(),
+ max = comps(c).bbox().width();
- public:
- dmax_default(const float& dmax_factor);
+ if (comps(c).bbox().width() < comps(c).bbox().height())
+ std::swap(min, max);
- protected:
- using super_::dmax_factor_;
- };
+ if (max/min > 10)
+ output(c).update_type(component::Separator);
+ }
+ trace::exiting("scribo::primitive::identify");
+ return output;
+ }
# ifndef MLN_INCLUDE_ONLY
- inline
- dmax_default::dmax_default(const float& dmax_factor)
- : super_(dmax_factor)
- {
- }
-
# endif // ! MLN_INCLUDE_ONLY
-
- } // end of namespace scribo::primitive::link::internal
-
- } // end of namespace scribo::primitive::link
-
} // end of namespace scribo::primitive
} // end of namespace scribo
-
-#endif // ! SCRIBO_PRIMITIVE_LINK_INTERNAL_DMAX_DEFAULT_HH
+#endif // ! SCRIBO_PRIMITIVE_IDENTIFY_HH
--
1.5.6.5
1
0