---
scribo/ChangeLog | 5 +
scribo/scribo/io/xml/save.hh | 335 +++++++++++++++++++++++++++++-------------
2 files changed, 241 insertions(+), 99 deletions(-)
diff --git a/scribo/ChangeLog b/scribo/ChangeLog
index 2ba7d58..cfd40df 100644
--- a/scribo/ChangeLog
+++ b/scribo/ChangeLog
@@ -1,5 +1,10 @@
2011-01-25 Guillaume Lazzara <z(a)lrde.epita.fr>
+ * scribo/io/xml/save.hh: Handle paragraphs and separators correctly
+ in XML output.
+
+2011-01-25 Guillaume Lazzara <z(a)lrde.epita.fr>
+
Identify separators among non-text components.
* scribo/core/tag/component.hh: New Separator type.
diff --git a/scribo/scribo/io/xml/save.hh b/scribo/scribo/io/xml/save.hh
index d0c72e9..1bcdd6f 100644
--- a/scribo/scribo/io/xml/save.hh
+++ b/scribo/scribo/io/xml/save.hh
@@ -1,4 +1,5 @@
-// Copyright (C) 2010 EPITA Research and Development Laboratory (LRDE)
+// Copyright (C) 2010, 2011 EPITA Research and Development Laboratory
+// (LRDE)
//
// This file is part of Olena.
//
@@ -30,6 +31,7 @@
///
/// \brief Save document information as XML.
+# include <libgen.h>
# include <fstream>
# include <sstream>
@@ -58,12 +60,12 @@ namespace scribo
Its XSD file is located here:
http://schema.primaresearch.org/PAGE/gts/pagecontent/2009-03-16/pagecontent…
- */
+ */
template <typename L>
void
save(const document<L>& doc,
const std::string& output_name,
- bool extended_format);
+ bool allow_extensions);
# ifndef MLN_INCLUDE_ONLY
@@ -77,14 +79,14 @@ namespace scribo
std::map<char, std::string>& map)
{
for (unsigned i = 0; i < input.size(); ++i)
+ {
+ std::map<char, std::string>::iterator it = map.find(input.at(i));
+ if (it != map.end())
{
- std::map<char, std::string>::iterator it = map.find(input.at(i));
- if (it != map.end())
- {
- input.replace(i, 1, it->second);
- i += it->second.size() - 1;
- }
+ input.replace(i, 1, it->second);
+ i += it->second.size() - 1;
}
+ }
return input;
}
@@ -112,142 +114,277 @@ namespace scribo
}
- } // end of namespace scribo::io::xml::internal
- template <typename L>
- void
- save(const document<L>& doc,
- const std::string& output_name,
- bool extended_format)
- {
- trace::entering("scribo::io::xml:save_text_lines");
- std::ofstream file(output_name.c_str());
- if (! file)
+ template <typename L>
+ void
+ save(const document<L>& doc,
+ const std::string& output_name)
{
- std::cerr << "error: cannot open file '" << doc.filename()
<< "'!";
- abort();
- }
+ trace::entering("scribo::io::xml:save_text_lines");
- const line_set<L>& lines = doc.text();
+ std::ofstream file(output_name.c_str());
+ if (! file)
+ {
+ std::cerr << "error: cannot open file '" << doc.filename()
<< "'!";
+ abort();
+ }
- std::map<char, std::string> html_map;
- html_map['\"'] = "&quot;";
- html_map['<'] = "&lt;";
- html_map['>'] = "&gt;";
- html_map['&'] = "&amp;";
+ const line_set<L>& lines = doc.text();
+ const paragraph_set<L>& parset = doc.paragraphs();
- file << "<?xml version=\"1.0\"?>" << std::endl;
- if (extended_format)
- {
- file << "<pcGts>" << std::endl;
- }
- else
- {
+ std::map<char, std::string> html_map;
+ html_map['\"'] = "&quot;";
+ html_map['<'] = "&lt;";
+ html_map['>'] = "&gt;";
+ html_map['&'] = "&amp;";
+
+ file << "<?xml version=\"1.0\"?>" << std::endl;
file << "<pcGts
xmlns=\"http://schema.primaresearch.org/PAGE/gts/pagecontent/2009-03-1…
xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"
xsi:schemaLocation=\"http://schema.primaresearch.org/PAGE/gts/pagecont…
http://schema.primaresearch.org/PAGE/gts/pagecontent/2009-03-16/pagecontent…
pcGtsId=\"" << doc.filename() << "\">" <<
std::endl;
+
+ file << " <pcMetadata>" << std::endl;
+ file << " <pcCreator>LRDE</pcCreator>" <<
std::endl;
+ file << " <pcCreated/>" << std::endl;
+ file << " <pcLastChange/>" << std::endl;
+ file << " <pcComments>Generated by Scribo from
Olena.</pcComments>" << std::endl;
+ file << " </pcMetadata>" << std::endl;
+
+ file << " <page image_filename=\"" << doc.filename()
+ << "\" image_width=\"" <<
lines.components().labeled_image().ncols()
+ << "\" image_height=\"" <<
lines.components().labeled_image().nrows()
+ << "\">" << std::endl;
+
+ // Text
+ if (doc.has_text())
+ {
+ for_all_paragraphs(p, parset)
+ {
+ const mln::util::array<line_id_t>& line_ids = parset(p).line_ids();
+
+ // FIXME: compute that information on the whole paragraph
+ // and use them here.
+ line_id_t fid = line_ids(0);
+ file << " <text_region id=\"" << p
+ << "\" txt_orientation=\"" <<
lines(fid).orientation()
+ << "\" txt_reading_orientation=\"" <<
lines(fid).reading_orientation()
+ << "\" txt_reading_direction=\"" <<
lines(fid).reading_direction()
+ << "\" txt_text_type=\"" << lines(fid).type()
+ << "\" txt_reverse_video=\"" <<
(lines(fid).reverse_video() ? "true" : "false")
+ << "\" txt_indented=\"" << (lines(fid).indented() ?
"true" : "false")
+ << "\" kerning=\"" << lines(fid).char_space()
+ << "\">"
+ << std::endl;
+
+ internal::print_box_coords(file, parset(p).bbox(), " ");
+
+ file << " </text_region>" << std::endl;
+ }
+ }
+
+ // Page elements (Pictures, ...)
+ if (doc.has_elements())
+ {
+ const component_set<L>& elts = doc.elements();
+ for_all_comps(e, elts)
+ if (elts(e).is_valid())
+ {
+ file << " <image_region id=\"ir" << elts(e).id()
+ << "\" img_colour_type=\"24_Bit_Colour\""
+ << " img_orientation=\"0.000000\" "
+ << " img_emb_text=\"No\" "
+ << " img_bgcolour=\"White\">" << std::endl;
+
+ internal::print_box_coords(file, elts(e).bbox(), " ");
+
+ file << " </image_region>" << std::endl;
+ }
+ }
+
+
+ file << " </page>" << std::endl;
+ file << "</pcGts>" << std::endl;
+
+ trace::exiting("scribo::io::xml::save_text_lines");
}
- file << " <PcMetadata>" << std::endl;
- file << " <PcCreator>LRDE</PcCreator>" <<
std::endl;
- file << " <PcCreated/>" << std::endl;
- file << " <PcLastChange/>" << std::endl;
- file << " <PcComments>Generated by Scribo from
Olena.</PcComments>" << std::endl;
- file << " </PcMetadata>" << std::endl;
- file << " <page image_filename=\"" << doc.filename()
- << "\" image_width=\"" <<
lines.components().labeled_image().ncols()
- << "\" image_height=\"" <<
lines.components().labeled_image().nrows()
- << "\">" << std::endl;
- // Text
- if (doc.has_text())
+
+ template <typename L>
+ void
+ save_extended(const document<L>& doc,
+ const std::string& output_name)
{
- for_all_lines(l, lines)
+ trace::entering("scribo::io::xml:save_text_lines");
+
+ std::ofstream file(output_name.c_str());
+ if (! file)
{
- if (! lines(l).is_valid()
- || lines(l).tag() != line::None
- || lines(l).type() != line::Text) // Is NOT a text line.
- continue;
+ std::cerr << "error: cannot open file '" << doc.filename()
<< "'!";
+ abort();
+ }
+
+ const line_set<L>& lines = doc.text();
+ const paragraph_set<L>& parset = doc.paragraphs();
+
+ std::map<char, std::string> html_map;
+ html_map['\"'] = "&quot;";
+ html_map['<'] = "&lt;";
+ html_map['>'] = "&gt;";
+ html_map['&'] = "&amp;";
+
+ file << "<?xml version=\"1.0\"?>" << std::endl;
+ file << "<pcGts>" << std::endl;
+
+ file << " <pcMetadata>" << std::endl;
+ file << " <pcCreator>LRDE</pcCreator>" <<
std::endl;
+ file << " <pcCreated/>" << std::endl;
+ file << " <pcLastChange/>" << std::endl;
+ file << " <pcComments>Generated by Scribo from
Olena.</pcComments>" << std::endl;
+ file << " </pcMetadata>" << std::endl;
+
+ file << " <page image_filename=\"" << doc.filename()
+ << "\" image_width=\"" <<
lines.components().labeled_image().ncols()
+ << "\" image_height=\"" <<
lines.components().labeled_image().nrows()
+ << "\">" << std::endl;
+
+ // Text
+ if (doc.has_text())
+ {
+ for_all_paragraphs(p, parset)
{
- file << " <text_region id=\"" << lines(l).id()
- << "\" txt_orientation=\"" <<
lines(l).orientation()
- << "\" txt_reading_orientation=\"" <<
lines(l).reading_orientation()
- << "\" txt_reading_direction=\"" <<
lines(l).reading_direction()
- << "\" txt_text_type=\"" << lines(l).type()
- << "\" txt_reverse_video=\"" <<
(lines(l).reverse_video() ? "true" : "false")
- << "\" txt_indented=\"" << (lines(l).indented() ?
"true" : "false")
- << "\" kerning=\"" << lines(l).char_space();
+ const mln::util::array<line_id_t>& line_ids = parset(p).line_ids();
+
+ // FIXME: compute that information on the whole paragraph
+ // and use them here.
+ line_id_t fid = line_ids(0);
+ file << " <text_region id=\"" << p
+ << "\" txt_orientation=\"" <<
lines(fid).orientation()
+ << "\" txt_reading_orientation=\"" <<
lines(fid).reading_orientation()
+ << "\" txt_reading_direction=\"" <<
lines(fid).reading_direction()
+ << "\" txt_text_type=\"" << lines(fid).type()
+ << "\" txt_reverse_video=\"" <<
(lines(fid).reverse_video() ? "true" : "false")
+ << "\" txt_indented=\"" << (lines(fid).indented() ?
"true" : "false")
+ << "\" kerning=\"" << lines(fid).char_space();
// EXTENSIONS - Not officially supported
- if (extended_format)
- {
- file << "\" baseline=\"" << lines(l).baseline()
- << "\" meanline=\"" << lines(l).meanline()
- << "\" x_height=\"" << lines(l).x_height()
- << "\" d_height=\"" << lines(l).d_height()
- << "\" a_height=\"" << lines(l).a_height()
- << "\" char_width=\"" << lines(l).char_width();
- }
+ file << "\" baseline=\"" << lines(fid).baseline()
+ << "\" meanline=\"" << lines(fid).meanline()
+ << "\" x_height=\"" << lines(fid).x_height()
+ << "\" d_height=\"" << lines(fid).d_height()
+ << "\" a_height=\"" << lines(fid).a_height()
+ << "\" char_width=\"" << lines(fid).char_width();
// End of EXTENSIONS
file << "\">"
<< std::endl;
- internal::print_box_coords(file, lines(l).bbox(), " ");
+ internal::print_box_coords(file, parset(p).bbox(), " ");
- if (extended_format)
- {
- file << " <paragraph>" << std::endl;
- internal::print_box_coords(file, lines(l).bbox(), " ");
+ // EXTENSIONS - Not officially supported
+ for_all_paragraph_lines(lid, line_ids)
+ {
+ line_id_t l = line_ids(lid);
if (lines(l).has_text())
{
std::string tmp = lines(l).text();
tmp = internal::html_markups_replace(tmp, html_map);
- file << " <line text=\""
- << tmp
- << "\">" << std::endl;
+ file << " <line text=\"" << tmp <<
"\" ";
}
else
- file << " <line>" << std::endl;
+ file << " <line " << std::endl;
+
+ file << "id=\"" << lines(l).id()
+ << "\" txt_orientation=\"" <<
lines(l).orientation()
+ << "\" txt_reading_orientation=\"" <<
lines(l).reading_orientation()
+ << "\" txt_reading_direction=\"" <<
lines(l).reading_direction()
+ << "\" txt_text_type=\"" << lines(l).type()
+ << "\" txt_reverse_video=\"" <<
(lines(l).reverse_video() ? "true" : "false")
+ << "\" txt_indented=\"" << (lines(l).indented() ?
"true" : "false")
+ << "\" kerning=\"" << lines(l).char_space()
+ << "\" baseline=\"" << lines(l).baseline()
+ << "\" meanline=\"" << lines(l).meanline()
+ << "\" x_height=\"" << lines(l).x_height()
+ << "\" d_height=\"" << lines(l).d_height()
+ << "\" a_height=\"" << lines(l).a_height()
+ << "\" char_width=\"" << lines(l).char_width()
+ << "\">" << std::endl;
internal::print_box_coords(file, lines(l).bbox(), " ");
file << " </line>" << std::endl;
-
- file << " </paragraph>" << std::endl;
}
file << " </text_region>" << std::endl;
}
}
- }
+ // End of EXTENSIONS
- // Page elements (Pictures, ...)
- if (doc.has_elements())
- {
- const component_set<L>& elts = doc.elements();
- for_all_comps(e, elts)
- if (elts(e).is_valid())
- {
- file << " <image_region id=\"ir" << elts(e).id()
- << "\" img_colour_type=\"24_Bit_Colour\""
- << " img_orientation=\"0.000000\" "
- << " img_emb_text=\"No\" "
- << " img_bgcolour=\"White\">" << std::endl;
+ // Page elements (Pictures, ...)
+ if (doc.has_elements())
+ {
+ const component_set<L>& elts = doc.elements();
+ for_all_comps(e, elts)
+ if (elts(e).is_valid())
+ {
+ switch (elts(e).type())
+ {
+ case component::Separator:
+ {
+ file << " <separator_region id=\"sr" <<
elts(e).id()
+ << "\" sep_orientation=\"0.000000\" "
+ << " sep_colour=\"Black\" "
+ << " sep_bgcolour=\"White\">" << std::endl;
+
+ internal::print_box_coords(file, elts(e).bbox(), " ");
+
+ file << " </separator_region>" << std::endl;
+ break;
+ break;
+ }
+
+ default:
+ case component::Image:
+ {
+ file << " <image_region id=\"ir" << elts(e).id()
+ << "\" img_colour_type=\"24_Bit_Colour\""
+ << " img_orientation=\"0.000000\" "
+ << " img_emb_text=\"No\" "
+ << " img_bgcolour=\"White\">" << std::endl;
+
+ internal::print_box_coords(file, elts(e).bbox(), " ");
+
+ file << " </image_region>" << std::endl;
+ break;
+ }
+ }
+ }
+ }
- internal::print_box_coords(file, elts(e).bbox(), " ");
- file << " </image_region>" << std::endl;
- }
+ file << " </page>" << std::endl;
+ file << "</pcGts>" << std::endl;
+
+ trace::exiting("scribo::io::xml::save_text_lines");
}
+ } // end of namespace scribo::io::xml::internal
- file << " </page>" << std::endl;
- file << "</pcGts>" << std::endl;
- trace::exiting("scribo::io::xml::save_text_lines");
+ // FACADE
+
+ template <typename L>
+ void
+ save(const document<L>& doc,
+ const std::string& output_name,
+ bool allow_extensions)
+ {
+ if (allow_extensions)
+ internal::save_extended(doc, output_name);
+ else
+ internal::save(doc, output_name);
}
--
1.5.6.5