last-svn-commit-741-g6ef044f scribo/io/xml/save.hh: Handle paragraphs and separators correctly in XML output.

--- scribo/ChangeLog | 5 + scribo/scribo/io/xml/save.hh | 335 +++++++++++++++++++++++++++++------------- 2 files changed, 241 insertions(+), 99 deletions(-) diff --git a/scribo/ChangeLog b/scribo/ChangeLog index 2ba7d58..cfd40df 100644 --- a/scribo/ChangeLog +++ b/scribo/ChangeLog @@ -1,5 +1,10 @@ 2011-01-25 Guillaume Lazzara <z@lrde.epita.fr> + * scribo/io/xml/save.hh: Handle paragraphs and separators correctly + in XML output. + +2011-01-25 Guillaume Lazzara <z@lrde.epita.fr> + Identify separators among non-text components. * scribo/core/tag/component.hh: New Separator type. diff --git a/scribo/scribo/io/xml/save.hh b/scribo/scribo/io/xml/save.hh index d0c72e9..1bcdd6f 100644 --- a/scribo/scribo/io/xml/save.hh +++ b/scribo/scribo/io/xml/save.hh @@ -1,4 +1,5 @@ -// Copyright (C) 2010 EPITA Research and Development Laboratory (LRDE) +// Copyright (C) 2010, 2011 EPITA Research and Development Laboratory +// (LRDE) // // This file is part of Olena. // @@ -30,6 +31,7 @@ /// /// \brief Save document information as XML. +# include <libgen.h> # include <fstream> # include <sstream> @@ -58,12 +60,12 @@ namespace scribo Its XSD file is located here: http://schema.primaresearch.org/PAGE/gts/pagecontent/2009-03-16/pagecontent.... 
- */ + */ template <typename L> void save(const document<L>& doc, const std::string& output_name, - bool extended_format); + bool allow_extensions); # ifndef MLN_INCLUDE_ONLY @@ -77,14 +79,14 @@ namespace scribo std::map<char, std::string>& map) { for (unsigned i = 0; i < input.size(); ++i) + { + std::map<char, std::string>::iterator it = map.find(input.at(i)); + if (it != map.end()) { - std::map<char, std::string>::iterator it = map.find(input.at(i)); - if (it != map.end()) - { - input.replace(i, 1, it->second); - i += it->second.size() - 1; - } + input.replace(i, 1, it->second); + i += it->second.size() - 1; } + } return input; } @@ -112,142 +114,277 @@ namespace scribo } - } // end of namespace scribo::io::xml::internal - template <typename L> - void - save(const document<L>& doc, - const std::string& output_name, - bool extended_format) - { - trace::entering("scribo::io::xml:save_text_lines"); - std::ofstream file(output_name.c_str()); - if (! file) + template <typename L> + void + save(const document<L>& doc, + const std::string& output_name) { - std::cerr << "error: cannot open file '" << doc.filename() << "'!"; - abort(); - } + trace::entering("scribo::io::xml:save_text_lines"); - const line_set<L>& lines = doc.text(); + std::ofstream file(output_name.c_str()); + if (! 
file) + { + std::cerr << "error: cannot open file '" << doc.filename() << "'!"; + abort(); + } - std::map<char, std::string> html_map; - html_map['\"'] = "&quot;"; - html_map['<'] = "&lt;"; - html_map['>'] = "&gt;"; - html_map['&'] = "&amp;"; + const line_set<L>& lines = doc.text(); + const paragraph_set<L>& parset = doc.paragraphs(); - file << "<?xml version=\"1.0\"?>" << std::endl; - if (extended_format) - { - file << "<pcGts>" << std::endl; - } - else - { + std::map<char, std::string> html_map; + html_map['\"'] = "&quot;"; + html_map['<'] = "&lt;"; + html_map['>'] = "&gt;"; + html_map['&'] = "&amp;"; + + file << "<?xml version=\"1.0\"?>" << std::endl; file << "<pcGts xmlns=\"http://schema.primaresearch.org/PAGE/gts/pagecontent/2009-03-16\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:schemaLocation=\"http://schema.primaresearch.org/PAGE/gts/pagecontent/2009-03-16 http://schema.primaresearch.org/PAGE/gts/pagecontent/2009-03-16/pagecontent.xsd\" pcGtsId=\"" << doc.filename() << "\">" << std::endl; + + file << " <pcMetadata>" << std::endl; + file << " <pcCreator>LRDE</pcCreator>" << std::endl; + file << " <pcCreated/>" << std::endl; + file << " <pcLastChange/>" << std::endl; + file << " <pcComments>Generated by Scribo from Olena.</pcComments>" << std::endl; + file << " </pcMetadata>" << std::endl; + + file << " <page image_filename=\"" << doc.filename() + << "\" image_width=\"" << lines.components().labeled_image().ncols() + << "\" image_height=\"" << lines.components().labeled_image().nrows() + << "\">" << std::endl; + + // Text + if (doc.has_text()) + { + for_all_paragraphs(p, parset) + { + const mln::util::array<line_id_t>& line_ids = parset(p).line_ids(); + + // FIXME: compute that information on the whole paragraph + // and use them here. 
+ line_id_t fid = line_ids(0); + file << " <text_region id=\"" << p + << "\" txt_orientation=\"" << lines(fid).orientation() + << "\" txt_reading_orientation=\"" << lines(fid).reading_orientation() + << "\" txt_reading_direction=\"" << lines(fid).reading_direction() + << "\" txt_text_type=\"" << lines(fid).type() + << "\" txt_reverse_video=\"" << (lines(fid).reverse_video() ? "true" : "false") + << "\" txt_indented=\"" << (lines(fid).indented() ? "true" : "false") + << "\" kerning=\"" << lines(fid).char_space() + << "\">" + << std::endl; + + internal::print_box_coords(file, parset(p).bbox(), " "); + + file << " </text_region>" << std::endl; + } + } + + // Page elements (Pictures, ...) + if (doc.has_elements()) + { + const component_set<L>& elts = doc.elements(); + for_all_comps(e, elts) + if (elts(e).is_valid()) + { + file << " <image_region id=\"ir" << elts(e).id() + << "\" img_colour_type=\"24_Bit_Colour\"" + << " img_orientation=\"0.000000\" " + << " img_emb_text=\"No\" " + << " img_bgcolour=\"White\">" << std::endl; + + internal::print_box_coords(file, elts(e).bbox(), " "); + + file << " </image_region>" << std::endl; + } + } + + + file << " </page>" << std::endl; + file << "</pcGts>" << std::endl; + + trace::exiting("scribo::io::xml::save_text_lines"); } - file << " <PcMetadata>" << std::endl; - file << " <PcCreator>LRDE</PcCreator>" << std::endl; - file << " <PcCreated/>" << std::endl; - file << " <PcLastChange/>" << std::endl; - file << " <PcComments>Generated by Scribo from Olena.</PcComments>" << std::endl; - file << " </PcMetadata>" << std::endl; - file << " <page image_filename=\"" << doc.filename() - << "\" image_width=\"" << lines.components().labeled_image().ncols() - << "\" image_height=\"" << lines.components().labeled_image().nrows() - << "\">" << std::endl; - // Text - if (doc.has_text()) + + template <typename L> + void + save_extended(const document<L>& doc, + const std::string& output_name) { - for_all_lines(l, lines) + 
trace::entering("scribo::io::xml:save_text_lines"); + + std::ofstream file(output_name.c_str()); + if (! file) { - if (! lines(l).is_valid() - || lines(l).tag() != line::None - || lines(l).type() != line::Text) // Is NOT a text line. - continue; + std::cerr << "error: cannot open file '" << doc.filename() << "'!"; + abort(); + } + + const line_set<L>& lines = doc.text(); + const paragraph_set<L>& parset = doc.paragraphs(); + + std::map<char, std::string> html_map; + html_map['\"'] = "&quot;"; + html_map['<'] = "&lt;"; + html_map['>'] = "&gt;"; + html_map['&'] = "&amp;"; + + file << "<?xml version=\"1.0\"?>" << std::endl; + file << "<pcGts>" << std::endl; + + file << " <pcMetadata>" << std::endl; + file << " <pcCreator>LRDE</pcCreator>" << std::endl; + file << " <pcCreated/>" << std::endl; + file << " <pcLastChange/>" << std::endl; + file << " <pcComments>Generated by Scribo from Olena.</pcComments>" << std::endl; + file << " </pcMetadata>" << std::endl; + + file << " <page image_filename=\"" << doc.filename() + << "\" image_width=\"" << lines.components().labeled_image().ncols() + << "\" image_height=\"" << lines.components().labeled_image().nrows() + << "\">" << std::endl; + + // Text + if (doc.has_text()) + { + for_all_paragraphs(p, parset) { - file << " <text_region id=\"" << lines(l).id() - << "\" txt_orientation=\"" << lines(l).orientation() - << "\" txt_reading_orientation=\"" << lines(l).reading_orientation() - << "\" txt_reading_direction=\"" << lines(l).reading_direction() - << "\" txt_text_type=\"" << lines(l).type() - << "\" txt_reverse_video=\"" << (lines(l).reverse_video() ? "true" : "false") - << "\" txt_indented=\"" << (lines(l).indented() ? "true" : "false") - << "\" kerning=\"" << lines(l).char_space(); + const mln::util::array<line_id_t>& line_ids = parset(p).line_ids(); + + // FIXME: compute that information on the whole paragraph + // and use them here. 
+ line_id_t fid = line_ids(0); + file << " <text_region id=\"" << p + << "\" txt_orientation=\"" << lines(fid).orientation() + << "\" txt_reading_orientation=\"" << lines(fid).reading_orientation() + << "\" txt_reading_direction=\"" << lines(fid).reading_direction() + << "\" txt_text_type=\"" << lines(fid).type() + << "\" txt_reverse_video=\"" << (lines(fid).reverse_video() ? "true" : "false") + << "\" txt_indented=\"" << (lines(fid).indented() ? "true" : "false") + << "\" kerning=\"" << lines(fid).char_space(); // EXTENSIONS - Not officially supported - if (extended_format) - { - file << "\" baseline=\"" << lines(l).baseline() - << "\" meanline=\"" << lines(l).meanline() - << "\" x_height=\"" << lines(l).x_height() - << "\" d_height=\"" << lines(l).d_height() - << "\" a_height=\"" << lines(l).a_height() - << "\" char_width=\"" << lines(l).char_width(); - } + file << "\" baseline=\"" << lines(fid).baseline() + << "\" meanline=\"" << lines(fid).meanline() + << "\" x_height=\"" << lines(fid).x_height() + << "\" d_height=\"" << lines(fid).d_height() + << "\" a_height=\"" << lines(fid).a_height() + << "\" char_width=\"" << lines(fid).char_width(); // End of EXTENSIONS file << "\">" << std::endl; - internal::print_box_coords(file, lines(l).bbox(), " "); + internal::print_box_coords(file, parset(p).bbox(), " "); - if (extended_format) - { - file << " <paragraph>" << std::endl; - internal::print_box_coords(file, lines(l).bbox(), " "); + // EXTENSIONS - Not officially supported + for_all_paragraph_lines(lid, line_ids) + { + line_id_t l = line_ids(lid); if (lines(l).has_text()) { std::string tmp = lines(l).text(); tmp = internal::html_markups_replace(tmp, html_map); - file << " <line text=\"" - << tmp - << "\">" << std::endl; + file << " <line text=\"" << tmp << "\" "; } else - file << " <line>" << std::endl; + file << " <line " << std::endl; + + file << "id=\"" << lines(l).id() + << "\" txt_orientation=\"" << lines(l).orientation() + << "\" txt_reading_orientation=\"" << 
lines(l).reading_orientation() + << "\" txt_reading_direction=\"" << lines(l).reading_direction() + << "\" txt_text_type=\"" << lines(l).type() + << "\" txt_reverse_video=\"" << (lines(l).reverse_video() ? "true" : "false") + << "\" txt_indented=\"" << (lines(l).indented() ? "true" : "false") + << "\" kerning=\"" << lines(l).char_space() + << "\" baseline=\"" << lines(l).baseline() + << "\" meanline=\"" << lines(l).meanline() + << "\" x_height=\"" << lines(l).x_height() + << "\" d_height=\"" << lines(l).d_height() + << "\" a_height=\"" << lines(l).a_height() + << "\" char_width=\"" << lines(l).char_width() + << "\">" << std::endl; internal::print_box_coords(file, lines(l).bbox(), " "); file << " </line>" << std::endl; - - file << " </paragraph>" << std::endl; } file << " </text_region>" << std::endl; } } - } + // End of EXTENSIONS - // Page elements (Pictures, ...) - if (doc.has_elements()) - { - const component_set<L>& elts = doc.elements(); - for_all_comps(e, elts) - if (elts(e).is_valid()) - { - file << " <image_region id=\"ir" << elts(e).id() - << "\" img_colour_type=\"24_Bit_Colour\"" - << " img_orientation=\"0.000000\" " - << " img_emb_text=\"No\" " - << " img_bgcolour=\"White\">" << std::endl; + // Page elements (Pictures, ...) 
+ if (doc.has_elements()) + { + const component_set<L>& elts = doc.elements(); + for_all_comps(e, elts) + if (elts(e).is_valid()) + { + switch (elts(e).type()) + { + case component::Separator: + { + file << " <separator_region id=\"sr" << elts(e).id() + << "\" sep_orientation=\"0.000000\" " + << " sep_colour=\"Black\" " + << " sep_bgcolour=\"White\">" << std::endl; + + internal::print_box_coords(file, elts(e).bbox(), " "); + + file << " </separator_region>" << std::endl; + break; + break; + } + + default: + case component::Image: + { + file << " <image_region id=\"ir" << elts(e).id() + << "\" img_colour_type=\"24_Bit_Colour\"" + << " img_orientation=\"0.000000\" " + << " img_emb_text=\"No\" " + << " img_bgcolour=\"White\">" << std::endl; + + internal::print_box_coords(file, elts(e).bbox(), " "); + + file << " </image_region>" << std::endl; + break; + } + } + } + } - internal::print_box_coords(file, elts(e).bbox(), " "); - file << " </image_region>" << std::endl; - } + file << " </page>" << std::endl; + file << "</pcGts>" << std::endl; + + trace::exiting("scribo::io::xml::save_text_lines"); } + } // end of namespace scribo::io::xml::internal - file << " </page>" << std::endl; - file << "</pcGts>" << std::endl; - trace::exiting("scribo::io::xml::save_text_lines"); + // FACADE + + template <typename L> + void + save(const document<L>& doc, + const std::string& output_name, + bool allow_extensions) + { + if (allow_extensions) + internal::save_extended(doc, output_name); + else + internal::save(doc, output_name); } -- 1.5.6.5
participants (1)
-
Guillaume Lazzara