
--- scribo/ChangeLog | 4 + scribo/scribo/io/xml/save.hh | 214 +++++++++++++++++++++--------------------- 2 files changed, 111 insertions(+), 107 deletions(-) diff --git a/scribo/ChangeLog b/scribo/ChangeLog index e738a8a..b14e5bf 100644 --- a/scribo/ChangeLog +++ b/scribo/ChangeLog @@ -1,5 +1,9 @@ 2010-11-15 Guillaume Lazzara <z@lrde.epita.fr> + * scribo/io/xml/save.hh: Make use of document structure. + +2010-11-15 Guillaume Lazzara <z@lrde.epita.fr> + * scribo/primitive/extract/elements.hh: New routine. 2010-11-15 Guillaume Lazzara <z@lrde.epita.fr> diff --git a/scribo/scribo/io/xml/save.hh b/scribo/scribo/io/xml/save.hh index f13396f..c6b416d 100644 --- a/scribo/scribo/io/xml/save.hh +++ b/scribo/scribo/io/xml/save.hh @@ -28,7 +28,7 @@ /// \file /// -/// \brief Save text line information as XML. +/// \brief Save document information as XML. # include <fstream> # include <sstream> @@ -46,7 +46,7 @@ namespace scribo namespace xml { - /*! \brief Save text line information as XML. + /*! \brief Save document information as XML. We use a XML Schema part of the PAGE (Page Analysis and Ground truth Elements) image representation framework. 
@@ -60,8 +60,7 @@ namespace scribo */ template <typename L> void - save(const std::string& input_name, - const line_set<L>& lines, + save(const document<L>& doc, const std::string& output_name, bool extended_format); @@ -86,12 +85,34 @@ namespace scribo return input; } + + void print_box_coords(std::ofstream& ostr, const box2d& b, + const char *space) + { + std::string sc = space; + std::string sp = sc + " "; + ostr << sc << "<coords>" << std::endl + << sp << "<point x=\"" << b.pmin().col() + << "\" y=\"" << b.pmin().row() << "\"/>" + << std::endl + << sp << "<point x=\"" << b.pmax().col() + << "\" y=\"" << b.pmin().row() << "\"/>" + << std::endl + << sp << "<point x=\"" << b.pmax().col() + << "\" y=\"" << b.pmax().row() << "\"/>" + << std::endl + << sp << "<point x=\"" << b.pmin().col() + << "\" y=\"" << b.pmax().row() << "\"/>" + << std::endl + << sc << "</coords>" << std::endl; + + } + } // end of namespace scribo::io::xml::internal template <typename L> void - save(const std::string& input_name, - const line_set<L>& lines, + save(const document<L>& doc, const std::string& output_name, bool extended_format) { @@ -100,9 +121,12 @@ namespace scribo std::ofstream file(output_name.c_str()); if (! 
file) { - std::cerr << "error: cannot open file '" << input_name << "'!"; + std::cerr << "error: cannot open file '" << doc.filename() << "'!"; abort(); } + + const line_set<L>& lines = doc.text(); + std::map<char, std::string> html_map; html_map['\"'] = "&quot;"; html_map['<'] = "&lt;"; @@ -111,13 +135,13 @@ namespace scribo file << "<?xml version=\"1.0\"?>" << std::endl; if (extended_format) - { - file << "<pcGts>" << std::endl; - } + { + file << "<pcGts>" << std::endl; + } else - { - file << "<pcGts xmlns=\"http://schema.primaresearch.org/PAGE/gts/pagecontent/2009-03-16\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:schemaLocation=\"http://schema.primaresearch.org/PAGE/gts/pagecontent/2009-03-16 http://schema.primaresearch.org/PAGE/gts/pagecontent/2009-03-16/pagecontent.xsd\" pcGtsId=\"" << input_name << "\">" << std::endl; - } + { + file << "<pcGts xmlns=\"http://schema.primaresearch.org/PAGE/gts/pagecontent/2009-03-16\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:schemaLocation=\"http://schema.primaresearch.org/PAGE/gts/pagecontent/2009-03-16 http://schema.primaresearch.org/PAGE/gts/pagecontent/2009-03-16/pagecontent.xsd\" pcGtsId=\"" << doc.filename() << "\">" << std::endl; + } file << " <PcMetadata>" << std::endl; file << " <PcCreator>LRDE</PcCreator>" << std::endl; @@ -126,110 +150,86 @@ namespace scribo file << " <PcComments>Generated by Scribo from Olena.</PcComments>" << std::endl; file << " </PcMetadata>" << std::endl; - file << " <page image_filename=\"" << input_name + file << " <page image_filename=\"" << doc.filename() << "\" image_width=\"" << lines.components().labeled_image().ncols() << "\" image_height=\"" << lines.components().labeled_image().nrows() << "\">" << std::endl; for_all_lines(l, lines) + { + if (! lines(l).is_valid() + || lines(l).tag() != line::None + || lines(l).type() != line::Text) // Is NOT a text line. + continue; { - if (! 
lines(l).is_valid() - || lines(l).tag() != line::None - || lines(l).type() != line::Text) // Is NOT a text line. - continue; + file << " <text_region id=\"" << lines(l).id() + << "\" txt_orientation=\"" << lines(l).orientation() + << "\" txt_reading_orientation=\"" << lines(l).reading_orientation() + << "\" txt_reading_direction=\"" << lines(l).reading_direction() + << "\" txt_text_type=\"" << lines(l).type() + << "\" txt_reverse_video=\"" << (lines(l).reverse_video() ? "true" : "false") + << "\" txt_indented=\"" << (lines(l).indented() ? "true" : "false") + << "\" kerning=\"" << lines(l).char_space(); + + // EXTENSIONS - Not officially supported + if (extended_format) { - file << " <text_region id=\"" << lines(l).id() - << "\" txt_orientation=\"" << lines(l).orientation() - << "\" txt_reading_orientation=\"" << lines(l).reading_orientation() - << "\" txt_reading_direction=\"" << lines(l).reading_direction() - << "\" txt_text_type=\"" << lines(l).type() - << "\" txt_reverse_video=\"" << (lines(l).reverse_video() ? "true" : "false") - << "\" txt_indented=\"" << (lines(l).indented() ? 
"true" : "false") - << "\" kerning=\"" << lines(l).char_space(); - - // EXTENSIONS - Not officially supported - if (extended_format) - { - file << "\" baseline=\"" << lines(l).baseline() - << "\" meanline=\"" << lines(l).meanline() - << "\" x_height=\"" << lines(l).x_height() - << "\" d_height=\"" << lines(l).d_height() - << "\" a_height=\"" << lines(l).a_height() - << "\" char_width=\"" << lines(l).char_width(); - } - // End of EXTENSIONS - file << "\">" - << std::endl; + file << "\" baseline=\"" << lines(l).baseline() + << "\" meanline=\"" << lines(l).meanline() + << "\" x_height=\"" << lines(l).x_height() + << "\" d_height=\"" << lines(l).d_height() + << "\" a_height=\"" << lines(l).a_height() + << "\" char_width=\"" << lines(l).char_width(); + } + // End of EXTENSIONS + file << "\">" + << std::endl; - if (extended_format) - { - file << " <coords>" << std::endl - << " <point x=\"" << lines(l).bbox().pmin().col() - << "\" y=\"" << lines(l).bbox().pmin().row() << "\"/>" - << std::endl - << " <point x=\"" << lines(l).bbox().pmax().col() - << "\" y=\"" << lines(l).bbox().pmin().row() << "\"/>" - << std::endl - << " <point x=\"" << lines(l).bbox().pmax().col() - << "\" y=\"" << lines(l).bbox().pmax().row() << "\"/>" - << std::endl - << " <point x=\"" << lines(l).bbox().pmin().col() - << "\" y=\"" << lines(l).bbox().pmax().row() << "\"/>" - << std::endl - << " </coords>" << std::endl; - - - file << " <paragraph>" << std::endl; - - file << " <coords>" << std::endl - << " <point x=\"" << lines(l).bbox().pmin().col() - << "\" y=\"" << lines(l).bbox().pmin().row() << "\"/>" - << std::endl - << " <point x=\"" << lines(l).bbox().pmax().col() - << "\" y=\"" << lines(l).bbox().pmin().row() << "\"/>" - << std::endl - << " <point x=\"" << lines(l).bbox().pmax().col() - << "\" y=\"" << lines(l).bbox().pmax().row() << "\"/>" - << std::endl - << " <point x=\"" << lines(l).bbox().pmin().col() - << "\" y=\"" << lines(l).bbox().pmax().row() << "\"/>" - << std::endl - << " </coords>" 
<< std::endl; - - if (lines(l).has_text()) - { - std::string tmp = lines(l).text(); - tmp = internal::html_markups_replace(tmp, html_map); - - file << " <line text=\"" - << tmp - << "\">" << std::endl; - } - else - file << " <line>" << std::endl; - - file << " <coords>" << std::endl - << " <point x=\"" << lines(l).bbox().pmin().col() - << "\" y=\"" << lines(l).bbox().pmin().row() << "\"/>" - << std::endl - << " <point x=\"" << lines(l).bbox().pmax().col() - << "\" y=\"" << lines(l).bbox().pmin().row() << "\"/>" - << std::endl - << " <point x=\"" << lines(l).bbox().pmax().col() - << "\" y=\"" << lines(l).bbox().pmax().row() << "\"/>" - << std::endl - << " <point x=\"" << lines(l).bbox().pmin().col() - << "\" y=\"" << lines(l).bbox().pmax().row() << "\"/>" - << std::endl - << " </coords>" << std::endl; - - file << " </line>" << std::endl; - - file << " </paragraph>" << std::endl; - } + internal::print_box_coords(file, lines(l).bbox(), " "); + + if (extended_format) + { + file << " <paragraph>" << std::endl; + + internal::print_box_coords(file, lines(l).bbox(), " "); + + if (lines(l).has_text()) + { + std::string tmp = lines(l).text(); + tmp = internal::html_markups_replace(tmp, html_map); + + file << " <line text=\"" + << tmp + << "\">" << std::endl; + } + else + file << " <line>" << std::endl; + + internal::print_box_coords(file, lines(l).bbox(), " "); - file << " </text_region>" << std::endl; + file << " </line>" << std::endl; + + file << " </paragraph>" << std::endl; } + + file << " </text_region>" << std::endl; + } + } + + + const component_set<L>& elts = doc.elements(); + for_all_comps(e, elts) + if (elts(e).is_valid()) + { + file << " <image_region id=\"ir" << elts(e).id() + << "\" img_colour_type=\"24_Bit_Colour\"" + << " img_orientation=\"0.000000\" " + << " img_emb_text=\"No\" " + << " img_bgcolour=\"White\">" << std::endl; + + internal::print_box_coords(file, elts(e).bbox(), " "); + + file << " </image_region>" << std::endl; } file << " </page>" << 
std::endl; -- 1.5.6.5