
* scribo/core/component_info.hh, * scribo/core/component_set.hh, * scribo/core/document.hh, * scribo/core/line_info.hh, * scribo/core/line_links.hh, * scribo/core/object_groups.hh, * scribo/core/object_links.hh, * scribo/core/paragraph_set.hh: Make these classes serializable. * scribo/core/concept/serializable.hh, * scribo/core/concept/serialize_visitor.hh: New concepts. * scribo/core/internal/doc_xml_serializer.hh: New. Base implementation. * scribo/io/xml/internal/extended_page_xml_visitor.hh, * scribo/io/xml/internal/full_xml_visitor.hh, * scribo/io/xml/internal/page_xml_visitor.hh: New. Visitors producing different XML outputs. * scribo/io/xml/internal/html_markups_replace.hh, * scribo/io/xml/internal/print_box_coords.hh, * scribo/io/xml/internal/print_page_preambule.hh: New. Tools for XML output. * scribo/io/xml/save.hh: Make use of visitors. * scribo/toolchain/internal/content_in_doc_functor.hh: Set default XML output type. * src/content_in_doc.cc: Produce several XML output. --- scribo/ChangeLog | 38 ++ scribo/demo/viewer/runner.cc | 5 +- scribo/scribo/core/component_info.hh | 3 +- scribo/scribo/core/component_set.hh | 7 +- scribo/scribo/core/concept/serializable.hh | 64 +++ scribo/scribo/core/concept/serialize_visitor.hh | 49 +++ scribo/scribo/core/document.hh | 8 +- scribo/scribo/core/internal/doc_xml_serializer.hh | 140 ++++++ scribo/scribo/core/line_info.hh | 21 +- scribo/scribo/core/line_links.hh | 3 +- scribo/scribo/core/object_groups.hh | 4 +- scribo/scribo/core/object_links.hh | 8 +- scribo/scribo/core/paragraph_set.hh | 4 +- .../io/xml/internal/extended_page_xml_visitor.hh | 283 ++++++++++++ scribo/scribo/io/xml/internal/full_xml_visitor.hh | 456 ++++++++++++++++++++ .../scribo/io/xml/internal/html_markups_replace.hh | 97 +++++ scribo/scribo/io/xml/internal/page_xml_visitor.hh | 222 ++++++++++ scribo/scribo/io/xml/internal/print_box_coords.hh | 92 ++++ .../scribo/io/xml/internal/print_page_preambule.hh | 95 ++++ scribo/scribo/io/xml/save.hh | 388 
+++-------------- .../toolchain/internal/content_in_doc_functor.hh | 9 +- scribo/src/content_in_doc.cc | 4 +- 22 files changed, 1660 insertions(+), 340 deletions(-) create mode 100644 scribo/scribo/core/concept/serializable.hh create mode 100644 scribo/scribo/core/concept/serialize_visitor.hh create mode 100644 scribo/scribo/core/internal/doc_xml_serializer.hh create mode 100644 scribo/scribo/io/xml/internal/extended_page_xml_visitor.hh create mode 100644 scribo/scribo/io/xml/internal/full_xml_visitor.hh create mode 100644 scribo/scribo/io/xml/internal/html_markups_replace.hh create mode 100644 scribo/scribo/io/xml/internal/page_xml_visitor.hh create mode 100644 scribo/scribo/io/xml/internal/print_box_coords.hh create mode 100644 scribo/scribo/io/xml/internal/print_page_preambule.hh diff --git a/scribo/ChangeLog b/scribo/ChangeLog index 63e3fee..cf02d73 100644 --- a/scribo/ChangeLog +++ b/scribo/ChangeLog @@ -1,5 +1,43 @@ 2011-03-01 Guillaume Lazzara <z@lrde.epita.fr> + Make XML output more flexible. + + * scribo/core/component_info.hh, + * scribo/core/component_set.hh, + * scribo/core/document.hh, + * scribo/core/line_info.hh, + * scribo/core/line_links.hh, + * scribo/core/object_groups.hh, + * scribo/core/object_links.hh, + * scribo/core/paragraph_set.hh: Make these classes serializable. + + * scribo/core/concept/serializable.hh, + * scribo/core/concept/serialize_visitor.hh: New concepts. + + * scribo/core/internal/doc_xml_serializer.hh: New. Base + implementation. + + * scribo/io/xml/internal/extended_page_xml_visitor.hh, + * scribo/io/xml/internal/full_xml_visitor.hh, + * scribo/io/xml/internal/page_xml_visitor.hh: New. Visitors + producing different XML outputs. + + * scribo/io/xml/internal/html_markups_replace.hh, + * scribo/io/xml/internal/print_box_coords.hh, + * scribo/io/xml/internal/print_page_preambule.hh: New. Tools for + XML output. + + * scribo/io/xml/save.hh: Make use of visitors. 
+ + * scribo/toolchain/internal/content_in_doc_functor.hh: Set default + XML output type. + + * src/content_in_doc.cc: Produce several XML output. + + * demo/viewer/runner.cc: Update call to io::xml::save. + +2011-03-01 Guillaume Lazzara <z@lrde.epita.fr> + Set component type during component extraction. * scribo/core/component_info.hh, diff --git a/scribo/demo/viewer/runner.cc b/scribo/demo/viewer/runner.cc index 86ff5dc..a3cc883 100644 --- a/scribo/demo/viewer/runner.cc +++ b/scribo/demo/viewer/runner.cc @@ -1,4 +1,5 @@ -// Copyright (C) 2010 EPITA Research and Development Laboratory (LRDE) +// Copyright (C) 2010, 2011 EPITA Research and Development Laboratory +// (LRDE) // // This file is part of Olena. // @@ -156,7 +157,7 @@ void runner::process(const image2d<value::rgb8>& original_ima, f.enable_whitespace_seps = (find_seps == defs::Whitespaces || find_seps == defs::LinesAndWhitespaces); - f.allow_xml_extensions = true; + f.xml_format = scribo::io::xml::PageExtended; f.save_doc_as_xml = true; diff --git a/scribo/scribo/core/component_info.hh b/scribo/scribo/core/component_info.hh index 6fc73f8..f825aee 100644 --- a/scribo/scribo/core/component_info.hh +++ b/scribo/scribo/core/component_info.hh @@ -36,6 +36,7 @@ # include <mln/core/alias/point2d.hh> # include <mln/util/object_id.hh> +# include <scribo/core/concept/serializable.hh> # include <scribo/core/tag/component.hh> # include <scribo/core/tag/line.hh> @@ -44,7 +45,7 @@ namespace scribo typedef mln::util::object_id<scribo::ComponentId, unsigned> component_id_t; - class component_info + class component_info : public Serializable<component_info> { typedef mln::util::object_id<scribo::ComponentId, unsigned> component_id_t; diff --git a/scribo/scribo/core/component_set.hh b/scribo/scribo/core/component_set.hh index 442e8d6..a63ed6c 100644 --- a/scribo/scribo/core/component_set.hh +++ b/scribo/scribo/core/component_set.hh @@ -30,6 +30,10 @@ /// \file /// /// \brief Definition of a component set. 
+/// +/// \fixme component_set should always set a component type in order +/// to be fully supported by visitors. + # include <mln/core/concept/site_set.hh> # include <mln/core/concept/function.hh> @@ -59,6 +63,7 @@ # include <scribo/core/macros.hh> # include <scribo/core/component_info.hh> +# include <scribo/core/concept/serializable.hh> namespace scribo @@ -115,7 +120,7 @@ namespace scribo template <typename L> - class component_set + class component_set : public Serializable<component_set<L> > { typedef mln::accu::shape::bbox<mln_site(L)> bbox_accu_t; typedef mln::accu::center<mln_site(L)> center_accu_t; diff --git a/scribo/scribo/core/concept/serializable.hh b/scribo/scribo/core/concept/serializable.hh new file mode 100644 index 0000000..6e661a6 --- /dev/null +++ b/scribo/scribo/core/concept/serializable.hh @@ -0,0 +1,64 @@ +// Copyright (C) 2011 EPITA Research and Development Laboratory (LRDE) +// +// This file is part of Olena. +// +// Olena is free software: you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free +// Software Foundation, version 2 of the License. +// +// Olena is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Olena. If not, see <http://www.gnu.org/licenses/>. +// +// As a special exception, you may use this file as part of a free +// software project without restriction. Specifically, if other files +// instantiate templates or use macros or inline functions from this +// file, or you compile this file and link it with other files to produce +// an executable, this file does not by itself cause the resulting +// executable to be covered by the GNU General Public License. 
This +// exception does not however invalidate any other reasons why the +// executable file might be covered by the GNU General Public License. + +#ifndef SCRIBO_CORE_CONCEPT_SERIALIZABLE_HH +# define SCRIBO_CORE_CONCEPT_SERIALIZABLE_HH + +/// \file +/// +/// Concept for serializable objects. + +# include <mln/core/concept/object.hh> +# include <scribo/core/concept/serialize_visitor.hh> + +namespace scribo +{ + + /// \brief Concept for objects accepting a serialization visitor. + template <typename E> + class Serializable : public mln::Object<E> + { + public: + template <typename E2> + void accept(const SerializeVisitor<E2>& visitor) const; + }; + + +# ifndef MLN_INCLUDE_ONLY + + template <typename E> + template <typename E2> + void + Serializable<E>::accept(const SerializeVisitor<E2>& visitor) const + { + exact(visitor).visit(exact(*this)); + } + +# endif // ! MLN_INCLUDE_ONLY + + +} // end of namespace scribo + +#endif // SCRIBO_CORE_CONCEPT_SERIALIZABLE_HH diff --git a/scribo/scribo/core/concept/serialize_visitor.hh b/scribo/scribo/core/concept/serialize_visitor.hh new file mode 100644 index 0000000..e5e598f --- /dev/null +++ b/scribo/scribo/core/concept/serialize_visitor.hh @@ -0,0 +1,49 @@ +// Copyright (C) 2011 EPITA Research and Development Laboratory (LRDE) +// +// This file is part of Olena. +// +// Olena is free software: you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free +// Software Foundation, version 2 of the License. +// +// Olena is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Olena. If not, see <http://www.gnu.org/licenses/>. +// +// As a special exception, you may use this file as part of a free +// software project without restriction.
Specifically, if other files +// instantiate templates or use macros or inline functions from this +// file, or you compile this file and link it with other files to produce +// an executable, this file does not by itself cause the resulting +// executable to be covered by the GNU General Public License. This +// exception does not however invalidate any other reasons why the +// executable file might be covered by the GNU General Public License. + +#ifndef SCRIBO_CORE_CONCEPT_SERIALIZE_VISITOR_HH +# define SCRIBO_CORE_CONCEPT_SERIALIZE_VISITOR_HH + +/// \file +/// +/// Concept for serializer visitors. + +# include <mln/core/concept/object.hh> + +namespace scribo +{ + + /// \brief Serialization visitor concept. + template <typename E> + class SerializeVisitor : public mln::Object<E> + { + public: + // void visit(..); + }; + + +} // end of namespace scribo + +#endif // SCRIBO_CORE_CONCEPT_SERIALIZE_VISITOR_HH diff --git a/scribo/scribo/core/document.hh b/scribo/scribo/core/document.hh index ef0869e..372f0a4 100644 --- a/scribo/scribo/core/document.hh +++ b/scribo/scribo/core/document.hh @@ -40,13 +40,15 @@ # include <scribo/core/line_set.hh> # include <scribo/core/paragraph_set.hh> +# include <scribo/core/concept/serializable.hh> + # include <scribo/primitive/extract/components.hh> namespace scribo { template <typename L> - struct document + struct document : public Serializable<document<L> > { public: @@ -98,7 +100,7 @@ namespace scribo private: - const char *filename_; + std::string filename_; mln::image2d<mln::value::rgb8> image_; paragraph_set<L> parset_; @@ -142,7 +144,7 @@ namespace scribo const char * document<L>::filename() const { - return filename_; + return filename_.c_str(); } diff --git a/scribo/scribo/core/internal/doc_xml_serializer.hh b/scribo/scribo/core/internal/doc_xml_serializer.hh new file mode 100644 index 0000000..b64c9d4 --- /dev/null +++ b/scribo/scribo/core/internal/doc_xml_serializer.hh @@ -0,0 +1,140 @@ +// Copyright (C) 2011 EPITA Research and
Development Laboratory (LRDE) +// +// This file is part of Olena. +// +// Olena is free software: you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free +// Software Foundation, version 2 of the License. +// +// Olena is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Olena. If not, see <http://www.gnu.org/licenses/>. +// +// As a special exception, you may use this file as part of a free +// software project without restriction. Specifically, if other files +// instantiate templates or use macros or inline functions from this +// file, or you compile this file and link it with other files to produce +// an executable, this file does not by itself cause the resulting +// executable to be covered by the GNU General Public License. This +// exception does not however invalidate any other reasons why the +// executable file might be covered by the GNU General Public License. + +#ifndef SCRIBO_CORE_INTERNAL_DOC_XML_SERIALIZER_HH +# define SCRIBO_CORE_INTERNAL_DOC_XML_SERIALIZER_HH + +/// \file +/// +/// Base implementation of document XML serializer visitors. + +# include <scribo/core/concept/serialize_visitor.hh> + +# include <scribo/core/document.hh> +# include <scribo/core/component_set.hh> +# include <scribo/core/component_info.hh> +# include <scribo/core/paragraph_set.hh> +# include <scribo/core/object_groups.hh> +# include <scribo/core/object_links.hh> +# include <scribo/core/line_links.hh> +# include <scribo/core/line_info.hh> + +namespace scribo +{ + + /// \brief Base XML serializer visitor; visit() overloads do nothing by default.
+ template <typename E> + class doc_xml_serializer : public SerializeVisitor<E> + { + public: + // Visit overloads + template <typename L> + void visit(const document<L>& doc) const; + + template <typename L> + void visit(const line_links<L>& llinks) const; + + template <typename L> + void visit(const object_groups<L>& groups) const; + + template <typename L> + void visit(const object_links<L>& links) const; + + template <typename L> + void visit(const component_set<L>& comp_set) const; + + void visit(const component_info& info) const; + + template <typename L> + void visit(const paragraph_set<L>& parset) const; + + template <typename L> + void visit(const line_info<L>& line) const; + }; + + +# ifndef MLN_INCLUDE_ONLY + + template <typename E> + template <typename L> + void + doc_xml_serializer<E>::visit(const document<L>& doc) const + { + } + + template <typename E> + template <typename L> + void + doc_xml_serializer<E>::visit(const line_links<L>& llinks) const + { + } + + template <typename E> + template <typename L> + void + doc_xml_serializer<E>::visit(const object_groups<L>& groups) const + { + } + + template <typename E> + template <typename L> + void + doc_xml_serializer<E>::visit(const object_links<L>& links) const + { + } + + template <typename E> + template <typename L> + void + doc_xml_serializer<E>::visit(const component_set<L>& comp_set) const + { + } + + template <typename E> + void + doc_xml_serializer<E>::visit(const component_info& info) const + { + } + + template <typename E> + template <typename L> + void + doc_xml_serializer<E>::visit(const paragraph_set<L>& parset) const + { + } + + template <typename E> + template <typename L> + void + doc_xml_serializer<E>::visit(const line_info<L>& line) const + { + } + +# endif // ! 
MLN_INCLUDE_ONLY + + +} // end of namespace scribo + +#endif // SCRIBO_CORE_INTERNAL_DOC_XML_SERIALIZER_HH diff --git a/scribo/scribo/core/line_info.hh b/scribo/scribo/core/line_info.hh index c82160a..33a1529 100644 --- a/scribo/scribo/core/line_info.hh +++ b/scribo/scribo/core/line_info.hh @@ -53,6 +53,11 @@ # include <scribo/core/line_set.hh> # include <scribo/core/component_set.hh> +# include <scribo/io/xml/internal/html_markups_replace.hh> + +# include <scribo/core/concept/serializable.hh> + + namespace scribo { @@ -114,6 +119,7 @@ namespace scribo bool indented_; std::string text_; + std::string html_text_; // Line set holding this element. line_set<L> holder_; @@ -125,7 +131,7 @@ namespace scribo template <typename L> - class line_info + class line_info : public Serializable<line_info<L> > { typedef internal::line_info_data<L> data_t; typedef mln::util::object_id<scribo::ComponentId, unsigned> component_id_t; @@ -198,6 +204,7 @@ namespace scribo bool has_text() const; const std::string& text() const; + const std::string& html_text() const; void update_text(const std::string& str); bool is_valid() const; @@ -604,6 +611,7 @@ namespace scribo return data_->indented_; } + template <typename L> bool line_info<L>::has_text() const @@ -611,6 +619,7 @@ namespace scribo return !data_->text_.empty(); } + template <typename L> const std::string& line_info<L>::text() const @@ -620,10 +629,19 @@ namespace scribo template <typename L> + const std::string& + line_info<L>::html_text() const + { + return data_->html_text_; + } + + + template <typename L> void line_info<L>::update_text(const std::string& str) { data_->text_ = str; + data_->html_text_ = scribo::io::xml::internal::html_markups_replace(str); } @@ -987,6 +1005,7 @@ namespace scribo << ", indented=" << info.indented() << ", hidden=" << info.is_hidden() << ", text=" << info.text() + << ", html_text=" << info.html_text() << ")" << std::endl; } diff --git a/scribo/scribo/core/line_links.hh 
b/scribo/scribo/core/line_links.hh index de62158..fdd09a5 100644 --- a/scribo/scribo/core/line_links.hh +++ b/scribo/scribo/core/line_links.hh @@ -34,6 +34,7 @@ # include <mln/util/array.hh> # include <mln/util/tracked_ptr.hh> +# include <scribo/core/concept/serializable.hh> # include <scribo/core/line_set.hh> @@ -69,7 +70,7 @@ namespace scribo /// \brief Line group representation. // template <typename L> - class line_links + class line_links : public Serializable<line_links<L> > { typedef internal::line_links_data<L> data_t; diff --git a/scribo/scribo/core/object_groups.hh b/scribo/scribo/core/object_groups.hh index 9d9fb25..bbfaf6e 100644 --- a/scribo/scribo/core/object_groups.hh +++ b/scribo/scribo/core/object_groups.hh @@ -36,6 +36,8 @@ # include <scribo/core/object_links.hh> # include <scribo/core/component_set.hh> +# include <scribo/core/concept/serializable.hh> + namespace scribo { @@ -69,7 +71,7 @@ namespace scribo /// \brief Object group representation. // template <typename L> - class object_groups + class object_groups : public Serializable<object_groups<L> > { typedef internal::object_groups_data<L> data_t; diff --git a/scribo/scribo/core/object_links.hh b/scribo/scribo/core/object_links.hh index af7dc38..2c2eea1 100644 --- a/scribo/scribo/core/object_links.hh +++ b/scribo/scribo/core/object_links.hh @@ -1,5 +1,5 @@ -// Copyright (C) 2009, 2010 EPITA Research and Development Laboratory -// (LRDE) +// Copyright (C) 2009, 2010, 2011 EPITA Research and Development +// Laboratory (LRDE) // // This file is part of Olena. // @@ -37,6 +37,8 @@ # include <scribo/core/component_set.hh> +# include <scribo/core/concept/serializable.hh> + namespace scribo { @@ -70,7 +72,7 @@ namespace scribo /// \brief Object group representation. 
// template <typename L> - class object_links + class object_links : public Serializable<object_links<L> > { typedef internal::object_links_data<L> data_t; diff --git a/scribo/scribo/core/paragraph_set.hh b/scribo/scribo/core/paragraph_set.hh index 6597189..5451069 100644 --- a/scribo/scribo/core/paragraph_set.hh +++ b/scribo/scribo/core/paragraph_set.hh @@ -33,6 +33,8 @@ # include <scribo/core/line_set.hh> # include <scribo/core/paragraph_info.hh> +# include <scribo/core/concept/serializable.hh> + namespace scribo { @@ -61,7 +63,7 @@ namespace scribo */ template <typename L> - class paragraph_set + class paragraph_set : public Serializable<paragraph_set<L> > { public: paragraph_set(); diff --git a/scribo/scribo/io/xml/internal/extended_page_xml_visitor.hh b/scribo/scribo/io/xml/internal/extended_page_xml_visitor.hh new file mode 100644 index 0000000..5d8a672 --- /dev/null +++ b/scribo/scribo/io/xml/internal/extended_page_xml_visitor.hh @@ -0,0 +1,283 @@ +// Copyright (C) 2011 EPITA Research and Development Laboratory (LRDE) +// +// This file is part of Olena. +// +// Olena is free software: you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free +// Software Foundation, version 2 of the License. +// +// Olena is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Olena. If not, see <http://www.gnu.org/licenses/>. +// +// As a special exception, you may use this file as part of a free +// software project without restriction. 
Specifically, if other files +// instantiate templates or use macros or inline functions from this +// file, or you compile this file and link it with other files to produce +// an executable, this file does not by itself cause the resulting +// executable to be covered by the GNU General Public License. This +// exception does not however invalidate any other reasons why the +// executable file might be covered by the GNU General Public License. + +#ifndef SCRIBO_IO_XML_INTERNAL_EXTENDED_PAGE_XML_VISITOR_HH +# define SCRIBO_IO_XML_INTERNAL_EXTENDED_PAGE_XML_VISITOR_HH + +/// \file +/// +/// Extended XML PAGE format serializer Visitor. + +# include <fstream> +# include <scribo/core/internal/doc_xml_serializer.hh> +# include <scribo/core/document.hh> +# include <scribo/core/component_set.hh> +# include <scribo/core/paragraph_set.hh> +# include <scribo/core/object_groups.hh> +# include <scribo/core/object_links.hh> +# include <scribo/core/line_links.hh> +# include <scribo/core/line_info.hh> + +# include <scribo/convert/to_base64.hh> + +# include <scribo/io/xml/internal/print_box_coords.hh> +# include <scribo/io/xml/internal/print_page_preambule.hh> + + +namespace scribo +{ + + namespace io + { + + namespace xml + { + + namespace internal + { + + + class extended_page_xml_visitor : public doc_xml_serializer<extended_page_xml_visitor> + { + public: + // Constructor + extended_page_xml_visitor(std::ofstream& out); + + // Visit overloads + template <typename L> + void visit(const document<L>& doc) const; + + template <typename L> + void visit(const component_set<L>& comp_set) const; + + void visit(const component_info& info) const; + + template <typename L> + void visit(const paragraph_set<L>& parset) const; + + template <typename L> + void visit(const line_info<L>& line) const; + + private: // Attributes + std::ofstream& output; + }; + + + +# ifndef MLN_INCLUDE_ONLY + + + inline + extended_page_xml_visitor::extended_page_xml_visitor(std::ofstream& out) + : output(out) + 
{ + } + + + + /// Document + // + template <typename L> + void + extended_page_xml_visitor::visit(const document<L>& doc) const + { + // Preambule + print_PAGE_preambule(output, doc, false); + + // Text + if (doc.has_text()) + doc.paragraphs().accept(*this); + + + // Page elements (Pictures, ...) + if (doc.has_elements()) + doc.elements().accept(*this); + + // Whitespace separators + if (doc.has_whitespace_seps()) + doc.whitespace_seps_comps().accept(*this); + + output << " </page>" << std::endl; + output << "</pcGts>" << std::endl; + + } + + /// Component Set + // + template <typename L> + void + extended_page_xml_visitor::visit(const component_set<L>& comp_set) const + { + for_all_comps(c, comp_set) + if (comp_set(c).is_valid()) + comp_set(c).accept(*this); + } + + + /// Component_info + // + inline + void + extended_page_xml_visitor::visit(const component_info& info) const + { + switch (info.type()) + { + case component::WhitespaceSeparator: + { + output << " <whitespace_separator_region id=\"wss" + << info.id() + << "\">" << std::endl; + + internal::print_box_coords(output, info.bbox(), " "); + + output << " </whitespace_separator_region>" << std::endl; + break; + } + + case component::LineSeparator: + { + output << " <separator_region id=\"sr" << info.id() + << "\" sep_orientation=\"0.000000\" " + << " sep_colour=\"Black\" " + << " sep_bgcolour=\"White\">" << std::endl; + + internal::print_box_coords(output, info.bbox(), " "); + + output << " </separator_region>" << std::endl; + break; + } + + + default: + case component::Image: + { + output << " <image_region id=\"ir" << info.id() + << "\" img_colour_type=\"24_Bit_Colour\"" + << " img_orientation=\"0.000000\" " + << " img_emb_text=\"No\" " + << " img_bgcolour=\"White\">" << std::endl; + + internal::print_box_coords(output, info.bbox(), " "); + + output << " </image_region>" << std::endl; + break; + } + } + } + + + /// Paragraph Set + // + template <typename L> + void + extended_page_xml_visitor::visit(const
paragraph_set<L>& parset) const + { + const line_set<L>& lines = parset.lines(); + + for_all_paragraphs(p, parset) + { + const mln::util::array<line_id_t>& line_ids = parset(p).line_ids(); + + // FIXME: compute that information on the whole paragraph + // and use them here. + line_id_t fid = line_ids(0); + output << " <text_region id=\"" << p + << "\" txt_orientation=\"" << lines(fid).orientation() + << "\" txt_reading_orientation=\"" << lines(fid).reading_orientation() + << "\" txt_reading_direction=\"" << lines(fid).reading_direction() + << "\" txt_text_type=\"" << lines(fid).type() + << "\" txt_reverse_video=\"" << (lines(fid).reverse_video() ? "true" : "false") + << "\" txt_indented=\"" << (lines(fid).indented() ? "true" : "false") + << "\" kerning=\"" << lines(fid).char_space(); + + // EXTENSIONS - Not officially supported + output << "\" baseline=\"" << lines(fid).baseline() + << "\" meanline=\"" << lines(fid).meanline() + << "\" x_height=\"" << lines(fid).x_height() + << "\" d_height=\"" << lines(fid).d_height() + << "\" a_height=\"" << lines(fid).a_height() + << "\" char_width=\"" << lines(fid).char_width(); + // End of EXTENSIONS + output << "\">" + << std::endl; + + internal::print_box_coords(output, parset(p).bbox(), " "); + + // EXTENSIONS - Not officially supported + for_all_paragraph_lines(lid, line_ids) + { + line_id_t l = line_ids(lid); + lines(l).accept(*this); + } + // End of EXTENSIONS + + output << " </text_region>" << std::endl; + } + } + + + template <typename L> + void + extended_page_xml_visitor::visit(const line_info<L>& line) const + { + if (line.has_text()) + { + output << " <line text=\"" << line.html_text() << "\" "; + } + else + output << " <line " << std::endl; + + output << "id=\"" << line.id() + << "\" txt_orientation=\"" << line.orientation() + << "\" txt_reading_orientation=\"" << line.reading_orientation() + << "\" txt_reading_direction=\"" << line.reading_direction() + << "\" txt_text_type=\"" << line.type() + << "\" 
txt_reverse_video=\"" << (line.reverse_video() ? "true" : "false") + << "\" txt_indented=\"" << (line.indented() ? "true" : "false") + << "\" kerning=\"" << line.char_space() + << "\" baseline=\"" << line.baseline() + << "\" meanline=\"" << line.meanline() + << "\" x_height=\"" << line.x_height() + << "\" d_height=\"" << line.d_height() + << "\" a_height=\"" << line.a_height() + << "\" char_width=\"" << line.char_width() + << "\">" << std::endl; + + internal::print_box_coords(output, line.bbox(), " "); + + output << " </line>" << std::endl; + } + +#endif // MLN_INCLUDE_ONLY + + } // end of namespace scribo::io::xml::internal + + } // end of namespace scribo::io::xml + + } // end of namespace scribo::io + +} // end of namespace scribo + +#endif // SCRIBO_IO_XML_INTERNAL_EXTENDED_PAGE_XML_VISITOR_HH diff --git a/scribo/scribo/io/xml/internal/full_xml_visitor.hh b/scribo/scribo/io/xml/internal/full_xml_visitor.hh new file mode 100644 index 0000000..9c5bd1d --- /dev/null +++ b/scribo/scribo/io/xml/internal/full_xml_visitor.hh @@ -0,0 +1,456 @@ +// Copyright (C) 2011 EPITA Research and Development Laboratory (LRDE) +// +// This file is part of Olena. +// +// Olena is free software: you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free +// Software Foundation, version 2 of the License. +// +// Olena is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Olena. If not, see <http://www.gnu.org/licenses/>. +// +// As a special exception, you may use this file as part of a free +// software project without restriction. 
Specifically, if other files +// instantiate templates or use macros or inline functions from this +// file, or you compile this file and link it with other files to produce +// an executable, this file does not by itself cause the resulting +// executable to be covered by the GNU General Public License. This +// exception does not however invalidate any other reasons why the +// executable file might be covered by the GNU General Public License. + +#ifndef SCRIBO_IO_XML_INTERNAL_FULL_XML_VISITOR_HH +# define SCRIBO_IO_XML_INTERNAL_FULL_XML_VISITOR_HH + +/// \file +/// +/// XML serializer Visitor. + +# include <fstream> +# include <scribo/core/internal/doc_xml_serializer.hh> +# include <scribo/core/document.hh> +# include <scribo/core/component_set.hh> +# include <scribo/core/paragraph_set.hh> +# include <scribo/core/object_groups.hh> +# include <scribo/core/object_links.hh> +# include <scribo/core/line_links.hh> +# include <scribo/core/line_info.hh> + +# include <scribo/convert/to_base64.hh> + +# include <scribo/io/xml/internal/print_box_coords.hh> +# include <scribo/io/xml/internal/print_page_preambule.hh> + + +namespace scribo +{ + + namespace io + { + + namespace xml + { + + namespace internal + { + + + class full_xml_visitor : public doc_xml_serializer<full_xml_visitor> + { + public: + // Constructor + full_xml_visitor(std::ofstream& out); + + // Visit overloads + template <typename L> + void visit(const document<L>& doc) const; + + template <typename L> + void visit(const line_links<L>& llinks) const; + + template <typename L> + void visit(const object_groups<L>& groups) const; + + template <typename L> + void visit(const object_links<L>& links) const; + + template <typename L> + void visit(const component_set<L>& comp_set) const; + + void visit(const component_info& info) const; + + template <typename L> + void visit(const paragraph_set<L>& parset) const; + + template <typename L> + void visit(const line_info<L>& line) const; + + private: // Attributes + 
std::ofstream& output; + }; + + + +# ifndef MLN_INCLUDE_ONLY + + + inline + full_xml_visitor::full_xml_visitor(std::ofstream& out) + : output(out) + { + } + + + + /// Document + // + template <typename L> + void + full_xml_visitor::visit(const document<L>& doc) const + { + print_PAGE_preambule(output, doc, false); + + // Text + if (doc.has_text()) + { + const line_set<L>& lines = doc.lines(); + + // Save component/link/group information (Extension) + { + // Component set + lines.components().accept(*this); + + // Object link + lines.links().accept(*this); + + // Object group + lines.groups().accept(*this); + } + // End of EXTENSIONS + + const paragraph_set<L>& parset = doc.paragraphs(); + + // Save paragraphs related information (Extension) + { + // General text information + output << " <text_data nlines=\"" << lines.nelements() << "\" " + << " nparagraphs=\"" << parset.nelements() << "\" />" << std::endl; + + // line_links + parset.links().accept(*this); + } + + // Paragraph and lines + parset.accept(*this); + } + + + // Page elements (Pictures, ...) 
+ if (doc.has_elements()) + { + const component_set<L>& elts = doc.elements(); + for_all_comps(e, elts) + if (elts(e).is_valid()) + elts(e).accept(*this); + } + + + // line separators + if (doc.has_line_seps()) + { + const component_set<L>& + line_seps_comps = doc.line_seps_comps(); + + for_all_comps(c, line_seps_comps) + line_seps_comps(c).accept(*this); + } + + + // Whitespace separators + if (doc.has_whitespace_seps()) + { + const component_set<L>& + whitespace_seps_comps = doc.whitespace_seps_comps(); + + for_all_comps(c, whitespace_seps_comps) + whitespace_seps_comps(c).accept(*this); + } + + output << " </page>" << std::endl; + output << "</pcGts>" << std::endl; + + } + + + /// Line Links + // + template <typename L> + void + full_xml_visitor::visit(const line_links<L>& llinks) const + { + output << " <line_links>" << std::endl; + for_all_links(l, llinks) + { + output << " <line_link" + << " from=\"" << l + << "\" to=\"" << llinks(l) + << "\"/>" << std::endl; + } + output << " </line_links>" << std::endl; + } + + + /// Object Groups + // + template <typename L> + void + full_xml_visitor::visit(const object_groups<L>& groups) const + { + output << " <object_groups>" << std::endl; + for_all_groups(g, groups) + { + output << " <group " + << " object_id=\"" << g + << "\" group_id=\"" << groups(g) + << "\"/>" << std::endl; + } + output << " </object_groups>" << std::endl; + } + + + /// Object Links + // + template <typename L> + void + full_xml_visitor::visit(const object_links<L>& links) const + { + output << " <object_links>" << std::endl; + for_all_links(l, links) + { + output << " <link" + << " from=\"" << l + << "\" to=\"" << links(l) + << "\"/>" << std::endl; + } + output << " </object_links>" << std::endl; + } + + + /// Component Set + // + template <typename L> + void + full_xml_visitor::visit(const component_set<L>& comp_set) const + { + output << " <component_set nelements=\"" << comp_set.nelements() + << "\">" << std::endl; + for_all_comps(c,
comp_set) + { + output << " <component_info" + << " id=\"" << comp_set(c).id() + << "\" mass_center_x=\"" << comp_set(c).mass_center().col() + << "\" mass_center_y=\"" << comp_set(c).mass_center().row() + << "\" card=\"" << comp_set(c).card() + << "\" tag=\"" << comp_set(c).tag() + << "\" type=\"" << comp_set(c).type() + << "\" pmin_x=\"" << comp_set(c).bbox().pmin().col() + << "\" pmin_y=\"" << comp_set(c).bbox().pmin().row() + << "\" pmax_x=\"" << comp_set(c).bbox().pmax().col() + << "\" pmax_y=\"" << comp_set(c).bbox().pmax().row() + << "\"/>" << std::endl; + } + + + // Save labeled image + { + const L& lbl = comp_set.labeled_image(); + output << "<labeled_image " + << " height=\"" << lbl.domain().height() + << "\" width=\"" << lbl.domain().width() << "\">" + << "<![CDATA["; + + util::array<unsigned char> lbl64; + convert::to_base64(lbl, lbl64); + output.write((const char *)lbl64.std_vector().data(), + lbl64.nelements()); + + output << "]]></labeled_image>" << std::endl; + } + + // Save separators image + { + const mln_ch_value(L,bool)& seps = comp_set.separators(); + output << "<separators_image " + << " height=\"" << seps.domain().height() + << "\" width=\"" << seps.domain().width() << "\">" + << "<![CDATA["; + + util::array<unsigned char> seps64; + convert::to_base64(seps, seps64); + output.write((const char *)seps64.std_vector().data(), + seps64.nelements()); + + output << "]]></separators_image>" << std::endl; + } + + output << "</component_set>" << std::endl; + } + + + /// Component_info + // + inline + void + full_xml_visitor::visit(const component_info& info) const + { + switch (info.type()) + { + case component::WhitespaceSeparator: + { + output << " <whitespace_separator_region id=\"wss" + << info.id() + << "\">" << std::endl; + + internal::print_box_coords(output, info.bbox(), " "); + + output << " </whitespace_separator_region>" << std::endl; + break; + } + + case component::LineSeparator: + { + output << " <separator_region id=\"sr" << info.id() + 
<< "\" sep_orientation=\"0.000000\" " + << " sep_colour=\"Black\" " + << " sep_bgcolour=\"White\">" << std::endl; + + internal::print_box_coords(output, info.bbox(), " "); + + output << " </separator_region>" << std::endl; + break; + } + + + default: + case component::Image: + { + output << " <image_region id=\"ir" << info.id() + << "\" img_colour_type=\"24_Bit_Colour\"" + << " img_orientation=\"0.000000\" " + << " img_emb_text=\"No\" " + << " img_bgcolour=\"White\">" << std::endl; + + internal::print_box_coords(output, info.bbox(), " "); + + output << " </image_region>" << std::endl; + break; + } + } + } + + /// Paragraph Set + // + template <typename L> + void + full_xml_visitor::visit(const paragraph_set<L>& parset) const + { + const line_set<L>& lines = parset.lines(); + + for_all_paragraphs(p, parset) + { + const mln::util::array<line_id_t>& line_ids = parset(p).line_ids(); + + // FIXME: compute that information on the whole paragraph + // and use them here. + line_id_t fid = line_ids(0); + output << " <text_region id=\"" << p + << "\" txt_orientation=\"" << lines(fid).orientation() + << "\" txt_reading_orientation=\"" << lines(fid).reading_orientation() + << "\" txt_reading_direction=\"" << lines(fid).reading_direction() + << "\" txt_text_type=\"" << lines(fid).type() + << "\" txt_reverse_video=\"" << (lines(fid).reverse_video() ? "true" : "false") + << "\" txt_indented=\"" << (lines(fid).indented() ? 
"true" : "false") + << "\" kerning=\"" << lines(fid).char_space(); + + // EXTENSIONS - Not officially supported + output << "\" baseline=\"" << lines(fid).baseline() + << "\" meanline=\"" << lines(fid).meanline() + << "\" x_height=\"" << lines(fid).x_height() + << "\" d_height=\"" << lines(fid).d_height() + << "\" a_height=\"" << lines(fid).a_height() + << "\" char_width=\"" << lines(fid).char_width(); + // End of EXTENSIONS + output << "\">" + << std::endl; + + internal::print_box_coords(output, parset(p).bbox(), " "); + + + // EXTENSIONS - Not officially supported + for_all_paragraph_lines(lid, line_ids) + { + line_id_t l = line_ids(lid); + + lines(l).accept(*this); + } + + output << " </text_region>" << std::endl; + } + } + + + template <typename L> + void + full_xml_visitor::visit(const line_info<L>& line) const + { + if (line.has_text()) + { + output << " <line text=\"" << line.html_text() << "\" "; + } + else + output << " <line " << std::endl; + + output << "id=\"" << line.id() + << "\" txt_orientation=\"" << line.orientation() + << "\" txt_reading_orientation=\"" << line.reading_orientation() + << "\" txt_reading_direction=\"" << line.reading_direction() + << "\" txt_text_type=\"" << line.type() + << "\" txt_reverse_video=\"" << (line.reverse_video() ? "true" : "false") + << "\" txt_indented=\"" << (line.indented() ? 
"true" : "false") + << "\" kerning=\"" << line.char_space() + << "\" baseline=\"" << line.baseline() + << "\" meanline=\"" << line.meanline() + << "\" x_height=\"" << line.x_height() + << "\" d_height=\"" << line.d_height() + << "\" a_height=\"" << line.a_height() + << "\" char_width=\"" << line.char_width() + << "\">" << std::endl; + + internal::print_box_coords(output, line.bbox(), " "); + + output << " <compid_list>" << std::endl; + + for_all_line_comps(c, line.components()) + output << " <compid value=\"" + << line.components()(c) << "\" />" << std::endl; + + output << " </compid_list>" << std::endl; + + output << " </line>" << std::endl; + } + +#endif // MLN_INCLUDE_ONLY + + } // end of namespace scribo::io::xml::internal + + } // end of namespace scribo::io::xml + + } // end of namespace scribo::io + +} // end of namespace scribo + +#endif // SCRIBO_IO_XML_INTERNAL_FULL_XML_VISITOR_HH diff --git a/scribo/scribo/io/xml/internal/html_markups_replace.hh b/scribo/scribo/io/xml/internal/html_markups_replace.hh new file mode 100644 index 0000000..76f8107 --- /dev/null +++ b/scribo/scribo/io/xml/internal/html_markups_replace.hh @@ -0,0 +1,97 @@ +// Copyright (C) 2011 EPITA Research and Development Laboratory (LRDE) +// +// This file is part of Olena. +// +// Olena is free software: you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free +// Software Foundation, version 2 of the License. +// +// Olena is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Olena. If not, see <http://www.gnu.org/licenses/>. +// +// As a special exception, you may use this file as part of a free +// software project without restriction. 
// Specifically, if other files
// instantiate templates or use macros or inline functions from this
// file, or you compile this file and link it with other files to produce
// an executable, this file does not by itself cause the resulting
// executable to be covered by the GNU General Public License.  This
// exception does not however invalidate any other reasons why the
// executable file might be covered by the GNU General Public License.

#ifndef SCRIBO_IO_XML_INTERNAL_HTML_MARKUPS_REPLACE_HH
# define SCRIBO_IO_XML_INTERNAL_HTML_MARKUPS_REPLACE_HH

/// \file
///
/// \brief Replace HTML markups characters by their corresponding
/// markups.

# include <map>
# include <string>


namespace scribo
{

  namespace io
  {

    namespace xml
    {

      namespace internal
      {

        /*! \brief Replace HTML markups characters by their corresponding
            markups.

            Each occurrence of '"', '<', '>' and '&' in \p input is
            replaced by its entity; every other character is copied
            unchanged.

            \param[in] input The text to escape.  It is not modified.

            \return An escaped copy of \p input.
        */
        inline
        std::string
        html_markups_replace(const std::string& input);


# ifndef MLN_INCLUDE_ONLY

        /// Build the table mapping each reserved character to the
        /// entity that replaces it.
        static inline std::map<char, std::string> init_map()
        {
          std::map<char, std::string> html_map;
          html_map['\"'] = "&quot;";
          html_map['<'] = "&lt;";
          html_map['>'] = "&gt;";
          html_map['&'] = "&amp;";
          return html_map;
        }


        inline
        std::string
        html_markups_replace(const std::string& input)
        {
          // Built once, on first call, then only read.
          static const std::map<char, std::string> map = init_map();

          // The result is assembled by appending while scanning INPUT.
          // Replacing in place inside a copy makes that copy longer
          // than INPUT, so a loop bounded by input.size() would stop
          // before reaching the last characters and leave them
          // unescaped.
          std::string output;
          output.reserve(input.size());

          for (std::string::size_type i = 0; i < input.size(); ++i)
          {
            std::map<char, std::string>::const_iterator
              it = map.find(input[i]);

            if (it != map.end())
              output += it->second;
            else
              output += input[i];
          }

          return output;
        }

# endif // ! MLN_INCLUDE_ONLY

      } // end of namespace scribo::io::xml::internal

    } // end of namespace scribo::io::xml

  } // end of namespace scribo::io

} // end of namespace scribo

#endif // !
SCRIBO_IO_XML_INTERNAL_HTML_MARKUPS_REPLACE_HH diff --git a/scribo/scribo/io/xml/internal/page_xml_visitor.hh b/scribo/scribo/io/xml/internal/page_xml_visitor.hh new file mode 100644 index 0000000..52d8f12 --- /dev/null +++ b/scribo/scribo/io/xml/internal/page_xml_visitor.hh @@ -0,0 +1,222 @@ +// Copyright (C) 2011 EPITA Research and Development Laboratory (LRDE) +// +// This file is part of Olena. +// +// Olena is free software: you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free +// Software Foundation, version 2 of the License. +// +// Olena is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Olena. If not, see <http://www.gnu.org/licenses/>. +// +// As a special exception, you may use this file as part of a free +// software project without restriction. Specifically, if other files +// instantiate templates or use macros or inline functions from this +// file, or you compile this file and link it with other files to produce +// an executable, this file does not by itself cause the resulting +// executable to be covered by the GNU General Public License. This +// exception does not however invalidate any other reasons why the +// executable file might be covered by the GNU General Public License. + +#ifndef SCRIBO_IO_XML_INTERNAL_PAGE_XML_VISITOR_HH +# define SCRIBO_IO_XML_INTERNAL_PAGE_XML_VISITOR_HH + +/// \file +/// +/// PAGE format XML serializer Visitor. 
+ +# include <fstream> + +# include <scribo/core/internal/doc_xml_serializer.hh> +# include <scribo/convert/to_base64.hh> + +# include <scribo/io/xml/internal/print_box_coords.hh> +# include <scribo/io/xml/internal/print_page_preambule.hh> + + +namespace scribo +{ + + namespace io + { + + namespace xml + { + + namespace internal + { + + /*! \brief Save document information as XML. + + We use a XML Schema part of the PAGE (Page Analysis and Ground + truth Elements) image representation framework. + + This schema was used in the Page Segmentation COMPetition + (PSCOMP) for ICDAR 2009. + + Its XSD file is located here: + http://schema.primaresearch.org/PAGE/gts/pagecontent/2009-03-16/pagecontent.... + + */ + class page_xml_visitor : public doc_xml_serializer<page_xml_visitor> + { + public: + // Constructor + page_xml_visitor(std::ofstream& out); + + // Visit overloads + template <typename L> + void visit(const document<L>& doc) const; + + template <typename L> + void visit(const component_set<L>& comp_set) const; + + void visit(const component_info& info) const; + + template <typename L> + void visit(const paragraph_set<L>& parset) const; + + private: // Attributes + std::ofstream& output; + }; + + + +# ifndef MLN_INCLUDE_ONLY + + + inline + page_xml_visitor::page_xml_visitor(std::ofstream& out) + : output(out) + { + } + + + + /// Document + // + template <typename L> + void + page_xml_visitor::visit(const document<L>& doc) const + { + // Preambule + print_PAGE_preambule(output, doc, true); + + // Text + if (doc.has_text()) + doc.paragraphs().accept(*this); + + // Page elements (Pictures, ...) 
+ if (doc.has_elements()) + doc.elements().accept(*this); + + // line seraparators + if (doc.has_line_seps()) + doc.line_seps_comps().accept(*this); + + output << " </page>" << std::endl; + output << "</pcGts>" << std::endl; + } + + + /// Component Set + // + template <typename L> + void + page_xml_visitor::visit(const component_set<L>& comp_set) const + { + for_all_comps(c, comp_set) + if (comp_set(c).is_valid()) + comp_set(c).accept(*this); + } + + + /// Component_info + // + inline + void + page_xml_visitor::visit(const component_info& info) const + { + switch (info.type()) + { + case component::LineSeparator: + { + output << " <separator_region id=\"sr" << info.id() + << "\" sep_orientation=\"0.000000\" " + << " sep_colour=\"Black\" " + << " sep_bgcolour=\"White\">" << std::endl; + + internal::print_box_coords(output, info.bbox(), " "); + + output << " </separator_region>" << std::endl; + break; + } + + + default: + case component::Image: + { + output << " <image_region id=\"ir" << info.id() + << "\" img_colour_type=\"24_Bit_Colour\"" + << " img_orientation=\"0.000000\" " + << " img_emb_text=\"No\" " + << " img_bgcolour=\"White\">" << std::endl; + + internal::print_box_coords(output, info.bbox(), " "); + + output << " </image_region>" << std::endl; + break; + } + } + } + + + /// Paragraph Set + // + template <typename L> + void + page_xml_visitor::visit(const paragraph_set<L>& parset) const + { + const line_set<L>& lines = parset.lines(); + + for_all_paragraphs(p, parset) + { + const mln::util::array<line_id_t>& line_ids = parset(p).line_ids(); + + // FIXME: compute that information on the whole paragraph + // and use them here. 
+ line_id_t fid = line_ids(0); + output << " <text_region id=\"" << p + << "\" txt_orientation=\"" << lines(fid).orientation() + << "\" txt_reading_orientation=\"" << lines(fid).reading_orientation() + << "\" txt_reading_direction=\"" << lines(fid).reading_direction() + << "\" txt_text_type=\"" << lines(fid).type() + << "\" txt_reverse_video=\"" << (lines(fid).reverse_video() ? "true" : "false") + << "\" txt_indented=\"" << (lines(fid).indented() ? "true" : "false") + << "\" kerning=\"" << lines(fid).char_space() + << "\">" + << std::endl; + + internal::print_box_coords(output, parset(p).bbox(), " "); + + output << " </text_region>" << std::endl; + } + } + + +#endif // MLN_INCLUDE_ONLY + + } // end of namespace scribo::io::xml::internal + + } // end of namespace scribo::io::xml + + } // end of namespace scribo::io + +} // end of namespace scribo + +#endif // SCRIBO_IO_XML_INTERNAL_PAGE_XML_VISITOR_HH diff --git a/scribo/scribo/io/xml/internal/print_box_coords.hh b/scribo/scribo/io/xml/internal/print_box_coords.hh new file mode 100644 index 0000000..d3aeedf --- /dev/null +++ b/scribo/scribo/io/xml/internal/print_box_coords.hh @@ -0,0 +1,92 @@ +// Copyright (C) 2011 EPITA Research and Development Laboratory (LRDE) +// +// This file is part of Olena. +// +// Olena is free software: you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free +// Software Foundation, version 2 of the License. +// +// Olena is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Olena. If not, see <http://www.gnu.org/licenses/>. +// +// As a special exception, you may use this file as part of a free +// software project without restriction. 
Specifically, if other files +// instantiate templates or use macros or inline functions from this +// file, or you compile this file and link it with other files to produce +// an executable, this file does not by itself cause the resulting +// executable to be covered by the GNU General Public License. This +// exception does not however invalidate any other reasons why the +// executable file might be covered by the GNU General Public License. + +#ifndef SCRIBO_IO_XML_INTERNAL_PRINT_BOX_COORDS_HH +# define SCRIBO_IO_XML_INTERNAL_PRINT_BOX_COORDS_HH + +/// \file +/// +/// \brief Prints box2d coordinates to XML data. + +# include <mln/core/alias/box2d.hh> + +namespace scribo +{ + + namespace io + { + + namespace xml + { + + namespace internal + { + + /*! \brief Prints box2d coordinates to XML data. + */ + void + print_box_coords(std::ofstream& ostr, const box2d& b, + const char *space); + + +# ifndef MLN_INCLUDE_ONLY + + + inline + void + print_box_coords(std::ofstream& ostr, const box2d& b, + const char *space) + { + std::string sc = space; + std::string sp = sc + " "; + ostr << sc << "<coords>" << std::endl + << sp << "<point x=\"" << b.pmin().col() + << "\" y=\"" << b.pmin().row() << "\"/>" + << std::endl + << sp << "<point x=\"" << b.pmax().col() + << "\" y=\"" << b.pmin().row() << "\"/>" + << std::endl + << sp << "<point x=\"" << b.pmax().col() + << "\" y=\"" << b.pmax().row() << "\"/>" + << std::endl + << sp << "<point x=\"" << b.pmin().col() + << "\" y=\"" << b.pmax().row() << "\"/>" + << std::endl + << sc << "</coords>" << std::endl; + + } + + +# endif // ! MLN_INCLUDE_ONLY + + } // end of namespace scribo::io::xml::internal + + } // end of namespace scribo::io::xml + + } // end of namespace scribo::io + +} // end of namespace scribo + +#endif // ! 
SCRIBO_IO_XML_INTERNAL_PRINT_BOX_COORDS_HH diff --git a/scribo/scribo/io/xml/internal/print_page_preambule.hh b/scribo/scribo/io/xml/internal/print_page_preambule.hh new file mode 100644 index 0000000..b5ae891 --- /dev/null +++ b/scribo/scribo/io/xml/internal/print_page_preambule.hh @@ -0,0 +1,95 @@ +// Copyright (C) 2011 EPITA Research and Development Laboratory (LRDE) +// +// This file is part of Olena. +// +// Olena is free software: you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free +// Software Foundation, version 2 of the License. +// +// Olena is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Olena. If not, see <http://www.gnu.org/licenses/>. +// +// As a special exception, you may use this file as part of a free +// software project without restriction. Specifically, if other files +// instantiate templates or use macros or inline functions from this +// file, or you compile this file and link it with other files to produce +// an executable, this file does not by itself cause the resulting +// executable to be covered by the GNU General Public License. This +// exception does not however invalidate any other reasons why the +// executable file might be covered by the GNU General Public License. + +#ifndef SCRIBO_IO_XML_INTERNAL_PRINT_PAGE_PREAMBULE_HH +# define SCRIBO_IO_XML_INTERNAL_PRINT_PAGE_PREAMBULE_HH + +/// \file +/// +/// \brief Print PAGE XML format preambule. + +# include <mln/core/alias/box2d.hh> + +namespace scribo +{ + + namespace io + { + + namespace xml + { + + namespace internal + { + + /// \brief Print PAGE XML format preambule. 
+ template <typename L> + void print_PAGE_preambule(std::ofstream& output, + const document<L>& doc, + bool with_validation); + + +# ifndef MLN_INCLUDE_ONLY + + template <typename L> + void print_PAGE_preambule(std::ofstream& output, + const document<L>& doc, + bool with_validation) + { + output << "<?xml version=\"1.0\"?>" << std::endl; + + if (with_validation) + output << "<pcGts xmlns=\"http://schema.primaresearch.org/PAGE/gts/pagecontent/2009-03-16\" " + << "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" " + << "xsi:schemaLocation=\"http://schema.primaresearch.org/PAGE/gts/pagecontent/2009-03-16 " + << "http://schema.primaresearch.org/PAGE/gts/pagecontent/2009-03-16/pagecontent.xsd\" " + << "pcGtsId=\"" << doc.filename() << "\">" << std::endl; + else + output << "<pcGts>" << std::endl; + + output << " <pcMetadata>" << std::endl; + output << " <pcCreator>LRDE</pcCreator>" << std::endl; + output << " <pcCreated/>" << std::endl; + output << " <pcLastChange/>" << std::endl; + output << " <pcComments>Generated by Scribo from Olena.</pcComments>" << std::endl; + output << " </pcMetadata>" << std::endl; + + output << " <page image_filename=\"" << doc.filename() + << "\" image_width=\"" << doc.width() + << "\" image_height=\"" << doc.height() + << "\">" << std::endl; + } + +# endif // ! MLN_INCLUDE_ONLY + + } // end of namespace scribo::io::xml::internal + + } // end of namespace scribo::io::xml + + } // end of namespace scribo::io + +} // end of namespace scribo + +#endif // ! 
SCRIBO_IO_XML_INTERNAL_PRINT_PAGE_PREAMBULE_HH diff --git a/scribo/scribo/io/xml/save.hh b/scribo/scribo/io/xml/save.hh index 7011e87..30579d0 100644 --- a/scribo/scribo/io/xml/save.hh +++ b/scribo/scribo/io/xml/save.hh @@ -38,7 +38,11 @@ # include <map> # include <scribo/core/document.hh> -# include <scribo/core/line_set.hh> + +# include <scribo/io/xml/internal/full_xml_visitor.hh> +# include <scribo/io/xml/internal/extended_page_xml_visitor.hh> +# include <scribo/io/xml/internal/page_xml_visitor.hh> + namespace scribo { @@ -49,360 +53,104 @@ namespace scribo namespace xml { - /*! \brief Save document information as XML. + /*! \brief Supported XML formats + + Page : PRima PAGE format. Used in ICDAR 2009. - We use a XML Schema part of the PAGE (Page Analysis and Ground - truth Elements) image representation framework. + PageExtended : Enriched PRima PAGE format with scribo data. + + Full : Enriched PRima PAGE format with scribo data. This + format can be reloaded in Scribo. + */ + enum Format + { + Page, + PageExtended, + Full + //Hocr + }; - This schema was used in the Page Segmentation COMPetition - (PSCOMP) for ICDAR 2009. - Its XSD file is located here: - http://schema.primaresearch.org/PAGE/gts/pagecontent/2009-03-16/pagecontent.... + /*! \brief Save document information as XML. 
*/ template <typename L> void - save(const document<L>& doc, - const std::string& output_name, - bool allow_extensions); + save(const document<L>& doc, const std::string& output_name, + Format format); # ifndef MLN_INCLUDE_ONLY + namespace internal { - inline - std::string& - html_markups_replace(std::string& input, - std::map<char, std::string>& map) - { - for (unsigned i = 0; i < input.size(); ++i) - { - std::map<char, std::string>::iterator it = map.find(input.at(i)); - if (it != map.end()) - { - input.replace(i, 1, it->second); - i += it->second.size() - 1; - } - } - return input; - } - - - inline - void print_box_coords(std::ofstream& ostr, const box2d& b, - const char *space) + template <typename L> + void save_page(const document<L>& doc, std::ofstream& output) { - std::string sc = space; - std::string sp = sc + " "; - ostr << sc << "<coords>" << std::endl - << sp << "<point x=\"" << b.pmin().col() - << "\" y=\"" << b.pmin().row() << "\"/>" - << std::endl - << sp << "<point x=\"" << b.pmax().col() - << "\" y=\"" << b.pmin().row() << "\"/>" - << std::endl - << sp << "<point x=\"" << b.pmax().col() - << "\" y=\"" << b.pmax().row() << "\"/>" - << std::endl - << sp << "<point x=\"" << b.pmin().col() - << "\" y=\"" << b.pmax().row() << "\"/>" - << std::endl - << sc << "</coords>" << std::endl; - + scribo::io::xml::internal::page_xml_visitor f(output); + doc.accept(f); } - - template <typename L> - void - save(const document<L>& doc, - const std::string& output_name) + void save_page_extended(const document<L>& doc, std::ofstream& output) { - trace::entering("scribo::io::xml:save_text_lines"); - - std::ofstream file(output_name.c_str()); - if (! 
file) - { - std::cerr << "error: cannot open file '" << doc.filename() << "'!"; - abort(); - } - - std::map<char, std::string> html_map; - html_map['\"'] = """; - html_map['<'] = "<"; - html_map['>'] = ">"; - html_map['&'] = "&"; - - file << "<?xml version=\"1.0\"?>" << std::endl; - file << "<pcGts xmlns=\"http://schema.primaresearch.org/PAGE/gts/pagecontent/2009-03-16\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:schemaLocation=\"http://schema.primaresearch.org/PAGE/gts/pagecontent/2009-03-16 http://schema.primaresearch.org/PAGE/gts/pagecontent/2009-03-16/pagecontent.xsd\" pcGtsId=\"" << doc.filename() << "\">" << std::endl; - - file << " <pcMetadata>" << std::endl; - file << " <pcCreator>LRDE</pcCreator>" << std::endl; - file << " <pcCreated/>" << std::endl; - file << " <pcLastChange/>" << std::endl; - file << " <pcComments>Generated by Scribo from Olena.</pcComments>" << std::endl; - file << " </pcMetadata>" << std::endl; - - file << " <page image_filename=\"" << doc.filename() - << "\" image_width=\"" << doc.width() - << "\" image_height=\"" << doc.height() - << "\">" << std::endl; - - // Text - if (doc.has_text()) - { - const line_set<L>& lines = doc.lines(); - const paragraph_set<L>& parset = doc.paragraphs(); - - for_all_paragraphs(p, parset) - { - const mln::util::array<line_id_t>& line_ids = parset(p).line_ids(); - - // FIXME: compute that information on the whole paragraph - // and use them here. - line_id_t fid = line_ids(0); - file << " <text_region id=\"" << p - << "\" txt_orientation=\"" << lines(fid).orientation() - << "\" txt_reading_orientation=\"" << lines(fid).reading_orientation() - << "\" txt_reading_direction=\"" << lines(fid).reading_direction() - << "\" txt_text_type=\"" << lines(fid).type() - << "\" txt_reverse_video=\"" << (lines(fid).reverse_video() ? "true" : "false") - << "\" txt_indented=\"" << (lines(fid).indented() ? 
"true" : "false") - << "\" kerning=\"" << lines(fid).char_space() - << "\">" - << std::endl; - - internal::print_box_coords(file, parset(p).bbox(), " "); - - file << " </text_region>" << std::endl; - } - } - - // Page elements (Pictures, ...) - if (doc.has_elements()) - { - const component_set<L>& elts = doc.elements(); - for_all_comps(e, elts) - if (elts(e).is_valid()) - { - file << " <image_region id=\"ir" << elts(e).id() - << "\" img_colour_type=\"24_Bit_Colour\"" - << " img_orientation=\"0.000000\" " - << " img_emb_text=\"No\" " - << " img_bgcolour=\"White\">" << std::endl; - - internal::print_box_coords(file, elts(e).bbox(), " "); - - file << " </image_region>" << std::endl; - } - } - - - file << " </page>" << std::endl; - file << "</pcGts>" << std::endl; - - trace::exiting("scribo::io::xml::save_text_lines"); + scribo::io::xml::internal::extended_page_xml_visitor f(output); + doc.accept(f); } - - - template <typename L> - void - save_extended(const document<L>& doc, - const std::string& output_name) + void save_full(const document<L>& doc, std::ofstream& output) { - trace::entering("scribo::io::xml:save_text_lines"); - - std::ofstream file(output_name.c_str()); - if (! 
file) - { - std::cerr << "error: cannot open file '" << doc.filename() << "'!"; - abort(); - } - - std::map<char, std::string> html_map; - html_map['\"'] = """; - html_map['<'] = "<"; - html_map['>'] = ">"; - html_map['&'] = "&"; - - file << "<?xml version=\"1.0\"?>" << std::endl; - file << "<pcGts>" << std::endl; - - file << " <pcMetadata>" << std::endl; - file << " <pcCreator>LRDE</pcCreator>" << std::endl; - file << " <pcCreated/>" << std::endl; - file << " <pcLastChange/>" << std::endl; - file << " <pcComments>Generated by Scribo from Olena.</pcComments>" << std::endl; - file << " </pcMetadata>" << std::endl; - - file << " <page image_filename=\"" << doc.filename() - << "\" image_width=\"" << doc.width() - << "\" image_height=\"" << doc.height() - << "\">" << std::endl; - - // Text - if (doc.has_text()) - { - const line_set<L>& lines = doc.lines(); - const paragraph_set<L>& parset = doc.paragraphs(); - - for_all_paragraphs(p, parset) - { - const mln::util::array<line_id_t>& line_ids = parset(p).line_ids(); - - // FIXME: compute that information on the whole paragraph - // and use them here. - line_id_t fid = line_ids(0); - file << " <text_region id=\"" << p - << "\" txt_orientation=\"" << lines(fid).orientation() - << "\" txt_reading_orientation=\"" << lines(fid).reading_orientation() - << "\" txt_reading_direction=\"" << lines(fid).reading_direction() - << "\" txt_text_type=\"" << lines(fid).type() - << "\" txt_reverse_video=\"" << (lines(fid).reverse_video() ? "true" : "false") - << "\" txt_indented=\"" << (lines(fid).indented() ? 
"true" : "false") - << "\" kerning=\"" << lines(fid).char_space(); - - // EXTENSIONS - Not officially supported - file << "\" baseline=\"" << lines(fid).baseline() - << "\" meanline=\"" << lines(fid).meanline() - << "\" x_height=\"" << lines(fid).x_height() - << "\" d_height=\"" << lines(fid).d_height() - << "\" a_height=\"" << lines(fid).a_height() - << "\" char_width=\"" << lines(fid).char_width(); - // End of EXTENSIONS - file << "\">" - << std::endl; - - internal::print_box_coords(file, parset(p).bbox(), " "); - - - // EXTENSIONS - Not officially supported - for_all_paragraph_lines(lid, line_ids) - { - line_id_t l = line_ids(lid); - - if (lines(l).has_text()) - { - std::string tmp = lines(l).text(); - tmp = internal::html_markups_replace(tmp, html_map); - - file << " <line text=\"" << tmp << "\" "; - } - else - file << " <line " << std::endl; - - file << "id=\"" << lines(l).id() - << "\" txt_orientation=\"" << lines(l).orientation() - << "\" txt_reading_orientation=\"" << lines(l).reading_orientation() - << "\" txt_reading_direction=\"" << lines(l).reading_direction() - << "\" txt_text_type=\"" << lines(l).type() - << "\" txt_reverse_video=\"" << (lines(l).reverse_video() ? "true" : "false") - << "\" txt_indented=\"" << (lines(l).indented() ? "true" : "false") - << "\" kerning=\"" << lines(l).char_space() - << "\" baseline=\"" << lines(l).baseline() - << "\" meanline=\"" << lines(l).meanline() - << "\" x_height=\"" << lines(l).x_height() - << "\" d_height=\"" << lines(l).d_height() - << "\" a_height=\"" << lines(l).a_height() - << "\" char_width=\"" << lines(l).char_width() - << "\">" << std::endl; - - internal::print_box_coords(file, lines(l).bbox(), " "); - - file << " </line>" << std::endl; - } - - file << " </text_region>" << std::endl; - } - } - // End of EXTENSIONS - - // Page elements (Pictures, ...) 
- if (doc.has_elements()) - { - const component_set<L>& elts = doc.elements(); - for_all_comps(e, elts) - if (elts(e).is_valid()) - { - switch (elts(e).type()) - { - case component::Separator: - { - file << " <separator_region id=\"sr" << elts(e).id() - << "\" sep_orientation=\"0.000000\" " - << " sep_colour=\"Black\" " - << " sep_bgcolour=\"White\">" << std::endl; - - internal::print_box_coords(file, elts(e).bbox(), " "); - - file << " </separator_region>" << std::endl; - break; - break; - } - - default: - case component::Image: - { - file << " <image_region id=\"ir" << elts(e).id() - << "\" img_colour_type=\"24_Bit_Colour\"" - << " img_orientation=\"0.000000\" " - << " img_emb_text=\"No\" " - << " img_bgcolour=\"White\">" << std::endl; - - internal::print_box_coords(file, elts(e).bbox(), " "); - - file << " </image_region>" << std::endl; - break; - } - } - } - } - - - // Whitespace seraparators - if (doc.has_whitespace_seps()) - { - const component_set<L>& - whitespace_seps_comps = doc.whitespace_seps_comps(); - - for_all_comps(c, whitespace_seps_comps) - { - file << " <whitespace_separator_region id=\"wss" - << whitespace_seps_comps(c).id() - << "\">" << std::endl; - - internal::print_box_coords(file, whitespace_seps_comps(c).bbox(), " "); - - file << " </whitespace_separator_region>" << std::endl; - } - } - - file << " </page>" << std::endl; - file << "</pcGts>" << std::endl; - - trace::exiting("scribo::io::xml::save_text_lines"); + scribo::io::xml::internal::full_xml_visitor f(output); + doc.accept(f); } } // end of namespace scribo::io::xml::internal + // FACADE template <typename L> void save(const document<L>& doc, const std::string& output_name, - bool allow_extensions) + Format format) { - if (allow_extensions) - internal::save_extended(doc, output_name); - else - internal::save(doc, output_name); + trace::entering("scribo::io::xml::save"); + + // Open file + std::ofstream output(output_name.c_str()); + if (! 
output) + { + std::cerr << "scribo::io::xml::save - ERROR: cannot open file '" + << output_name << "'!" << std::endl; + return; + } + + // Choose saving method. + switch (format) + { + case Page: + internal::save_page(doc, output); + break; + + case PageExtended: + internal::save_page_extended(doc, output); + break; + + case Full: + internal::save_full(doc, output); + break; + + default: + trace::warning("scribo::io::xml::save - Invalid XML format! Skip saving..."); + } + + output.close(); + trace::exiting("scribo::io::xml::save"); } diff --git a/scribo/scribo/toolchain/internal/content_in_doc_functor.hh b/scribo/scribo/toolchain/internal/content_in_doc_functor.hh index 48098ba..dcbb4f7 100644 --- a/scribo/scribo/toolchain/internal/content_in_doc_functor.hh +++ b/scribo/scribo/toolchain/internal/content_in_doc_functor.hh @@ -36,7 +36,6 @@ # include <scribo/primitive/extract/non_text.hh> # include <scribo/primitive/extract/components.hh> -//# include <scribo/primitive/extract/vertical_separators.hh> # include <scribo/primitive/extract/separators.hh> # include <scribo/primitive/extract/separators_nonvisible.hh> @@ -114,7 +113,7 @@ namespace scribo bool enable_whitespace_seps; bool enable_debug; bool save_doc_as_xml; - bool allow_xml_extensions; + scribo::io::xml::Format xml_format; //============ // Parameters @@ -139,7 +138,7 @@ namespace scribo enable_whitespace_seps(true), enable_debug(false), save_doc_as_xml(false), - allow_xml_extensions(true), + xml_format(scribo::io::xml::PageExtended), ocr_language("eng"), output_file("/tmp/foo.xml"), doc(doc_filename) @@ -189,7 +188,7 @@ namespace scribo // Whitespace separators on_new_progress_label("Find whitespace separators..."); - whitespaces = primitive::extract::separators_nonvisible(processed_image); + whitespaces = primitive::extract::separators_nonvisible(input_cleaned); on_progress(); } @@ -483,7 +482,7 @@ namespace scribo { on_new_progress_label("Saving results"); - scribo::io::xml::save(doc, output_file, 
allow_xml_extensions); + scribo::io::xml::save(doc, output_file, xml_format); on_xml_saved(); on_progress(); diff --git a/scribo/src/content_in_doc.cc b/scribo/src/content_in_doc.cc index 9748b28..d8d4e52 100644 --- a/scribo/src/content_in_doc.cc +++ b/scribo/src/content_in_doc.cc @@ -172,7 +172,9 @@ int main(int argc, char* argv[]) debug); // Saving results - scribo::io::xml::save(doc, argv[2], true); + scribo::io::xml::save(doc, argv[2], scribo::io::xml::PageExtended); + scribo::io::xml::save(doc, "page.xml", scribo::io::xml::Page); + scribo::io::xml::save(doc, "full.xml", scribo::io::xml::Full); trace::exiting("main"); } -- 1.5.6.5