olena-2.0-229-g115049c src/contest/bsec2013/process_toc_page.cc: New.

--- scribo/ChangeLog | 4 + scribo/src/contest/bsec2013/process_toc_page.cc | 190 +++++++++++++++++++++++ 2 files changed, 194 insertions(+), 0 deletions(-) create mode 100644 scribo/src/contest/bsec2013/process_toc_page.cc diff --git a/scribo/ChangeLog b/scribo/ChangeLog index c4bdcfc..e44574a 100644 --- a/scribo/ChangeLog +++ b/scribo/ChangeLog @@ -1,3 +1,7 @@ +2013-03-01 Guillaume Lazzara <z@lrde.epita.fr> + + * src/contest/bsec2013/process_toc_page.cc: New. + 2013-02-26 Guillaume Lazzara <z@lrde.epita.fr> * tests/unit_test/unit-tests.mk: Update. diff --git a/scribo/src/contest/bsec2013/process_toc_page.cc b/scribo/src/contest/bsec2013/process_toc_page.cc new file mode 100644 index 0000000..f261e2c --- /dev/null +++ b/scribo/src/contest/bsec2013/process_toc_page.cc @@ -0,0 +1,190 @@ +#include <iostream> +#include <mln/core/image/image2d.hh> +#include <mln/value/rgba.hh> +#include <mln/value/rgb8.hh> +#include <mln/value/qt/rgb32.hh> +#include <mln/io/ppm/save.hh> +#include <mln/io/pdf/load.hh> +#include <mln/io/pdf/get_header.hh> +#include <mln/data/convert.hh> +#include <poppler/cpp/poppler-document.h> +#include <poppler/cpp/poppler-page.h> +#include <poppler/cpp/poppler-page-renderer.h> +#include <mln/debug/filename.hh> +#include <QString> +#include <QStringList> +#include <QDebug> + +int main(int argc, char *argv[]) +{ + if (argc != 3) + { + std::cout << "Usage: " << argv[0] << " <doc.pdf> <page>" << std::endl; + return 1; + } + + using namespace poppler; + using namespace mln; + + // Load document + poppler::document *pdf = poppler::document::load_from_file(argv[1]); + if (pdf == 0) + { + std::cerr << "Error: Cannot load PDF " << argv[1] << std::endl; + abort(); + } + + + + // Read page + poppler::page* p = pdf->create_page(atoi(argv[2])); + std::vector<char> byte_array = p->text().to_utf8(); + if (byte_array.size() > 0) + { + QString text = QString::fromUtf8((const char *)&byte_array[0]); + + // Remove '.' and '-' (usually used in table of contents between + // title and page number. + text = text.replace('.', ' '); + text = text.replace('-', ' '); + text = text.replace('*', ' '); + + QStringList entries = text.split('\n'); + + qDebug() << entries; + + qDebug(); + + + // PASS 1 - Remove useless spaces. + for (int i = 0; i < entries.size(); ++i) + entries[i] = entries[i].simplified(); + + + // PASS 2 - Merge lines. + for (int i = 0; i < entries.size();) + { + bool removed = false; + QString& entry = entries[i]; + + // Skip Table of Contents title. + if (entry.contains("table of contents", Qt::CaseInsensitive)) + { + ++i; + continue; + } + + // Remove empty lines + if (entry.isEmpty() || entry == "") // FIXME: There is a bug! + // Some empty strings seems + // to pass this test! + // (100_book_subset/pdfs/aidstothebible01popeuoft.pdf + // page10) + { + entries.removeAt(i); + continue; + } + + // Move single numbers to previous lines. + qDebug() << "Checking " << entry; + if (entry.contains(QRegExp("^[0-9mcxilv]+$", Qt::CaseInsensitive))) + { + if ((i - 1) > 0) + { + //qDebug() << "MERGING BACK " << entry << " TO " << entries[i - 1]; + + entries[i - 1].append(" " + entry); + entries.removeAt(i); + removed = true; + } + } + // Trying to merge lines corresponding to entries on several + // lines. + else if (! entry.contains(QRegExp(" [0-9mcxilv]+$", Qt::CaseInsensitive))) + if ((i + 1) < entries.size()) + { + //qDebug() << "MERGING FORWARD " << entries[i + 1] << " TO " << entry; + entry.append(" " + entries[i + 1].simplified()); + entries.removeAt(i + 1); + removed = true; + //entries[i + 1].clear(); + } + + if (!removed) + ++i; + } + + // PASS 3 - Grouping page numbers if there is space in between. + // Remove multiple invalid page numbers like "14 VI". + for (int i = 0; i < entries.size(); ++i) + { + QString& entry = entries[i]; + + // Garbage page numbers. Keep the first one. + if (entry.contains(QRegExp(" [0-9]+ [mcxilv]+$", Qt::CaseInsensitive)) + || entry.contains(QRegExp(" [mcxilv]+ [0-9]+$", Qt::CaseInsensitive))) + { + QStringList words = entry.split(' '); + QString new_entry = ""; + for (int j = 0; j < words.size() - 2; ++j) + new_entry += words[j] + " "; + new_entry += words[words.size() - 1]; + entry = new_entry; + } + else if (entry.contains(QRegExp(" [mcxilv]+ [mcxilv]+$", Qt::CaseInsensitive))) + { + QStringList words = entry.split(' '); + QString new_entry = ""; + for (int j = 0; j < words.size() - 2; ++j) + new_entry += words[j] + " "; + new_entry += words[words.size() - 2] + words[words.size() - 1]; + entry = new_entry; + } + else if (entry.contains(QRegExp(" [0-9]+ [0-9]+$", Qt::CaseInsensitive))) + { // Same case as previous one except that we check if the page number exists! + QStringList words = entry.split(' '); + QString new_entry = ""; + for (int j = 0; j < words.size() - 2; ++j) + new_entry += words[j] + " "; + + int page = QString(words[words.size() - 2] + words[words.size() - 1]).toInt(); + + if (page > pdf->pages()) + new_entry += words[words.size() - 2]; // Keep first page number + else + new_entry += words[words.size() - 2] + words[words.size() - 1]; // Merging both. + entry = new_entry; + } + + } + + qDebug() << entries; + } + + + // io::pdf::pdf_header header = + // io::pdf::get_header(argv[1]); + + // std::cout << header.page_count << std::endl; + // util::array<image2d<value::rgb8> > arr; + // io::pdf::load(arr, argv[1], 0, 10); + + // for (unsigned i = 0; i < arr.size(); ++i) + // io::ppm::save(arr[i], mln::debug::filename("page.ppm")); + + // util::array<int> pages; + // pages.append(1); + // pages.append(9); + + // arr.clear(); + // io::pdf::load(arr, argv[1], pages); + + // for (unsigned i = 0; i < arr.size(); ++i) + // io::ppm::save(arr[i], mln::debug::filename("pages.ppm")); + + + //pima.save("page.png", "PNG", 300); + + + +} -- 1.7.2.5
participants (1)
-
Guillaume Lazzara