C++: PdfPageCount.cpp

Está meio ruim de ler aqui na página por causa do tamanho das letras e do tamanho das linhas. Sugiro copiar e colar para um editor com sintaxe colorida. Obs.: eu testei com vários arquivos PDF, mas é certo que há erros à espreita. Considere-se avisado.

Versão usada nos testes de 2010-10-27:
BitBucket – PdfPageCount.cpp v2010-10-27

/*
Este programa mostra na tela o número de páginas de um
ou mais arquivos PDF passados na linha de comando.
Este código fonte é de domínio público (PUBLIC DOMAIN)
Desenvolvido em C++0x (que já é C++1x) com o Visual C++ 2010 Express.
Pode conter erros no uso dos novos recursos da linguagem
(ou outros erros quaisquer, hehehehe).
Se conseguirem compilar em outro compilador, me avisem.
Falta suporte a coisas como stoll, to_string, nullptr e regex_search
no mingw-w64.
Marcus.
*/

// Em modo debug, dá um loop infinito ao destruir o unordered_map PdfXref::objs,
// com certos arquivos pdf.
// Se isso indica um bug na biblioteca ou no meu código, isso eu não sei.
// De qualquer jeito, estou usando mais o modo Release porque a diferença
// de desempenho é enorme.
#define _HAS_ITERATOR_DEBUGGING 0

#include <cctype>
#include <cstddef>
#include <cstring>
#include <algorithm>
#include <fstream>
#include <iostream>
#include <memory>
#include <regex>
#include <stack>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

using namespace std;

// Characters that end a token in addition to white space.
const std::string delimiters("()<>[]{}/%");

// Returns true when `ch` terminates the token being read: it is either one of
// the characters in `delimiters` or a white-space character.
bool is_delimiter(char ch) {
    if(delimiters.find(ch) != std::string::npos) {
        return true;
    }
    // The <cctype> classifiers are only defined for values representable as
    // unsigned char (or EOF). A plain char holding a byte above 0x7F may be
    // negative, so it is converted before the call.
    return std::isspace(static_cast<unsigned char>(ch)) != 0;
}

// Polymorphic base of every parsed value. The virtual destructor lets derived
// objects be destroyed, and identified with dynamic_cast, through a
// PdfObject pointer.
struct PdfObject {
    virtual ~PdfObject() = default;
};

// Numeric value, stored as a double.
struct PdfNumberObj : PdfObject {
    double number;

    PdfNumberObj(double value) : number{value} { }
};

// String value: owns the text handed to its constructor.
struct PdfStrObj : PdfObject {
    std::string str;

    // Sink constructor. An lvalue argument is copied into `value`, an rvalue
    // is moved into it, and `value` is then moved into the member. This one
    // constructor replaces the former const-string& / string&& overload pair.
    PdfStrObj(std::string value)
        : str(std::move(value)) { }

    // No assignment operator is declared on purpose. The previous
    // user-declared move assignment made the implicit copy constructor and
    // copy assignment deleted and suppressed the implicit move constructor.
    // With nothing declared, the compiler generates all of them.
};

// Dictionary value: maps a key string to a shared pointer to the stored object.
struct PdfDictObj : PdfObject {
    std::unordered_map<std::string, std::shared_ptr<PdfObject>> dict;

    // Starts with an empty dictionary.
    PdfDictObj() : dict() { }
};

// Array value: an ordered list of shared pointers to the element objects.
struct PdfArrayObj : PdfObject {
    std::vector<std::shared_ptr<PdfObject>> array;

    // Starts with an empty array.
    PdfArrayObj() : array() { }
};

// Identifies an indirect object by object number and generation number.
struct PdfReference {
    long long obj_nr;
    long gen_nr;

    PdfReference(long long n, long g)
        : obj_nr(n), gen_nr(g) { }

    // Two references are equal when both numbers match.
    bool operator==(PdfReference r) const {
        if(obj_nr != r.obj_nr) {
            return false;
        }
        return gen_nr == r.gen_nr;
    }
};

// Hash functor for PdfReference keys: XORs the hash of the object number with
// 31 times the hash of the generation number.
struct PdfReferenceHash {
    std::size_t operator()(PdfReference r) const {
        const std::size_t obj_hash = std::hash<long long>()(r.obj_nr);
        const std::size_t gen_hash = std::hash<long>()(r.gen_nr);
        return obj_hash ^ (31 * gen_hash);
    }
};

struct PdfReferenceObj : PdfObject {
    PdfReference reference;
    PdfReferenceObj(PdfReference ref)
        : reference(ref) { }
};

// Lexical token: `t` is the token type and `value` its payload (empty for
// tokens that carry none).
//
// No copy or move operation is declared on purpose (Rule of Zero). The
// previous user-declared move constructor and move assignment made the
// implicit copy constructor and copy assignment deleted, although tokens are
// copied elsewhere in this file (pushed onto a std::stack, copied from its
// top, copied into PdfTokenObj). The compiler-generated operations copy and
// move both strings member-wise.
struct PdfToken {
    std::string t, value;

    PdfToken(const std::string & type, const std::string & value)
        : t(type), value(value) { }
};

struct PdfTokenObj : PdfObject {
    PdfToken token;
    PdfTokenObj(const PdfToken & tok)
        : token(tok) { }
    PdfTokenObj(PdfToken && tok)
        : token(move(tok)) { }
};

// Stream value: shares ownership of its dictionary and holds a copy of the
// byte vector passed to the constructor.
struct PdfStreamObj : PdfObject {
    std::shared_ptr<PdfDictObj> dict;
    std::vector<char> stream;

    PdfStreamObj(const std::shared_ptr<PdfDictObj> & dict, const std::vector<char> & stream)
        : dict{dict}, stream(stream) { }
};

// Consumes bytes from `pdf_file` until one that is not white space (or the
// end of the file) has been read, then seeks one byte backwards so the byte
// that ended the scan is the next one read.
void skip_spaces(std::ifstream & pdf_file) {
    for(;;) {
        const int ch = pdf_file.get();
        if(ch == std::char_traits<char>::eof() || !std::isspace(ch)) {
            break;
        }
    }
    pdf_file.seekg(-1, std::ios_base::cur);
}

// Checks that the token type found (`tok1`) is the one required (`tok2`).
// Throws std::runtime_error naming both when they differ.
void expect(const std::string & tok1, const std::string & tok2) {
    if(tok1 == tok2) {
        return;
    }
    throw std::runtime_error("Encontrou " + tok1 + " quando esperava " + tok2);
}

// Non-digit characters accepted as part of a numeric token: decimal point and signs.
const string number_extra_chars(".+-");

class PdfXref;

class PdfTokenReader {
private:
    stack<PdfToken> token_buffer;
public:
    PdfXref *xref;
    ifstream &pdf_file;
    PdfTokenReader(ifstream & pdf_file):
        xref(nullptr),
        pdf_file(pdf_file) { }
    void unread_token(const PdfToken & token) {
        token_buffer.push(token);
    }
    PdfToken next_token() {
        if(!token_buffer.empty()) {
            PdfToken result = token_buffer.top();
            token_buffer.pop();
            return result;
        }
        int by;
        while(true) {
            by = pdf_file.get();
            if(by == '%')
                skip_comment();
            else if(isspace(by))
                skip_spaces(pdf_file);
            else
                 break;
        }

        auto trata_word = [this]() -> PdfToken {
            auto word = read_word();
            if(word == "obj" || word == "endobj" || word == "stream" || word == "endstream"
                    || word == "true" || word == "false" || word == "null"
                    || word == "xref" || word == "trailer") {
                return PdfToken(word, "");
            } else {
                throw runtime_error(
                    "Erro no PDF no byte " + to_string(pdf_file.tellg())
                    + " lendo " + word);
            }
        };
        if(isdigit(by) || number_extra_chars.find_first_of(by) != string::npos) {
            pdf_file.seekg(-1, ios_base::cur);
            return PdfToken("NUMBER", read_number());
        } else {
            switch(by) {
                case '<':
                    by = pdf_file.get();
                    if(by == '<') {
                        return PdfToken("<<", "");
                    } else {
                        pdf_file.seekg(-1, ios_base::cur);
                        return PdfToken("HX_STRING", read_hx_string());
                    }
                case '>':
                    by = pdf_file.get();
                    if(by == '>') {
                        return PdfToken(">>", "");
                    } else {
                        pdf_file.seekg(-1, ios_base::cur);
                        return PdfToken(">", ""); // Não deve acontecer, porque o read_hx_string vai consumir o '>'
                    }
                case '(':
                    return PdfToken("STRING", read_string());
                case '/':
                    return PdfToken("NAME", read_name());
                case '[':
                    return PdfToken("[", "");
                case ']':
                    return PdfToken("]", "");
                case '{':
                    return PdfToken("CODE", read_code());
                case 'R':
                    by = pdf_file.get();
                    pdf_file.seekg(-1, ios_base::cur);
                    if(is_delimiter(by)) {
                        return PdfToken("R", "");
                    } else {
                        pdf_file.seekg(-1, ios_base::cur);
                        return trata_word();
                    }
                default:
                    pdf_file.seekg(-1, ios_base::cur);
                    return trata_word();
            }
        }
    }
private:
    string read_word() {
        string result;
        int by = pdf_file.get();
        while(!is_delimiter(by)) {
            result += by;
            by = pdf_file.get();
        }
        pdf_file.seekg(-1, ios_base::cur);
        return result;
    }
    string read_number() {
        string result;
        int by = pdf_file.get();
        while(isdigit(by) || number_extra_chars.find_first_of(by) != string::npos) {
            result += by;
            by = pdf_file.get();
        }
        pdf_file.seekg(-1, ios_base::cur);
        return result;
    }
    void skip_comment() {
        int by;
        do {
            by = pdf_file.get();
            if(by == '\r') {
                by = pdf_file.get();
                if(by != '\n') {
                    pdf_file.seekg(-1, ios_base::cur);
                }
                return;
            }
        } while(by != '\n');
    }
    string read_code() {
        // Tratando código como 1 token só porque
        // não estamos interessados no seu conteúdo.
        // Mas é claro que o código é composto de vários tokens!
        int par_level = 0;
        string result;
        int by;
        while(true) {
            by = pdf_file.get();
            if(by == '{') {
                ++par_level;
                result += '{';
            } else if(by == '}') {
                --par_level;
                if(par_level < 0) {
                    return result;
                } else {
                    result += '}';
                }
            } else {
                result += by;
            }
        }
    }
    string read_name() {
        return read_word();
    }
    string read_hx_string() {
        string result;
        int by;
        while(by = pdf_file.get(), by != '>') {
            result += by;
        }
        return result;
    }
    string read_string() {
        int par_level = 0;
        bool escaped = false;
        string result;
        int by;
        while(true) {
            by = pdf_file.get();
            if(escaped) {
                switch(by) {
                    case 'n': result += '\n'; break;
                    case 'r': result += '\r'; break;
                    case 'b': result += '\b'; break;
                    case 'f': result += '\f'; break;
                    case '(': result += '(';  break;
                    case ')': result += ')';  break;
                    default:
                        if(isdigit(by)) {
                            // Não suportado ainda
                            result += '\\' + by;
                        }
                        break;
                }
                escaped = false;
            } else if(by == '\\') {
                escaped = true;
            } else if(by == '(') {
                ++par_level;
                result += '(';
            } else if(by == ')') {
                --par_level;
                if(par_level < 0) {
                    return result;
                }
            } else {
                result += by;
            }
        }
    }
};

// One entry of the cross-reference table: object id, byte position,
// generation number and the entry's type character.
struct PdfXrefItem {
    long long id, byte_pos;
    long gen_nr;
    char type;

    // Zero-initialises every field so that an item declared without an
    // initializer never carries indeterminate values.
    PdfXrefItem()
        : id(0), byte_pos(0), gen_nr(0), type('\0') { }
};

// Cross-reference table: maps an object reference (object number plus
// generation number) to the entry read from the file's "xref" sections.
// When the same reference is added more than once, the first entry is kept.
class PdfXref {
    // Each entry is read as a fixed-size record of this many bytes.
    enum { xref_line_size = 20 };

    std::unordered_map<PdfReference, PdfXrefItem, PdfReferenceHash> objs;
public:
    // Parses one "xref" section: the keyword followed by one or more
    // subsections, each made of a "<first id> <count>" header and <count>
    // fixed-size entries. Stops when the next non-space byte is not a digit.
    // Throws std::runtime_error on an unexpected token, a negative count or a
    // truncated table; the std::sto* conversions may throw as well.
    void parse_xref(PdfTokenReader & token_reader) {
        std::ifstream & file = token_reader.pdf_file;
        auto token = token_reader.next_token();
        expect(token.t, "xref");
        int by;
        do {
            token = token_reader.next_token();
            expect(token.t, "NUMBER");
            const long long start_id = std::stoll(token.value);

            token = token_reader.next_token();
            expect(token.t, "NUMBER");
            const int nr_of_lines = std::stoi(token.value);
            if(nr_of_lines < 0) {
                throw std::runtime_error("Tabela xref com quantidade negativa de entradas");
            }

            skip_spaces(file);
            if(nr_of_lines != 0) {
                // Computed in size_t so that a large count cannot overflow int.
                const std::size_t nr_of_expected_bytes =
                    static_cast<std::size_t>(xref_line_size) * static_cast<std::size_t>(nr_of_lines);
                std::string bytes_xref(nr_of_expected_bytes, '\0');
                file.read(&bytes_xref[0], static_cast<std::streamsize>(nr_of_expected_bytes));
                if(file.gcount() != static_cast<std::streamsize>(nr_of_expected_bytes)) {
                    throw std::runtime_error("Tabela xref truncada");
                }
                add_block(start_id, nr_of_lines, bytes_xref);
            }

            skip_spaces(file);
            // Look at the next byte without consuming it.
            by = file.get();
            file.seekg(-1, std::ios_base::cur);
        } while(by != std::char_traits<char>::eof() && std::isdigit(by));
    }

    // Returns the entry stored for `reference`, or an all-zero entry when
    // there is none. The table is not modified by the lookup.
    PdfXrefItem get_obj(PdfReference reference) {
        const auto found = objs.find(reference);
        if(found == objs.end()) {
            return PdfXrefItem();
        }
        return found->second;
    }
private:
    // Adds `nr_of_lines` consecutive entries, numbered from `start_id`, taken
    // from the fixed-size records packed in `bytes_xref`.
    void add_block(long long start_id, int nr_of_lines, const std::string & bytes_xref) {
        for(int i = 0; i < nr_of_lines; ++i) {
            const std::size_t offset = static_cast<std::size_t>(i) * xref_line_size;
            add_obj(i + start_id, bytes_xref.substr(offset, xref_line_size));
        }
    }

    // Decodes one record: the type character is at index 17, the byte
    // position is the number at the start of the line (stored only for type
    // 'n') and the generation number is the number starting at index 11.
    void add_obj(long long id, const std::string & str_line) {
        PdfXrefItem xref_item = PdfXrefItem();   // value-initialised: all fields zero
        xref_item.type = str_line[17];
        xref_item.id = id;
        if(xref_item.type == 'n') {
            xref_item.byte_pos = std::stoll(str_line);
        }
        xref_item.gen_nr = std::stol(str_line.substr(11));

        // insert() leaves an existing entry untouched, so the first entry
        // added for a reference wins.
        objs.insert(std::make_pair(PdfReference(xref_item.id, xref_item.gen_nr), xref_item));
    }
};

shared_ptr<PdfDictObj> parse_dict(PdfTokenReader &);
shared_ptr<PdfArrayObj> parse_array(PdfTokenReader &);
shared_ptr<PdfStreamObj> read_stream(PdfTokenReader &, const shared_ptr<PdfDictObj> &);

// Parses the next object from the token stream and returns it.
//
//   NUMBER NUMBER obj  -> the object that follows, up to its "endobj"
//   NUMBER NUMBER R    -> PdfReferenceObj
//   NUMBER             -> PdfNumberObj (any look-ahead tokens are pushed back)
//   [                  -> PdfArrayObj
//   <<                 -> PdfDictObj, or PdfStreamObj when "stream" follows
//   obj                -> the object that follows, up to its "endobj"
//   STRING, HX_STRING  -> PdfStrObj
//   anything else      -> PdfTokenObj wrapping the token
std::shared_ptr<PdfObject> parse_object(PdfTokenReader & token_reader) {
    // Parses the body that follows an "obj" keyword and consumes "endobj".
    auto parse_obj_body = [&token_reader]() -> std::shared_ptr<PdfObject> {
        auto body = parse_object(token_reader);
        expect(token_reader.next_token().t, "endobj");
        return body;
    };

    auto first = token_reader.next_token();

    if(first.t == "NUMBER") {
        auto second = token_reader.next_token();
        if(second.t != "NUMBER") {
            token_reader.unread_token(second);
            return std::make_shared<PdfNumberObj>(std::stod(first.value));
        }
        auto third = token_reader.next_token();
        if(third.t == "obj") {
            return parse_obj_body();
        }
        if(third.t == "R") {
            return std::make_shared<PdfReferenceObj>(
                PdfReference(std::stoll(first.value), std::stol(second.value)));
        }
        // Plain number followed by unrelated tokens: push them back in
        // reverse order so they are read again in their original order.
        token_reader.unread_token(third);
        token_reader.unread_token(second);
        return std::make_shared<PdfNumberObj>(std::stod(first.value));
    }

    if(first.t == "[") {
        return parse_array(token_reader);
    }

    if(first.t == "<<") {
        std::shared_ptr<PdfDictObj> dictionary(parse_dict(token_reader));
        auto after_dict = token_reader.next_token();
        if(after_dict.t == "stream") {
            return read_stream(token_reader, dictionary);
        }
        token_reader.unread_token(after_dict);
        return dictionary;
    }

    if(first.t == "obj") {
        return parse_obj_body();
    }

    if(first.t == "STRING" || first.t == "HX_STRING") {
        return std::make_shared<PdfStrObj>(std::move(first.value));
    }

    return std::make_shared<PdfTokenObj>(std::move(first));
}

// Parses the entries of a dictionary whose opening "<<" was already consumed
// and returns it once the closing ">>" is read. Each entry is a key token
// followed by an object; the key token's value is used as the map key, and
// the first value stored for a key is kept.
// Throws std::runtime_error when ">>" appears where a value was expected.
std::shared_ptr<PdfDictObj> parse_dict(PdfTokenReader & token_reader) {
    auto result = std::make_shared<PdfDictObj>();
    while(true) {
        auto key_token = token_reader.next_token();
        if(key_token.t == ">>") {
            return result;
        }
        auto objeto = parse_object(token_reader);
        auto tok = dynamic_cast<PdfTokenObj*>(objeto.get());
        if(tok && tok->token.t == ">>") {
            // Thrown by value: a pointer produced by `throw new` would not be
            // caught by a handler for std::exception.
            throw std::runtime_error(
                ">> inesperado em pos = "
                + std::to_string(static_cast<long long>(token_reader.pdf_file.tellg())));
        }
        result->dict.insert(std::make_pair(std::move(key_token.value), objeto));
    }
}

// Parses the elements of an array whose opening "[" was already consumed and
// returns it once the closing "]" is read.
std::shared_ptr<PdfArrayObj> parse_array(PdfTokenReader & token_reader) {
    std::shared_ptr<PdfArrayObj> elements(new PdfArrayObj);
    for(auto token = token_reader.next_token(); token.t != "]"; token = token_reader.next_token()) {
        // Not the end of the array: hand the token back so parse_object()
        // sees the element from its first token.
        token_reader.unread_token(token);
        elements->array.push_back(parse_object(token_reader));
    }
    return elements;
}

shared_ptr<PdfObject> find_indirect_obj(PdfTokenReader & token_reader, PdfReference reference);

// Handles the data that follows a "stream" keyword. The data itself is not
// read: the file position is advanced by the dictionary's /Length and the
// returned PdfStreamObj holds a zero-filled buffer of that size.
// /Length may be a number or a reference to an indirect number object.
// Throws std::runtime_error when /Length is missing, is not a number or is
// out of range, or when "endstream" does not follow the data.
std::shared_ptr<PdfStreamObj> read_stream(PdfTokenReader & token_reader, const std::shared_ptr<PdfDictObj> & dict) {
    std::ifstream & file = token_reader.pdf_file;

    // Skip the end of line after "stream": LF or CR LF. After a lone CR the
    // following byte is left in place, since it is not part of the line end.
    const int by = file.get();
    if(by == '\r') {
        if(file.peek() == '\n') {
            file.get();
        }
    } else if(by != '\n') {
        file.seekg(-1, std::ios_base::cur);
    }

    // Looked up with find() so that a missing key does not insert a null entry.
    std::shared_ptr<PdfObject> len_obj;
    const auto iter_len = dict->dict.find("Length");
    if(iter_len != dict->dict.end()) {
        len_obj = iter_len->second;
    }

    if(auto len_reference_obj = dynamic_cast<PdfReferenceObj*>(len_obj.get())) {
        // Resolving the reference moves the file position; restore it afterwards.
        const auto saved_pos = file.tellg();
        len_obj = find_indirect_obj(token_reader, len_reference_obj->reference);
        file.seekg(saved_pos);
    }

    const auto len_number_obj = dynamic_cast<PdfNumberObj*>(len_obj.get());
    if(!len_number_obj) {
        throw std::runtime_error(
            "Erro: /Length não é um número. Pos = "
            + std::to_string(static_cast<long long>(file.tellg())));
    }
    // Converting a negative or too-large double to unsigned is undefined.
    const double max_len = static_cast<double>(~0u);
    if(!(len_number_obj->number >= 0.0 && len_number_obj->number <= max_len)) {
        throw std::runtime_error(
            "Erro: /Length inválido. Pos = "
            + std::to_string(static_cast<long long>(file.tellg())));
    }

    const auto len = static_cast<unsigned int>(len_number_obj->number);
    std::vector<char> bytes(len);
    // Pretend the stream was read: just skip over it.
    file.seekg(len, std::ios_base::cur);
    const auto token = token_reader.next_token();
    expect(token.t, "endstream");
    return std::make_shared<PdfStreamObj>(dict, bytes);
}

// Seeks to the byte position recorded in `xref_item` and parses the object
// found there. Only entries of type 'n' can be read.
// Throws std::runtime_error for any other entry type.
std::shared_ptr<PdfObject> read_indirect_obj(PdfTokenReader & token_reader, PdfXrefItem xref_item) {
    if(xref_item.type != 'n') {
        // Thrown by value: a pointer produced by `throw new` would not be
        // caught by a handler for std::exception.
        throw std::runtime_error("Procurando objeto inválido " + std::to_string(xref_item.id));
    }
    token_reader.pdf_file.seekg(xref_item.byte_pos, std::ios_base::beg);
    return parse_object(token_reader);
}

// Resolves `reference` through the reader's cross-reference table and parses
// the object it points to.
// Throws std::runtime_error when the reader has no cross-reference table yet.
std::shared_ptr<PdfObject> find_indirect_obj(PdfTokenReader & token_reader, PdfReference reference) {
    if(token_reader.xref == nullptr) {
        throw std::runtime_error("Referência indireta encontrada sem tabela xref carregada");
    }
    return read_indirect_obj(token_reader, token_reader.xref->get_obj(reference));
}

// Parses a trailer: the "trailer" keyword, then "<<", then the dictionary
// entries. expect() throws when either leading token is not the one required.
std::shared_ptr<PdfDictObj> parse_trailer(PdfTokenReader & token_reader) {
    expect(token_reader.next_token().t, "trailer");
    expect(token_reader.next_token().t, "<<");
    return parse_dict(token_reader);
}

int get_number_of_pages(const string & file_name) {
    const auto buffer_size = 5;
    ifstream pdf_file(file_name, ios_base::in | ios_base::binary);
    char comeco_pdf[buffer_size + 1] = { 0 };
    pdf_file.read(comeco_pdf, buffer_size);
    if(strcmp(comeco_pdf, "%PDF-") != 0) {
        throw runtime_error("Não é um arquivo PDF");
    }
    pdf_file.seekg(0, ios_base::end);
    auto length = pdf_file.tellg();
    const auto default_end_buffer_size = 400u;
    const int end_buffer_size = min((unsigned int) length, default_end_buffer_size);
    pdf_file.seekg(-end_buffer_size, ios_base::end);
    string startxref(end_buffer_size, '');
    pdf_file.read(&startxref[0], end_buffer_size);

    smatch regex_match;
    if(!regex_search(startxref, regex_match, regex("startxref\\s+(\\d+)\\s+%%EOF"))) {
        throw runtime_error("Não achou o número de páginas (startxref)");
    }
    PdfTokenReader token_reader(pdf_file);

    auto linearized = [&pdf_file, &token_reader]() -> int {
        pdf_file.seekg(0, ios_base::beg);
        auto first_obj = parse_object(token_reader);
        auto& first_dict = dynamic_cast<PdfDictObj&>(*first_obj);
        if(first_dict.dict.find("Linearized") != first_dict.dict.end()) {
            auto iter_n = first_dict.dict.find("N");
            if(iter_n != first_dict.dict.end()) {
                auto& n = dynamic_cast<PdfNumberObj&>(*iter_n->second);
                return static_cast<int>(n.number);
            } else {
                throw runtime_error("Não achou o número de páginas (/N)");
            }
        } else {
            throw runtime_error("Não achou o número de páginas (/Linearized)");
        }
    };
    auto pos = stoll(regex_match[1]);
    if(pos == 0LL) {
        return linearized();
    }
    pdf_file.seekg(pos, ios_base::beg);
    auto token = token_reader.next_token();
    if(token.t == "xref") {
        token_reader.unread_token(token);
        vector<shared_ptr<PdfDictObj>> trailers;
        PdfXref xref;
        token_reader.xref = &xref;
        while(true) {
            xref.parse_xref(token_reader);
            shared_ptr<PdfDictObj> trailer(parse_trailer(token_reader));
            trailers.push_back(trailer);
            auto iter_prev = trailer->dict.find("Prev");
            if(iter_prev != trailer->dict.end()) {
                PdfNumberObj & nPrev = dynamic_cast<PdfNumberObj&>(*iter_prev->second);
                pdf_file.seekg(static_cast<long long>(nPrev.number), ios_base::beg);
            } else {
                break;
            }
        }
        shared_ptr<PdfObject> catalog = nullptr;
        for(auto tr = trailers.begin(), tr_end = trailers.end(); tr != tr_end; ++tr) {
            auto& dict = tr->get()->dict;
            auto iter_root = dict.find("Root");
            if(iter_root != dict.end()) {
                auto& ref_root = dynamic_cast<PdfReferenceObj&>(*iter_root->second);
                catalog = find_indirect_obj(token_reader, ref_root.reference);
                break;
            }
        }
        if(catalog == nullptr) throw new runtime_error("Não achou o número de páginas (catalog)");

        shared_ptr<PdfObject> pages(
            find_indirect_obj(
                token_reader,
                dynamic_cast<PdfReferenceObj&>(
                    *dynamic_cast<PdfDictObj&>(*catalog).dict["Pages"]
                ).reference
            )
        );

        PdfNumberObj& n = dynamic_cast<PdfNumberObj&>(*dynamic_cast<PdfDictObj&>(*pages).dict["Count"]);

        return static_cast<int>(n.number);
    } else {
        return linearized();
    }
}

// Prints the page count of every PDF file named on the command line.
// A file that cannot be processed is reported and the remaining files are
// still handled. Returns 1 when no file name is given, 0 otherwise.
int main(int argc, char** argv) {
    if(argc < 2) {
        std::cout << "Uso: PdfPageCount <nome-arquivo.pdf>\n";
        return 1;
    }

    std::ios_base::sync_with_stdio(false);
    int sucessos = 0, falhas = 0;
    for(int i = 1; i < argc; ++i) {
        try {
            // Computed before anything is printed so that a failure does not
            // leave a half-written result line in front of the error message.
            const int pages = get_number_of_pages(argv[i]);
            std::cout << "Número de páginas de " << argv[i] << ": " << pages << '\n';
            ++sucessos;
        } catch(const std::exception & e) {
            // Caught by reference: catching by value slices the exception to
            // std::exception, which can lose the derived class's message.
            std::cout << e.what() << '\n';
            std::cout << "Ocorreu um erro ao processar " << argv[i] << ". Continuando...\n";
            ++falhas;
        }
    }
    if(argc > 2) {
        std::cout << "Fim do processo. " << sucessos << " sucesso(s), " << falhas << " falha(s).\n";
    }
    return 0;
}

Versão usada nos testes de 2010-09-19:
BitBucket – PdfPageCount.cpp v2010-09-19

/*
Este programa mostra na tela o número de páginas de um
ou mais arquivos PDF passados na linha de comando.
Este código fonte é de domínio público (PUBLIC DOMAIN)
Desenvolvido em C++0x (que já é C++1x) com o Visual C++ 2010 Express.
Pode conter erros no uso dos novos recursos da linguagem
(ou outros erros quaisquer, hehehehe).
Se conseguirem compilar em outro compilador, me avisem.
Falta suporte a coisas como stoll, to_string, nullptr
no mingw-w64.
Marcus.
*/

// Em modo debug, dá um loop infinito ao destruir o unordered_map PdfXref::objs,
// com certos arquivos pdf.
// Se isso indica um bug na biblioteca ou no meu código, isso eu não sei.
// De qualquer jeito, estou usando mais o modo Release porque a diferença
// de desempenho é enorme.
#define _HAS_ITERATOR_DEBUGGING 0

#include <cctype>
#include <cstddef>
#include <cstring>
#include <algorithm>
#include <fstream>
#include <iostream>
#include <memory>
#include <regex>
#include <stack>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

using namespace std;

// Characters that end a token in addition to white space.
const std::string delimiters("()<>[]{}/%");

// Returns true when `ch` terminates the token being read: it is either one of
// the characters in `delimiters` or a white-space character.
bool is_delimiter(char ch) {
    if(delimiters.find(ch) != std::string::npos) {
        return true;
    }
    // The <cctype> classifiers are only defined for values representable as
    // unsigned char (or EOF). A plain char holding a byte above 0x7F may be
    // negative, so it is converted before the call.
    return std::isspace(static_cast<unsigned char>(ch)) != 0;
}

// Polymorphic base of every parsed value. The virtual destructor lets derived
// objects be destroyed, and identified with dynamic_cast, through a
// PdfObject pointer.
struct PdfObject {
    virtual ~PdfObject() = default;
};

// Numeric value, stored as a double.
struct PdfNumberObj : PdfObject {
    double number;

    PdfNumberObj(double value) : number{value} { }
};

// String value: owns the text handed to its constructor.
struct PdfStrObj : PdfObject {
    std::string str;

    // Sink constructor. An lvalue argument is copied into `value`, an rvalue
    // is moved into it, and `value` is then moved into the member. This one
    // constructor replaces the former const-string& / string&& overload pair.
    PdfStrObj(std::string value)
        : str(std::move(value)) { }

    // No assignment operator is declared on purpose. The previous
    // user-declared move assignment made the implicit copy constructor and
    // copy assignment deleted and suppressed the implicit move constructor.
    // With nothing declared, the compiler generates all of them.
};

// Dictionary value: maps a key string to a shared pointer to the stored object.
struct PdfDictObj : PdfObject {
    std::unordered_map<std::string, std::shared_ptr<PdfObject>> dict;

    // Starts with an empty dictionary.
    PdfDictObj() : dict() { }
};

// Array value: an ordered list of shared pointers to the element objects.
struct PdfArrayObj : PdfObject {
    std::vector<std::shared_ptr<PdfObject>> array;

    // Starts with an empty array.
    PdfArrayObj() : array() { }
};

// Identifies an indirect object by object number and generation number.
struct PdfReference {
    long long objNum, genNum;

    PdfReference(long long n, long long g)
        : objNum(n), genNum(g) { }

    // Two references are equal when both numbers match.
    bool operator==(PdfReference r) const {
        if(objNum != r.objNum) {
            return false;
        }
        return genNum == r.genNum;
    }
};

// Hash functor for PdfReference keys: XORs the hash of the object number with
// 31 times the hash of the generation number.
struct PdfReferenceHash {
    std::size_t operator()(PdfReference r) const {
        const std::hash<long long> hasher;
        const std::size_t obj_hash = hasher(r.objNum);
        const std::size_t gen_hash = hasher(r.genNum);
        return obj_hash ^ (31 * gen_hash);
    }
};

struct PdfReferenceObj : PdfObject {
    PdfReference reference;
    PdfReferenceObj(PdfReference ref)
        : reference(ref) { }
};

// Lexical token: `t` is the token type and `value` its payload (empty for
// tokens that carry none).
//
// No copy or move operation is declared on purpose (Rule of Zero). The
// previous user-declared move constructor and move assignment made the
// implicit copy constructor and copy assignment deleted, although tokens are
// copied elsewhere in this file (pushed onto a std::stack, copied from its
// top, copied into PdfTokenObj). The compiler-generated operations copy and
// move both strings member-wise.
struct PdfToken {
    std::string t, value;

    PdfToken(const std::string & type, const std::string & value)
        : t(type), value(value) { }
};

struct PdfTokenObj : PdfObject {
    PdfToken token;
    PdfTokenObj(const PdfToken & tok)
        : token(tok) { }
    PdfTokenObj(PdfToken && tok)
        : token(move(tok)) { }
};

// Stream value: shares ownership of its dictionary and holds a copy of the
// byte vector passed to the constructor.
struct PdfStreamObj : PdfObject {
    std::shared_ptr<PdfDictObj> dict;
    std::vector<char> stream;

    PdfStreamObj(const std::shared_ptr<PdfDictObj> & dict, const std::vector<char> & stream)
        : dict{dict}, stream(stream) { }
};

// Consumes white-space bytes from `pdf_file`. On return the next byte to be
// read is the first one that is not white space, unless the end of the file
// was reached first.
void skip_spaces(std::ifstream & pdf_file) {
    while(true) {
        const int by = pdf_file.get();
        if(by == std::char_traits<char>::eof()) {
            // Nothing left to skip. Without this exit the loop never ended
            // once the end of the file was reached.
            return;
        }
        if(!std::isspace(by)) {
            // Give back the byte that ended the run of spaces.
            pdf_file.seekg(-1, std::ios_base::cur);
            return;
        }
    }
}

// Checks that the token type found (`tok1`) is the one required (`tok2`).
// Throws std::runtime_error naming both when they differ.
void expect(const std::string & tok1, const std::string & tok2) {
    if(tok1 == tok2) {
        return;
    }
    throw std::runtime_error("Encontrou " + tok1 + " quando esperava " + tok2);
}

// Non-digit characters accepted as part of a numeric token: decimal point and signs.
const string number_extra_chars(".+-");

class PdfXref;

class PdfTokenReader {
private:
    stack<PdfToken> token_buffer;
public:
    PdfXref *xref;
    ifstream &pdf_file;
    PdfTokenReader(ifstream & pdf_file):
        xref(nullptr),
        pdf_file(pdf_file) { }
    void unread_token(const PdfToken & token) {
        token_buffer.push(token);
    }
    PdfToken next_token() {
        if(!token_buffer.empty()) {
            PdfToken result = token_buffer.top();
            token_buffer.pop();
            return result;
        }
        int by;
        while(true) {
            by = pdf_file.get();
            if(by == '%') {
                skip_comment();
            } else if(isspace(by)) {
                skip_spaces(pdf_file);
            } else {
                pdf_file.seekg(-1, ios_base::cur);
                break;
            }
        }

        auto trata_word = [this]() -> PdfToken {
            auto word = read_word();
            if(word == "obj" || word == "endobj" || word == "stream" || word == "endstream"
                    || word == "true" || word == "false" || word == "null"
                    || word == "xref" || word == "trailer") {
                return PdfToken(word, "");
            } else {
                throw runtime_error(
                    "Erro no PDF no byte " + to_string(pdf_file.tellg())
                    + " lendo " + word);
            }
        };
        by = pdf_file.get();
        if(isdigit(by) || number_extra_chars.find_first_of(by) != string::npos) {
            pdf_file.seekg(-1, ios_base::cur);
            return PdfToken("NUMBER", read_number());
        } else {
            switch(by) {
                case '<':
                    by = pdf_file.get();
                    if(by == '<') {
                        return PdfToken("<<", "");
                    } else {
                        pdf_file.seekg(-1, ios_base::cur);
                        return PdfToken("HX_STRING", read_hx_string());
                    }
                case '>':
                    by = pdf_file.get();
                    if(by == '>') {
                        return PdfToken(">>", "");
                    } else {
                        pdf_file.seekg(-1, ios_base::cur);
                        return PdfToken(">", ""); // Não deve acontecer, porque o read_hx_string vai consumir o '>'
                    }
                case '(':
                    return PdfToken("STRING", read_string());
                case '/':
                    return PdfToken("NAME", read_name());
                case '[':
                    return PdfToken("[", "");
                case ']':
                    return PdfToken("]", "");
                case '{':
                    return PdfToken("CODE", read_code());
                case 'R':
                    by = pdf_file.get();
                    pdf_file.seekg(-1, ios_base::cur);
                    if(is_delimiter(by)) {
                        return PdfToken("R", "");
                    } else {
                        pdf_file.seekg(-1, ios_base::cur);
                        return trata_word();
                    }
                default:
                    pdf_file.seekg(-1, ios_base::cur);
                    return trata_word();
            }
        }
    }
private:
    string read_word() {
        string result;
        int by = pdf_file.get();
        while(!is_delimiter(by)) {
            result += by;
            by = pdf_file.get();
        }
        pdf_file.seekg(-1, ios_base::cur);
        return result;
    }
    string read_number() {
        string result;
        int by = pdf_file.get();
        while(isdigit(by) || number_extra_chars.find_first_of(by) != string::npos) {
            result += by;
            by = pdf_file.get();
        }
        pdf_file.seekg(-1, ios_base::cur);
        return result;
    }
    void skip_comment() {
        int by;
        while(true) {
            by = pdf_file.get();
            if(by == '\r') {
                by = pdf_file.get();
                if(by == '\n') {
                    return;
                } else {
                    pdf_file.seekg(-1, ios_base::cur);
                }
            } else if(by == '\n') {
                return;
            }
        }
    }
    string read_code() {
        // Tratando código como 1 token só porque
        // não estamos interessados no seu conteúdo.
        // Mas é claro que o código é composto de vários tokens!
        int par_level = 0;
        string result;
        int by;
        while(true) {
            by = pdf_file.get();
            if(by == '{') {
                ++par_level;
                result += '{';
            } else if(by == '}') {
                --par_level;
                if(par_level < 0) {
                    return result;
                } else {
                    result += '}';
                }
            } else {
                result += by;
            }
        }
    }
    string read_name() {
        return read_word();
    }
    string read_hx_string() {
        string result;
        int by;
        while(by = pdf_file.get(), by != '>') {
            result += by;
        }
        return result;
    }
    string read_string() {
        int par_level = 0;
        bool escaped = false;
        string result;
        int by;
        while(true) {
            by = pdf_file.get();
            if(escaped) {
                switch(by) {
                    case 'n': result += '\n'; break;
                    case 'r': result += '\r'; break;
                    case 'b': result += '\b'; break;
                    case 'f': result += '\f'; break;
                    case '(': result += '(';  break;
                    case ')': result += ')';  break;
                    default:
                        if(isdigit(by)) {
                            // Não suportado ainda
                            result += '\\' + by;
                        }
                        break;
                }
                escaped = false;
            } else if(by == '\\') {
                escaped = true;
            } else if(by == '(') {
                ++par_level;
                result += '(';
            } else if(by == ')') {
                --par_level;
                if(par_level < 0) {
                    return result;
                }
            } else {
                result += by;
            }
        }
    }
};

// One entry of a cross-reference table, as filled in by PdfXref::add_obj.
// Every member has a default so that an entry which is only partially filled
// (add_obj sets byte_pos only for 'n' entries) never carries indeterminate
// values when it is copied into or out of the table.
struct PdfXrefItem {
    long long id = 0;        // object number
    long long byte_pos = 0;  // byte offset of the object; only set when type == 'n'
    long long gen_nr = 0;    // generation number
    char type = '\0';        // entry type character read from the table ('n' = readable object)
};

// Parses the integer that starts at str[pos].
// The number extends over every following character that is either a decimal
// digit or one of number_extra_chars, and stops at the first other character
// or at the end of the string.
//
// Throws (from substr/stoll) out_of_range when pos lies beyond the string or
// the value does not fit in a long long, and invalid_argument when no number
// starts at pos.
long long read_number(const string & str, int pos) {
    const size_t start = static_cast<size_t>(pos);
    size_t end_pos = start;
    // Bounded scan: never index past the end, even for an out-of-range pos.
    while(end_pos < str.size()) {
        const char ch = str[end_pos];
        // isdigit() requires a value representable as unsigned char (or EOF);
        // passing a negative plain char is undefined behaviour.
        const bool part_of_number =
            number_extra_chars.find_first_of(ch) != string::npos ||
            isdigit(static_cast<unsigned char>(ch)) != 0;
        if(!part_of_number) {
            break;
        }
        ++end_pos;
    }
    return stoll(str.substr(start, end_pos - start));
}

// Cross-reference table: maps an object reference (object number, generation
// number) to the PdfXrefItem describing where that object is stored.
// parse_xref() may be called several times (once per xref section); an entry
// that is already present is never overwritten, so the section parsed first
// wins. get_number_of_pages() starts at the section pointed to by startxref
// and then follows /Prev, which makes the most recent definition win.
class PdfXref {
    // Layout of one xref entry: "oooooooooo ggggg t" followed by a 2-byte
    // end-of-line, i.e. offset at 0, generation at 11, type at 17, 20 bytes total.
    enum { entry_size = 20, byte_pos_pos = 0, gen_nr_pos = 11, type_pos = 17 };

    unordered_map<PdfReference, PdfXrefItem, PdfReferenceHash> objs;
public:
    // Parses one "xref" section at the current position of token_reader:
    // the keyword followed by one or more subsections, each made of a header
    // "<first object number> <entry count>" and that many 20-byte entries.
    // On return the stream is positioned at the first non-space character
    // after the last subsection.
    //
    // Throws runtime_error for a negative entry count or a truncated table;
    // expect(), stoll() and stoi() may throw for malformed input as well.
    void parse_xref(PdfTokenReader & token_reader) {
        auto & pdf_file = token_reader.pdf_file;

        auto token = token_reader.next_token();
        expect(token.t, "xref");

        int by;
        do {
            token = token_reader.next_token();
            expect(token.t, "NUMBER");
            const auto start_id = stoll(token.value);

            token = token_reader.next_token();
            expect(token.t, "NUMBER");
            const int nr_of_lines = stoi(token.value);
            if(nr_of_lines < 0) {
                throw runtime_error("Tabela xref inválida: número de entradas negativo");
            }

            skip_spaces(pdf_file);
            if(nr_of_lines != 0) {
                const size_t nr_of_expected_bytes =
                    static_cast<size_t>(entry_size) * static_cast<size_t>(nr_of_lines);
                string bytes_xref(nr_of_expected_bytes, '\0');
                pdf_file.read(&bytes_xref[0], static_cast<streamsize>(nr_of_expected_bytes));
                if(static_cast<size_t>(pdf_file.gcount()) != nr_of_expected_bytes) {
                    throw runtime_error("Tabela xref truncada");
                }
                add_block(start_id, nr_of_lines, bytes_xref);
            }

            skip_spaces(pdf_file);
            // Another subsection follows when the next character is a digit;
            // peek() looks at it without consuming it.
            by = pdf_file.peek();
        } while(isdigit(by));
    }

    // Returns the entry registered for reference. When there is none, a
    // zero-filled entry is returned (its type is '\0', never 'n'), and the
    // table itself is left unchanged.
    PdfXrefItem get_obj(PdfReference reference) {
        const auto found = objs.find(reference);
        if(found == objs.end()) {
            return PdfXrefItem();
        }
        return found->second;
    }
private:
    // Registers nr_of_lines consecutive entries taken from bytes_xref; the
    // first one belongs to object number start_id.
    void add_block(long long start_id, int nr_of_lines, const string & bytes_xref) {
        for(int i = 0; i < nr_of_lines; ++i) {
            add_obj(i + start_id, bytes_xref.substr(static_cast<size_t>(entry_size) * i, entry_size));
        }
    }

    // Decodes one 20-byte entry and stores it under (id, generation number),
    // unless that key is already present.
    void add_obj(long long id, const string & str_line) {
        PdfXrefItem xref_item = PdfXrefItem(); // value-initialised: no indeterminate members
        xref_item.type = str_line[type_pos];
        xref_item.id = id;
        if(xref_item.type == 'n') {
            xref_item.byte_pos = read_number(str_line, byte_pos_pos);
        }
        xref_item.gen_nr = read_number(str_line, gen_nr_pos);
        // insert() does nothing when the key already exists, which is exactly
        // the "first definition wins" rule described on the class.
        objs.insert(make_pair(PdfReference(xref_item.id, xref_item.gen_nr), xref_item));
    }
};

shared_ptr<PdfDictObj> parse_dict(PdfTokenReader &);
shared_ptr<PdfArrayObj> parse_array(PdfTokenReader &);
shared_ptr<PdfStreamObj> read_stream(PdfTokenReader &, const shared_ptr<PdfDictObj> &);

// Parses the next object from token_reader and returns it.
//   NUMBER NUMBER obj ... endobj -> the object found between obj and endobj
//   NUMBER NUMBER R              -> PdfReferenceObj
//   NUMBER                       -> PdfNumberObj (tokens read ahead are pushed back)
//   [ ...                        -> result of parse_array
//   << ...                       -> result of parse_dict, or of read_stream when
//                                   the dictionary is followed by "stream"
//   obj ... endobj               -> the object found between obj and endobj
//   STRING / HX_STRING           -> PdfStrObj
//   any other token              -> PdfTokenObj wrapping that token
shared_ptr<PdfObject> parse_object(PdfTokenReader & token_reader) {
    // Parses the object that follows an "obj" keyword and checks that it is
    // closed by "endobj".
    auto parse_obj_body = [&token_reader]() -> shared_ptr<PdfObject> {
        auto body = parse_object(token_reader);
        auto closing = token_reader.next_token();
        expect(closing.t, "endobj");
        return body;
    };

    auto first = token_reader.next_token();

    if("[" == first.t) {
        return parse_array(token_reader);
    }
    if("obj" == first.t) {
        return parse_obj_body();
    }
    if("STRING" == first.t || "HX_STRING" == first.t) {
        return shared_ptr<PdfObject>(new PdfStrObj(move(first.value)));
    }
    if("<<" == first.t) {
        shared_ptr<PdfDictObj> dictionary(parse_dict(token_reader));
        auto after_dict = token_reader.next_token();
        if("stream" == after_dict.t) {
            return read_stream(token_reader, dictionary);
        }
        token_reader.unread_token(after_dict);
        return dictionary;
    }
    if("NUMBER" != first.t) {
        return shared_ptr<PdfObject>(new PdfTokenObj(move(first)));
    }

    // From here on the first token is a number: it may stand alone or open
    // "N G obj" / "N G R".
    auto second = token_reader.next_token();
    if("NUMBER" != second.t) {
        token_reader.unread_token(second);
        return shared_ptr<PdfObject>(new PdfNumberObj(stod(first.value)));
    }

    auto third = token_reader.next_token();
    if("obj" == third.t) {
        return parse_obj_body();
    }
    if("R" == third.t) {
        return shared_ptr<PdfObject>(new PdfReferenceObj(PdfReference(stoll(first.value), stoll(second.value))));
    }

    // Just a number followed by unrelated tokens: push them back, last one first.
    token_reader.unread_token(third);
    token_reader.unread_token(second);
    return shared_ptr<PdfObject>(new PdfNumberObj(stod(first.value)));
}

// Parses the entries of a dictionary up to and including the closing ">>"
// (the opening "<<" is consumed by the callers before they call this).
// Each entry is a key token followed by one object; the key token's value is
// used as the map key. A key that appears twice keeps its first value,
// because insert() does not overwrite.
//
// Throws runtime_error when ">>" shows up where a value was expected.
shared_ptr<PdfDictObj> parse_dict(PdfTokenReader & token_reader) {
    shared_ptr<PdfDictObj> result(new PdfDictObj);
    while(true) {
        auto key_token = token_reader.next_token();
        if(key_token.t == ">>") {
            return result;
        }

        auto objeto = parse_object(token_reader);
        auto tok = dynamic_cast<PdfTokenObj*>(objeto.get());
        if(tok && tok->token.t == ">>") {
            // Thrown by value: "throw new ..." would throw a pointer, which
            // handlers written for std::exception do not catch.
            throw runtime_error(">> inesperado em pos = " +
                to_string(static_cast<long long>(token_reader.pdf_file.tellg())));
        }
        result->dict.insert(make_pair(move(key_token.value), objeto));
    }
}

// Parses the elements of an array up to and including the closing "]"
// (the opening "[" is consumed by parse_object before it calls this).
// Every token that is not "]" is pushed back so that parse_object can read
// the complete element starting at it.
shared_ptr<PdfArrayObj> parse_array(PdfTokenReader & token_reader) {
    shared_ptr<PdfArrayObj> elements(new PdfArrayObj);
    for(auto lookahead = token_reader.next_token();
        !(lookahead.t == "]");
        lookahead = token_reader.next_token()) {
        token_reader.unread_token(lookahead);
        elements->array.push_back(parse_object(token_reader));
    }
    return elements;
}

shared_ptr<PdfObject> find_indirect_obj(PdfTokenReader & token_reader, PdfReference reference);

// Handles the stream that follows the dictionary dict. The "stream" keyword
// has already been consumed by parse_object.
//
// The stream data itself is not read: the file position is moved forward by
// /Length bytes and the returned PdfStreamObj carries a zero-filled buffer of
// that size. /Length may be a direct number or a reference to an indirect
// object; in the latter case the object is looked up and the file position
// restored afterwards. After the data, "endstream" is expected.
//
// Throws runtime_error when /Length is missing, is not a number or is negative.
shared_ptr<PdfStreamObj> read_stream(PdfTokenReader & token_reader, const shared_ptr<PdfDictObj> & dict) {
    auto & pdf_file = token_reader.pdf_file;

    // Skip the end-of-line after the keyword: CR LF, a lone CR or a lone LF.
    // peek() is used so that a byte that is not part of the end-of-line is
    // never consumed.
    const int by = pdf_file.peek();
    if(by == '\r') {
        pdf_file.get();
        if(pdf_file.peek() == '\n') {
            pdf_file.get();
        }
    } else if(by == '\n') {
        pdf_file.get();
    }

    // find() instead of operator[]: a lookup must not add a "Length" entry to
    // the dictionary when there is none.
    shared_ptr<PdfObject> len_obj;
    const auto iter_len = dict->dict.find("Length");
    if(iter_len != dict->dict.end()) {
        len_obj = iter_len->second;
    }

    if(auto len_reference_obj = dynamic_cast<PdfReferenceObj*>(len_obj.get())) {
        // Resolving the reference moves the file position; put it back afterwards.
        const auto saved_pos = pdf_file.tellg();
        len_obj = find_indirect_obj(token_reader, len_reference_obj->reference);
        pdf_file.seekg(saved_pos, ios_base::beg);
    }

    auto len_number_obj = dynamic_cast<PdfNumberObj*>(len_obj.get());
    if(!len_number_obj) {
        throw runtime_error("Erro: /Length não é um número. Pos = " +
            to_string(static_cast<long long>(pdf_file.tellg())));
    }
    // Written as !(x >= 0) so that NaN is rejected too; converting a negative
    // or NaN double to an unsigned type is undefined behaviour.
    if(!(len_number_obj->number >= 0)) {
        throw runtime_error("Erro: /Length negativo. Pos = " +
            to_string(static_cast<long long>(pdf_file.tellg())));
    }

    const auto len = static_cast<unsigned int>(len_number_obj->number);
    vector<char> bytes(len);
    // Pretend to read the stream: only skip over its bytes.
    pdf_file.seekg(len, ios_base::cur);
    auto token = token_reader.next_token();
    expect(token.t, "endstream");
    return shared_ptr<PdfStreamObj>(new PdfStreamObj(dict, bytes));
}

// Reads the object described by xref_item: seeks to its byte offset and
// parses the object found there.
//
// Throws runtime_error when the entry type is not 'n', i.e. when the entry
// does not point at an object stored in the file.
shared_ptr<PdfObject> read_indirect_obj(PdfTokenReader & token_reader, PdfXrefItem xref_item) {
    if(xref_item.type != 'n') {
        // Thrown by value: "throw new ..." would throw a pointer, which
        // handlers written for std::exception do not catch.
        throw runtime_error("Procurando objeto inválido " + to_string(xref_item.id));
    }
    token_reader.pdf_file.seekg(xref_item.byte_pos, ios_base::beg);
    return parse_object(token_reader);
}

// Resolves reference through the cross-reference table attached to
// token_reader and returns the object read from the file for it.
shared_ptr<PdfObject> find_indirect_obj(PdfTokenReader & token_reader, PdfReference reference) {
    const PdfXrefItem entry = token_reader.xref->get_obj(reference);
    return read_indirect_obj(token_reader, entry);
}

// Parses a file trailer at the current position: the keyword "trailer"
// followed by a dictionary, which is returned.
shared_ptr<PdfDictObj> parse_trailer(PdfTokenReader & token_reader) {
    auto keyword = token_reader.next_token();
    expect(keyword.t, "trailer");

    auto dict_open = token_reader.next_token();
    expect(dict_open.t, "<<");

    return parse_dict(token_reader);
}

int get_number_of_pages(const string & file_name) {
    const auto buffer_size = 5;
    ifstream pdf_file(file_name, ios_base::in | ios_base::binary);
    char comeco_pdf[buffer_size + 1] = { 0 };
    pdf_file.read(comeco_pdf, buffer_size);
    if(strcmp(comeco_pdf, "%PDF-") != 0) {
        throw runtime_error("Não é um arquivo PDF");
    }
    pdf_file.seekg(0, ios_base::end);
    auto length = pdf_file.tellg();
    const auto default_end_buffer_size = 400u;
    const int end_buffer_size = min((unsigned int) length, default_end_buffer_size);
    pdf_file.seekg(-end_buffer_size, SEEK_END);
    string startxref(end_buffer_size, '');
    pdf_file.read(&startxref[0], end_buffer_size);

    smatch regex_match;
    if(!regex_search(startxref, regex_match, regex("startxref\\s+(\\d+)\\s+%%EOF"))) {
        throw runtime_error("Não achou o número de páginas (startxref)");
    }
    PdfTokenReader token_reader(pdf_file);

    auto linearized = [&pdf_file, &token_reader]() -> int {
        pdf_file.seekg(0, ios_base::beg);
        auto first_obj = parse_object(token_reader);
        auto& first_dict = dynamic_cast<PdfDictObj&>(*first_obj);
        if(first_dict.dict.find("Linearized") != first_dict.dict.end()) {
            auto iter_n = first_dict.dict.find("N");
            if(iter_n != first_dict.dict.end()) {
                auto& n = dynamic_cast<PdfNumberObj&>(*iter_n->second);
                return static_cast<int>(n.number);
            } else {
                throw runtime_error("Não achou o número de páginas (/N)");
            }
        } else {
            throw runtime_error("Não achou o número de páginas (/Linearized)");
        }
    };
    auto pos = stoll(regex_match[1]);
    if(pos == 0LL) {
        return linearized();
    }
    pdf_file.seekg(pos, ios_base::beg);
    auto token = token_reader.next_token();
    if(token.t == "xref") {
        token_reader.unread_token(token);
        vector<shared_ptr<PdfDictObj>> trailers;
        PdfXref xref;
        token_reader.xref = &xref;
        while(true) {
            xref.parse_xref(token_reader);
            shared_ptr<PdfDictObj> trailer(parse_trailer(token_reader));
            trailers.push_back(trailer);
            auto iter_prev = trailer->dict.find("Prev");
            if(iter_prev != trailer->dict.end()) {
                PdfNumberObj & nPrev = dynamic_cast<PdfNumberObj&>(*iter_prev->second);
                pdf_file.seekg(static_cast<long long>(nPrev.number), ios_base::beg);
            } else {
                break;
            }
        }
        shared_ptr<PdfObject> catalog = nullptr;
        for(auto tr = trailers.begin(), tr_end = trailers.end(); tr != tr_end; ++tr) {
            auto& dict = tr->get()->dict;
            auto iter_root = dict.find("Root");
            if(iter_root != dict.end()) {
                auto& ref_root = dynamic_cast<PdfReferenceObj&>(*iter_root->second);
                catalog = find_indirect_obj(token_reader, ref_root.reference);
                break;
            }
        }
        if(catalog == nullptr) throw new runtime_error("Não achou o número de páginas (catalog)");

        shared_ptr<PdfObject> pages(
            find_indirect_obj(
                token_reader,
                dynamic_cast<PdfReferenceObj&>(
                    *dynamic_cast<PdfDictObj&>(*catalog).dict["Pages"]
                ).reference
            )
        );

        PdfNumberObj& n = dynamic_cast<PdfNumberObj&>(*dynamic_cast<PdfDictObj&>(*pages).dict["Count"]);

        return static_cast<int>(n.number);
    } else {
        return linearized();
    }
}

// Entry point: prints the number of pages of every PDF file named on the
// command line. A file that cannot be processed is reported and counted as a
// failure, and processing continues with the next one. When more than one
// file was given, a summary line is printed at the end.
// Returns 1 when no file name was given, 0 otherwise.
int main(int argc, char** argv) {
    if(argc < 2) {
        cout << "Uso: PdfPageCount <nome-arquivo.pdf>\n";
        return 1;
    }

    ios_base::sync_with_stdio(false);
    int sucessos = 0, falhas = 0;
    for(int i = 1; i < argc; ++i) {
        try {
            // Compute first, print afterwards: a failure must not leave a
            // half-written "Número de páginas de ..." line on the output.
            const int paginas = get_number_of_pages(argv[i]);
            cout << "Número de páginas de " << argv[i] << ": " << paginas << '\n';
            ++sucessos;
        } catch(const exception & e) {
            // Caught by reference: catching by value slices the exception and
            // may lose the message of the derived class.
            cout << e.what() << '\n';
            cout << "Ocorreu um erro ao processar " << argv[i] << ". Continuando...\n";
            ++falhas;
        } catch(const exception * e) {
            // Some code in this program throws with "throw new ..."; handle
            // that form too and release the exception object.
            cout << e->what() << '\n';
            cout << "Ocorreu um erro ao processar " << argv[i] << ". Continuando...\n";
            delete e;
            ++falhas;
        } catch(...) {
            // Anything else must not abort the remaining files either.
            cout << "Ocorreu um erro ao processar " << argv[i] << ". Continuando...\n";
            ++falhas;
        }
    }
    if(argc > 2) {
        cout << "Fim do processo. " << sucessos << " sucesso(s), " << falhas << " falha(s).\n";
    }
    return 0;
}
Anúncios

2 pensamentos sobre “C++: PdfPageCount.cpp”

  1. Comparação de desempenho lendo PDFs « Visions of hope

  2. Comparação de desempenho (2) « Visions of hope

Deixe um comentário

Preencha os seus dados abaixo ou clique em um ícone para log in:

Logotipo do WordPress.com

Você está comentando utilizando sua conta WordPress.com. Sair / Alterar )

Imagem do Twitter

Você está comentando utilizando sua conta Twitter. Sair / Alterar )

Foto do Facebook

Você está comentando utilizando sua conta Facebook. Sair / Alterar )

Foto do Google+

Você está comentando utilizando sua conta Google+. Sair / Alterar )

Conectando a %s