Java: PdfPageCount.java

O código está meio ruim de ler aqui na página por causa do tamanho das letras e do comprimento das linhas. Sugiro copiar e colar em um editor com realce de sintaxe. Obs.: eu testei com vários arquivos PDF, mas é certo que há erros à espreita. Considere-se avisado.

Existe uma versão mais nova, com suporte a mais tipos de PDF, no branch experimental do repositório no BitBucket.

Versão usada nos testes de 2010-10-27:

PdfPageCount.java v2010-10-27 no repositório do BitBucket

import java.util.*;
import java.util.regex.*;
import java.io.*;
import static java.lang.System.out;
import static java.lang.System.err;
import static java.lang.Character.isWhitespace;
import static java.lang.Character.isDigit;

/*
Este programa mostra na tela o número de páginas de um
ou mais arquivos PDF passados na linha de comando.
Este código fonte é de domínio público (PUBLIC DOMAIN)
Desenvolvido em Java 5/6
Fiz com classes internas para poder colocar tudo
num arquivo só.
Marcus.
*/

public final class PdfPageCount {

    /**
     * Tells whether a character terminates a word-like token.
     *
     * @param ch the character to classify
     * @return {@code true} if {@code ch} is one of {@code ( ) < > [ ] { } / %}
     *         or is whitespace according to {@link Character#isWhitespace(char)}
     */
    static boolean isDelimiter(char ch) {
        switch(ch) {
            case '(': case ')':
            case '<': case '>':
            case '[': case ']':
            case '{': case '}':
            case '/': case '%':
                return true;
            default:
                return isWhitespace(ch);
        }
    }

    /** Common base type of every value produced by the parser in this file. */
    abstract static class PdfObject {
    }

    /** A numeric value, always stored as a {@code double}. */
    static class PdfNumberObj extends PdfObject {
        double number;

        PdfNumberObj(double number) {
            this.number = number;
        }
    }
    /** A string value (built from both literal and hexadecimal string tokens). */
    static class PdfStrObj extends PdfObject {
        String str;

        PdfStrObj(String str) {
            this.str = str;
        }
    }
    /** A dictionary: a map from key text to parsed value. */
    static class PdfDictObj extends PdfObject {
        Map<String, PdfObject> dict;

        PdfDictObj(Map<String, PdfObject> dict) {
            this.dict = dict;
        }
    }
    /** An array: the parsed elements in file order. */
    static class PdfArrayObj extends PdfObject {
        List<PdfObject> array;

        PdfArrayObj(List<PdfObject> array) {
            this.array = array;
        }
    }
    static class PdfReferenceObj extends PdfObject {
        long objNum;
        int genNum;
        PdfReferenceObj(long obj, int gen) { objNum = obj; genNum = gen; }
        public boolean equals(Object o) {
            if(o == null || !(o instanceof PdfReferenceObj))
                return false;
            else {
                PdfReferenceObj p = (PdfReferenceObj) o;
                return objNum == p.objNum && genNum == p.genNum;
            }
        }
        public int hashCode() {
            return (int)((objNum ^ (objNum >>> 32)) ^ (31*genNum)); // I guess genNum will be usually 0...
        }
    }
    /**
     * A lexical token. {@code t} is the token type tag (for example
     * {@code "NUMBER"}, {@code "NAME"}, {@code "<<"} or a keyword) and
     * {@code value} is the token text for the types that carry one,
     * otherwise {@code null}.
     */
    static class PdfTokenObj extends PdfObject {
        String t;
        String value;

        PdfTokenObj(String type, String value) {
            this.t = type;
            this.value = value;
        }
    }
    /** A stream object: its dictionary plus a byte array for its data. */
    static class PdfStreamObj extends PdfObject {
        Map<String, PdfObject> dict;
        byte[] stream;

        PdfStreamObj(Map<String, PdfObject> dict, byte[] stream) {
            this.dict = dict;
            this.stream = stream;
        }
    }

    /**
     * Advances the file past any run of whitespace, leaving the file pointer
     * on the first non-whitespace byte, or at end of file if only whitespace
     * remained.
     *
     * @param aPdfFile file positioned where whitespace may start
     * @throws Exception if reading or seeking fails
     */
    static void skipSpaces(RandomAccessFile aPdfFile) throws Exception {
        int by;
        do {
            by = aPdfFile.read();
        } while(by != -1 && isWhitespace((char)by));
        // read() does not advance at end of file, so only a real byte is un-read.
        // Seeking back at EOF would put the pointer on the last whitespace byte
        // and make callers process it again and again.
        if(by != -1) {
            aPdfFile.seek(aPdfFile.getFilePointer() - 1L);
        }
    }

    /**
     * Tokenizer over a PDF file opened as a {@link RandomAccessFile}.
     * Tokens are read at the current file pointer; tokens pushed back with
     * {@link #unreadToken(PdfTokenObj)} are returned first, most recent first.
     * Reaching end of file where more input is required raises
     * {@link EOFException} instead of looping.
     */
    static class PdfTokenReader {
        /** Words accepted by {@link #trataWord()}; the word itself is the token type. */
        private static final String[] KEYWORDS = {
            "obj", "endobj", "stream", "endstream",
            "true", "false", "null", "xref", "trailer"};
        /** Non-digit characters accepted as part of a number token. */
        private static final String NUMBER_SIGNS = ".+-";

        private ArrayList<PdfTokenObj> tokenBuffer = new ArrayList<PdfTokenObj>();
        // The xref turned out to be needed in more places, so it is kept
        // here for easy access.
        PdfXref xref;
        RandomAccessFile pdfFile;

        /** @param aPdfFile the file tokens are read from */
        public PdfTokenReader(RandomAccessFile aPdfFile) {
            pdfFile = aPdfFile;
        }

        /**
         * Pushes a token back so the next {@link #nextToken()} returns it.
         * Pushed-back tokens come out in last-in, first-out order.
         */
        public void unreadToken(PdfTokenObj token) {
            tokenBuffer.add(token);
        }

        /** Reads one byte, failing with {@link EOFException} at end of file. */
        private int readByteOrFail(String what) throws IOException {
            int by = pdfFile.read();
            if(by == -1) {
                throw new EOFException("Fim de arquivo inesperado lendo " + what);
            }
            return by;
        }

        /**
         * Steps back over the byte just read. Does nothing when {@code by} is
         * -1, because read() does not advance the pointer at end of file.
         */
        private void unreadByte(int by) throws IOException {
            if(by != -1) {
                pdfFile.seek(pdfFile.getFilePointer() - 1L);
            }
        }

        /** True for a digit or one of {@code . + -}. */
        private static boolean isNumberChar(int by) {
            return by != -1
                && (isDigit((char)by) || NUMBER_SIGNS.indexOf((char)by) != -1);
        }

        /**
         * Reads a word and turns it into a keyword token.
         *
         * @throws Exception if the word is not one of the known keywords
         */
        private PdfTokenObj trataWord() throws Exception {
            String word = readWord();
            for(String keyword : KEYWORDS) {
                if(keyword.equals(word)) {
                    return new PdfTokenObj(word, null);
                }
            }
            throw new Exception(
                "Erro no PDF no byte " + pdfFile.getFilePointer()
                + " lendo " + word);
        }

        /**
         * Returns the next token, skipping whitespace and comments.
         *
         * @return a pushed-back token if any, otherwise the next token in the file
         * @throws EOFException if the file ends before a token starts
         * @throws Exception on an unknown word or an I/O failure
         */
        public PdfTokenObj nextToken() throws Exception {
            if(!tokenBuffer.isEmpty()) {
                return tokenBuffer.remove(tokenBuffer.size() - 1);
            }
            int by;
            while(true) {
                by = readByteOrFail("token");
                if(by == '%') {
                    skipComment();
                } else if(!isWhitespace((char)by)) {
                    break;
                }
                // whitespace: just keep reading
            }
            if(isNumberChar(by)) {
                unreadByte(by);
                return new PdfTokenObj("NUMBER", readNumber());
            }
            switch((char)by) {
                case '<':
                    by = pdfFile.read();
                    if(by == '<') {
                        return new PdfTokenObj("<<", null);
                    }
                    unreadByte(by);
                    return new PdfTokenObj("HX_STRING", readHxString());
                case '>':
                    by = pdfFile.read();
                    if(by == '>') {
                        return new PdfTokenObj(">>", null);
                    }
                    unreadByte(by);
                    // Should not happen: readHxString consumes the '>'.
                    return new PdfTokenObj(">", null);
                case '(':
                    return new PdfTokenObj("STRING", readString());
                case '/':
                    return new PdfTokenObj("NAME", readName());
                case '[':
                    return new PdfTokenObj("[", null);
                case ']':
                    return new PdfTokenObj("]", null);
                case '{':
                    return new PdfTokenObj("CODE", readCode());
                case 'R':
                    // A lone 'R' (followed by a delimiter or EOF) is the
                    // reference operator; otherwise it starts a word.
                    by = pdfFile.read();
                    unreadByte(by);
                    if(by == -1 || isDelimiter((char)by)) {
                        return new PdfTokenObj("R", null);
                    }
                    pdfFile.seek(pdfFile.getFilePointer() - 1L); // back onto the 'R'
                    return trataWord();
                default:
                    pdfFile.seek(pdfFile.getFilePointer() - 1L);
                    return trataWord();
            }
        }

        /** Reads bytes up to (not including) the next delimiter or end of file. */
        private String readWord() throws Exception {
            StringBuilder result = new StringBuilder();
            int by = pdfFile.read();
            while(by != -1 && !isDelimiter((char)by)) {
                result.append((char)by);
                by = pdfFile.read();
            }
            unreadByte(by);
            return result.toString();
        }

        /** Reads a run of digits and {@code . + -} characters as text. */
        private String readNumber() throws Exception {
            StringBuilder result = new StringBuilder();
            int by = pdfFile.read();
            while(isNumberChar(by)) {
                result.append((char)by);
                by = pdfFile.read();
            }
            unreadByte(by);
            return result.toString();
        }

        /** Skips to the end of the current line (CR, LF or CRLF) or end of file. */
        private void skipComment() throws Exception {
            int by;
            do {
                by = pdfFile.read();
                if(by == '\r') {
                    by = pdfFile.read();
                    if(by != '\n') {
                        unreadByte(by);
                    }
                    return;
                }
            } while(by != '\n' && by != -1);
        }

        /**
         * Reads a brace-delimited code block as a single token, because its
         * content is of no interest here (it is really made of several tokens).
         * The opening brace has already been consumed.
         *
         * @throws EOFException if the closing brace is missing
         */
        private String readCode() throws Exception {
            int depth = 0;
            StringBuilder result = new StringBuilder();
            while(true) {
                int by = readByteOrFail("código");
                if(by == '{') {
                    ++depth;
                    result.append('{');
                } else if(by == '}') {
                    if(--depth < 0) {
                        return result.toString();
                    }
                    result.append('}');
                } else {
                    result.append((char)by);
                }
            }
        }

        /** Reads a name; the leading '/' has already been consumed. */
        private String readName() throws Exception {
            return readWord();
        }

        /**
         * Reads the raw text of a hexadecimal string up to and including the
         * closing '>', which is not part of the result.
         *
         * @throws EOFException if the closing '>' is missing
         */
        private String readHxString() throws Exception {
            StringBuilder result = new StringBuilder();
            int by = readByteOrFail("string hexadecimal");
            while(by != '>') {
                result.append((char)by);
                by = readByteOrFail("string hexadecimal");
            }
            return result.toString();
        }

        /**
         * Reads a literal string; the opening '(' has already been consumed.
         * Balanced nested parentheses are kept in the result, both sides.
         *
         * @throws EOFException if the closing ')' is missing
         */
        private String readString() throws Exception {
            int parLevel = 0;
            StringBuilder result = new StringBuilder();
            while(true) {
                int by = readByteOrFail("string");
                switch(by) {
                    case '\\':
                        readEscape(result);
                        break;
                    case '(':
                        ++parLevel;
                        result.append('(');
                        break;
                    case ')':
                        if(--parLevel < 0) {
                            return result.toString();
                        }
                        result.append(')');
                        break;
                    default:
                        result.append((char)by);
                        break;
                }
            }
        }

        /**
         * Decodes the escape sequence that follows a backslash inside a
         * literal string and appends the result, if any.
         * Handles n, r, t, b, f, parentheses, backslash, one to three octal
         * digits, and backslash-newline (line continuation, which appends
         * nothing). For any other character the backslash is ignored and the
         * character itself is kept.
         */
        private void readEscape(StringBuilder result) throws IOException {
            int by = readByteOrFail("string");
            switch(by) {
                case 'n': result.append('\n'); break;
                case 'r': result.append('\r'); break;
                case 't': result.append('\t'); break;
                case 'b': result.append('\b'); break;
                case 'f': result.append('\f'); break;
                case '(':
                case ')':
                case '\\':
                    result.append((char)by);
                    break;
                case '\n':
                    break; // line continuation
                case '\r':
                    // line continuation; swallow the LF of a CRLF pair too
                    by = pdfFile.read();
                    if(by != '\n') {
                        unreadByte(by);
                    }
                    break;
                default:
                    if(by >= '0' && by <= '7') {
                        int code = by - '0';
                        for(int i = 0; i < 2; ++i) {
                            int next = pdfFile.read();
                            if(next < '0' || next > '7') {
                                unreadByte(next);
                                break;
                            }
                            code = code * 8 + (next - '0');
                        }
                        result.append((char)(code & 0xFF));
                    } else {
                        result.append((char)by);
                    }
                    break;
            }
        }
    }

    /**
     * Checks that a token type is the one required at this point.
     *
     * @param tok1 the token type that was found
     * @param tok2 the token type that was required
     * @throws Exception if the two differ
     */
    static void expect(String tok1, String tok2) throws Exception {
        if(tok1.equals(tok2)) {
            return;
        }
        throw new Exception("Encontrou " + tok1 + " quando esperava " + tok2);
    }

    /** One entry of a cross-reference table. */
    static class PdfXrefItem {
        /** Object number of the entry. */
        long id;
        /** Byte offset of the object in the file; only set for entries of type 'n'. */
        long bytePos;
        /** Generation number of the entry. */
        int genNr;
        /** Entry type byte as found in the table line ('n' marks a readable object). */
        byte type;
    }

    /**
     * Cross-reference table: maps (object number, generation) to the entry
     * describing where the object is. When the same key is added more than
     * once, the first entry added is the one kept.
     */
    static class PdfXref {
        private HashMap<PdfReferenceObj, PdfXrefItem> objs = new HashMap<PdfReferenceObj, PdfXrefItem>();

        /**
         * Parses one {@code xref} section (all of its subsections) starting at
         * the reader's current position and adds its entries to this table.
         *
         * @param tokenReader reader positioned on the {@code xref} keyword
         * @throws Exception on unexpected tokens or I/O failure
         */
        public void parseXref(PdfTokenReader tokenReader) throws Exception {
            RandomAccessFile file = tokenReader.pdfFile;
            PdfTokenObj token = tokenReader.nextToken();
            expect(token.t, "xref");
            int peeked;
            do {
                // Subsection header: first object number and entry count.
                token = tokenReader.nextToken();
                expect(token.t, "NUMBER");
                long firstId = Long.parseLong(token.value);

                token = tokenReader.nextToken();
                expect(token.t, "NUMBER");
                int entryCount = Integer.parseInt(token.value);

                skipSpaces(file);
                if(entryCount != 0) {
                    // Every entry line is read as exactly 20 bytes.
                    byte[] entries = new byte[20 * entryCount];
                    file.readFully(entries);
                    addBlock(firstId, entryCount, entries);
                }

                // Peek one byte: a digit means another subsection follows.
                skipSpaces(file);
                peeked = file.read();
                file.seek(file.getFilePointer() - 1L);
            } while(isDigit((char)peeked));
        }

        /**
         * Adds the entries of one subsection.
         *
         * @param startId   object number of the first entry
         * @param nrOfLines number of 20-byte entries in {@code bytesXref}
         * @param bytesXref the raw entry lines, back to back
         */
        public void addBlock(long startId, int nrOfLines, byte[] bytesXref) throws Exception {
            byte[] entry = new byte[20];
            int offset = 0;
            for(int line = 0; line < nrOfLines; ++line) {
                System.arraycopy(bytesXref, offset, entry, 0, 20);
                addObj(startId + line, entry);
                offset += 20;
            }
        }

        /**
         * Parses one 20-byte entry line and registers it, unless an entry with
         * the same object number and generation is already present.
         *
         * @param id        object number of the entry
         * @param bytesLine the raw entry: offset in columns 0-9, generation in
         *                  columns 11-15, type byte in column 17
         */
        public void addObj(long id, byte[] bytesLine) throws Exception {
            String text = new String(bytesLine, "ISO-8859-1");
            byte entryType = bytesLine[17];
            PdfXrefItem item = new PdfXrefItem();
            item.type = entryType;
            item.id = id;
            if(entryType == 'n') {
                item.bytePos = Long.parseLong(text.substring(0, 10));
            }
            item.genNr = Integer.parseInt(text.substring(11, 16));
            PdfReferenceObj key = new PdfReferenceObj(id, item.genNr);
            if(objs.containsKey(key)) {
                return;
            }
            objs.put(key, item);
        }

        /** @return the entry for {@code reference}, or {@code null} if there is none */
        public PdfXrefItem getObj(PdfReferenceObj reference) {
            return objs.get(reference);
        }
    }

    /**
     * Parses the body of an indirect object (after the {@code obj} keyword)
     * and requires the {@code endobj} keyword after it.
     *
     * @return the parsed body
     * @throws Exception if {@code endobj} does not follow the body
     */
    private static PdfObject objhelper(PdfTokenReader tokenReader) throws Exception {
        PdfObject body = parseObject(tokenReader);
        expect(tokenReader.nextToken().t, "endobj");
        return body;
    }

    /**
     * Parses one object at the reader's current position.
     * Handles arrays, dictionaries (optionally followed by a stream), strings,
     * numbers, indirect references ({@code n g R}) and indirect object
     * definitions ({@code n g obj ... endobj}). Any other token is returned
     * unchanged.
     *
     * @param tokenReader the token source
     * @return the parsed object
     * @throws Exception on malformed input or I/O failure
     */
    static PdfObject parseObject(PdfTokenReader tokenReader) throws Exception {
        PdfTokenObj first = tokenReader.nextToken();
        String type = first.t;

        if("[".equals(type)) {
            return parseArray(tokenReader);
        }
        if("obj".equals(type)) {
            return objhelper(tokenReader);
        }
        if("STRING".equals(type) || "HX_STRING".equals(type)) {
            return new PdfStrObj(first.value);
        }
        if("<<".equals(type)) {
            PdfDictObj dictionary = parseDict(tokenReader);
            PdfTokenObj following = tokenReader.nextToken();
            if("stream".equals(following.t)) {
                return readStream(tokenReader, dictionary.dict);
            }
            tokenReader.unreadToken(following);
            return dictionary;
        }
        if(!"NUMBER".equals(type)) {
            return first;
        }

        // A number may start "n g obj" or "n g R"; look ahead up to two tokens
        // and push back whatever turns out not to belong to this object.
        PdfTokenObj second = tokenReader.nextToken();
        if("NUMBER".equals(second.t)) {
            PdfTokenObj third = tokenReader.nextToken();
            if("obj".equals(third.t)) {
                return objhelper(tokenReader);
            }
            if("R".equals(third.t)) {
                return new PdfReferenceObj(Long.parseLong(first.value), Integer.parseInt(second.value));
            }
            tokenReader.unreadToken(third);
        }
        tokenReader.unreadToken(second);
        return new PdfNumberObj(Double.parseDouble(first.value));
    }

    /**
     * Parses dictionary entries up to the closing {@code >>}; the opening
     * {@code <<} has already been consumed. Each entry is stored under the
     * text of its key token.
     *
     * @throws Exception if {@code >>} appears where a value is required
     */
    static PdfDictObj parseDict(PdfTokenReader tokenReader) throws Exception {
        HashMap<String, PdfObject> entries = new HashMap<String, PdfObject>();
        for(PdfTokenObj key = tokenReader.nextToken();
                !">>".equals(key.t);
                key = tokenReader.nextToken()) {
            PdfObject value = parseObject(tokenReader);
            boolean closesDict = value instanceof PdfTokenObj
                && ">>".equals(((PdfTokenObj) value).t);
            if(closesDict) {
                throw new Exception(">> inesperado em pos = " + tokenReader.pdfFile.getFilePointer());
            }
            entries.put(key.value, value);
        }
        return new PdfDictObj(entries);
    }

    /**
     * Parses array elements up to the closing {@code ]}; the opening
     * {@code [} has already been consumed.
     */
    static PdfArrayObj parseArray(PdfTokenReader tokenReader) throws Exception {
        ArrayList<PdfObject> items = new ArrayList<PdfObject>();
        PdfTokenObj token = tokenReader.nextToken();
        while(!token.t.equals("]")) {
            // Not the end yet: give the token back so parseObject sees it.
            tokenReader.unreadToken(token);
            items.add(parseObject(tokenReader));
            token = tokenReader.nextToken();
        }
        return new PdfArrayObj(items);
    }

    /**
     * Skips over the data of a stream whose dictionary has just been parsed
     * and whose {@code stream} keyword has just been consumed, then requires
     * the {@code endstream} keyword.
     * The stream data is not read: the returned object carries a zero-filled
     * array of the declared length.
     *
     * @param tokenReader the token source, positioned right after {@code stream}
     * @param dict        the stream dictionary; its {@code Length} entry may be
     *                    a number or an indirect reference to one
     * @return the stream object
     * @throws Exception if {@code Length} is missing, not numeric, negative or
     *                   points past the end of the file, or if
     *                   {@code endstream} does not follow the data
     */
    static PdfStreamObj readStream(PdfTokenReader tokenReader, Map<String, PdfObject> dict) throws Exception {
        RandomAccessFile file = tokenReader.pdfFile;

        // Consume the end-of-line after "stream": LF or CRLF. Any other byte
        // already belongs to the data and is put back.
        int by = file.read();
        if(by == '\r') {
            by = file.read();
            if(by != '\n' && by != -1) {
                file.seek(file.getFilePointer() - 1L);
            }
        } else if(by != '\n' && by != -1) {
            file.seek(file.getFilePointer() - 1L);
        }

        PdfObject lenObj = dict.get("Length");
        if(lenObj instanceof PdfReferenceObj) {
            if(tokenReader.xref == null) {
                throw new Exception("/Length indireto sem xref em pos = " + file.getFilePointer());
            }
            // Resolving the reference moves the file pointer; restore it after.
            long savedPos = file.getFilePointer();
            lenObj = findIndirectObj(tokenReader, tokenReader.xref, (PdfReferenceObj) lenObj);
            file.seek(savedPos);
        }
        if(!(lenObj instanceof PdfNumberObj)) {
            throw new Exception("Stream sem /Length numérico em pos = " + file.getFilePointer());
        }

        long len = (long) ((PdfNumberObj) lenObj).number;
        long dataStart = file.getFilePointer();
        if(len < 0L || len > Integer.MAX_VALUE || dataStart + len > file.length()) {
            throw new Exception("/Length inválido (" + len + ") em pos = " + dataStart);
        }
        byte[] bytes = new byte[(int) len];
        // Pretend to read the stream: just jump over it.
        file.seek(dataStart + len);
        expect(tokenReader.nextToken().t, "endstream");
        return new PdfStreamObj(dict, bytes);
    }

    /**
     * Seeks to the position recorded in a cross-reference entry and parses
     * the object found there.
     *
     * @param xrefItem the entry to follow; must be of type 'n'
     * @throws Exception if the entry is not of type 'n' or parsing fails
     */
    static PdfObject readIndirectObj(PdfTokenReader tokenReader, PdfXrefItem xrefItem) throws Exception {
        if(xrefItem.type != 'n') {
            throw new Exception("Procurando objeto inválido " + xrefItem.id);
        }
        tokenReader.pdfFile.seek(xrefItem.bytePos);
        return parseObject(tokenReader);
    }

    /**
     * Looks a reference up in the cross-reference table and parses the
     * object it points to.
     *
     * @param tokenReader the token source (its file pointer is moved)
     * @param xref        the table to look the reference up in
     * @param reference   the reference to resolve
     * @return the parsed object
     * @throws Exception if the reference is {@code null}, is not listed in
     *                   {@code xref}, or the object cannot be parsed
     */
    static PdfObject findIndirectObj(PdfTokenReader tokenReader, PdfXref xref, PdfReferenceObj reference) throws Exception {
        if(reference == null) {
            throw new Exception("Referência nula ao procurar objeto indireto");
        }
        PdfXrefItem xrefItem = xref.getObj(reference);
        if(xrefItem == null) {
            // Fail with a readable message instead of a NullPointerException.
            throw new Exception("Objeto " + reference.objNum + " " + reference.genNum
                + " não consta no xref");
        }
        return readIndirectObj(tokenReader, xrefItem);
    }

    /**
     * Parses a trailer: the {@code trailer} keyword followed by a dictionary.
     *
     * @return the trailer dictionary
     * @throws Exception if the keyword or the opening {@code <<} is missing
     */
    static PdfObject parseTrailer(PdfTokenReader tokenReader) throws Exception {
        expect(tokenReader.nextToken().t, "trailer");
        expect(tokenReader.nextToken().t, "<<");
        return parseDict(tokenReader);
    }

    /**
     * Reads the page count from the first object of the file, which must be
     * a dictionary with a {@code Linearized} key and a numeric {@code N} entry.
     *
     * @param tokenReader reader over {@code pdfFile}
     * @param pdfFile     the file; it is rewound to offset 0
     * @return the value of {@code N}, truncated to {@code int}
     * @throws Exception if the dictionary lacks {@code Linearized} or {@code N}
     */
    static int linearized(PdfTokenReader tokenReader, RandomAccessFile pdfFile) throws Exception {
        pdfFile.seek(0L);
        Map<String, PdfObject> firstDict = ((PdfDictObj) parseObject(tokenReader)).dict;
        PdfNumberObj pageCount = null;
        if(firstDict.containsKey("Linearized")) {
            pageCount = (PdfNumberObj) firstDict.get("N");
        }
        if(pageCount == null) {
            throw new Exception("Não achou o número de páginas (/Linearized)");
        }
        return (int) pageCount.number;
    }


    /**
     * Returns the number of pages of a PDF file.
     * <p>
     * The file must start with {@code %PDF-}. The {@code startxref} offset is
     * taken from the last 400 bytes. If that offset is 0, or no {@code xref}
     * keyword is found there, the count is taken from the linearization
     * dictionary at the start of the file. Otherwise every xref section in
     * the {@code /Prev} chain is loaded, the catalog is found through the
     * first trailer that has {@code /Root}, and the result is the
     * {@code /Count} of the catalog's {@code /Pages} object.
     *
     * @param fileName path of the PDF file
     * @return the page count
     * @throws Exception if the file is not a PDF, is malformed, or uses a
     *                   structure this parser does not handle; the file
     *                   position of the failure is printed to standard error
     */
    static int getNumberOfPages(String fileName) throws Exception {
        final int bufferSize = 5;
        RandomAccessFile pdfFile = new RandomAccessFile(fileName, "r");
        try {
            byte[] comecoPdf = new byte[bufferSize];
            pdfFile.read(comecoPdf);
            byte[] pdfSig = {'%', 'P', 'D', 'F', '-'};
            for(int i = 0; i < comecoPdf.length && i < pdfSig.length; ++i){
                if(pdfSig[i] != comecoPdf[i])
                    throw new Exception("Não é um arquivo PDF");
            }

            // Locate "startxref <offset> %%EOF" in the tail of the file.
            long endBufferSize = 400L;
            if(pdfFile.length() < endBufferSize)
                endBufferSize = pdfFile.length();
            pdfFile.seek(pdfFile.length() - endBufferSize);
            byte[] startxref = new byte[(int)endBufferSize];
            pdfFile.readFully(startxref);
            String strStartxref = new String(startxref, "ISO-8859-1");
            Pattern pat = Pattern.compile("^.*startxref\\s+(\\d+)\\s+%%EOF.*$", Pattern.DOTALL);
            Matcher regExMatch = pat.matcher(strStartxref);
            if(!regExMatch.matches()) {
                throw new Exception("Nao achou o número de páginas (startxref)");
            }

            PdfTokenReader tokenReader = new PdfTokenReader(pdfFile);
            long pos = Long.parseLong(regExMatch.group(1));
            if(pos == 0L) {
                return linearized(tokenReader, pdfFile);
            }
            pdfFile.seek(pos);
            PdfTokenObj token = tokenReader.nextToken();
            if(!token.t.equals("xref")) {
                return linearized(tokenReader, pdfFile);
            }
            tokenReader.unreadToken(token);

            // Load every xref section, newest first, following /Prev.
            ArrayList<Map<String, PdfObject>> trailers = new ArrayList<Map<String, PdfObject>>();
            PdfXref xref = new PdfXref();
            tokenReader.xref = xref;
            Set<Long> visited = new HashSet<Long>();
            visited.add(pos);
            while(true) {
                xref.parseXref(tokenReader);
                Map<String, PdfObject> trailer = ((PdfDictObj) parseTrailer(tokenReader)).dict;
                trailers.add(trailer);
                PdfObject prev = trailer.get("Prev");
                if(prev == null) {
                    break;
                }
                if(!(prev instanceof PdfNumberObj)) {
                    throw new Exception("/Prev inválido no trailer");
                }
                long prevPos = (long) ((PdfNumberObj) prev).number;
                // A /Prev pointing at an already visited section would loop forever.
                if(!visited.add(prevPos)) {
                    throw new Exception("Ciclo na cadeia de /Prev em " + prevPos);
                }
                pdfFile.seek(prevPos);
            }

            // The catalog comes from the first trailer that names a /Root.
            PdfObject catalog = null;
            for(Map<String, PdfObject> tr : trailers) {
                PdfObject root = tr.get("Root");
                if(root != null) {
                    if(!(root instanceof PdfReferenceObj)) {
                        throw new Exception("Não achou o número de páginas (Root)");
                    }
                    catalog = findIndirectObj(tokenReader, xref, (PdfReferenceObj) root);
                    break;
                }
            }
            if(!(catalog instanceof PdfDictObj)) {
                throw new Exception("Não achou o número de páginas (catalog)");
            }

            PdfObject pages = ((PdfDictObj) catalog).dict.get("Pages");
            if(pages instanceof PdfReferenceObj) {
                pages = findIndirectObj(tokenReader, xref, (PdfReferenceObj) pages);
            }
            if(!(pages instanceof PdfDictObj)) {
                throw new Exception("Não achou o número de páginas (Pages)");
            }

            PdfObject count = ((PdfDictObj) pages).dict.get("Count");
            if(count instanceof PdfReferenceObj) {
                count = findIndirectObj(tokenReader, xref, (PdfReferenceObj) count);
            }
            if(!(count instanceof PdfNumberObj)) {
                throw new Exception("Não achou o número de páginas (Count)");
            }
            return (int) ((PdfNumberObj) count).number;
        } catch(Exception e) {
            err.printf("Ocorreu erro na posição %d\n", pdfFile.getFilePointer());
            throw e;
        } finally {
            pdfFile.close();
        }
    }

    /**
     * Command-line entry point: prints the page count of each file named in
     * {@code args}. A failure on one file is reported and processing goes on
     * with the next; a summary line is printed when more than one file was
     * given. With no arguments, prints a usage line.
     */
    public static void main(String[] args) throws Exception {
        if(args.length < 1) {
            out.println("Uso: PdfPageCount <nome-arquivo.pdf>");
            return;
        }
        int sucessos = 0;
        int falhas = 0;
        for(int i = 0; i < args.length; ++i) {
            String fileName = args[i];
            try {
                int pageCount = getNumberOfPages(fileName);
                out.printf("Número de páginas de %s: %d\n", fileName, pageCount);
                ++sucessos;
            } catch(Exception e) {
                out.println(e);
                out.printf("Ocorreu um erro ao processar %s. Continuando...\n", fileName);
                ++falhas;
            }
        }
        if(args.length > 1) {
            out.printf("Fim do processo. %d sucesso(s), %d falha(s).", sucessos, falhas);
        }
    }
}

Versão usada nos testes de 2010-09-19:

PdfPageCount.java v2010-09-19 no repositório do BitBucket

import java.util.*;
import java.util.regex.*;
import java.io.*;
import static java.lang.System.out;
import static java.lang.System.err;
import static java.lang.Character.isWhitespace;
import static java.lang.Character.isDigit;

/*
Este programa mostra na tela o número de páginas de um
ou mais arquivos PDF passados na linha de comando.
Este código fonte é de domínio público (PUBLIC DOMAIN)
Desenvolvido em Java 5/6.
Fiz com classes internas para poder colocar tudo
num arquivo só.
Marcus.
*/

public final class PdfPageCount {

    /**
     * Tells whether a character terminates a word-like token.
     *
     * @param ch the character to classify
     * @return {@code true} if {@code ch} is one of {@code ( ) < > [ ] { } / %}
     *         or is whitespace according to {@link Character#isWhitespace(char)}
     */
    static boolean isDelimiter(char ch) {
        final String punctuation = "()<>[]{}/%";
        return punctuation.indexOf(ch) != -1 || isWhitespace(ch);
    }

    /** Common base type of every value produced by the parser in this file. */
    abstract static class PdfObject {
    }

    /** A numeric value, always stored as a {@code double}. */
    static class PdfNumberObj extends PdfObject {
        double number;

        PdfNumberObj(double number) {
            this.number = number;
        }
    }
    /** A string value. */
    static class PdfStrObj extends PdfObject {
        String str;

        PdfStrObj(String str) {
            this.str = str;
        }
    }
    /** A dictionary: a map from key text to parsed value. */
    static class PdfDictObj extends PdfObject {
        Map<String, PdfObject> dict;

        PdfDictObj(Map<String, PdfObject> dict) {
            this.dict = dict;
        }
    }
    /** An array: the parsed elements in order. */
    static class PdfArrayObj extends PdfObject {
        List<PdfObject> array;

        PdfArrayObj(List<PdfObject> array) {
            this.array = array;
        }
    }
    static class PdfReferenceObj extends PdfObject {
        long objNum, genNum;
        PdfReferenceObj(long obj, long gen) { objNum = obj; genNum = gen; }
        public boolean equals(Object o) {
            if(o == null || !(o instanceof PdfReferenceObj))
                return false;
            else {
                PdfReferenceObj p = (PdfReferenceObj) o;
                return objNum == p.objNum && genNum == p.genNum;
            }
        }
        public int hashCode() {
            return (int)((objNum ^ (objNum >>> 32)) ^ (31*(genNum ^ (genNum >>> 32)))); // I guess genNum will be usually 0...
        }
    }
    /**
     * A lexical token. {@code t} is the token type tag and {@code value} is
     * the token text for the types that carry one, otherwise {@code null}.
     */
    static class PdfTokenObj extends PdfObject {
        String t;
        String value;

        PdfTokenObj(String type, String value) {
            this.t = type;
            this.value = value;
        }
    }
    /** A stream object: its dictionary plus a byte array for its data. */
    static class PdfStreamObj extends PdfObject {
        Map<String, PdfObject> dict;
        byte[] stream;

        PdfStreamObj(Map<String, PdfObject> dict, byte[] stream) {
            this.dict = dict;
            this.stream = stream;
        }
    }

    /**
     * Advances the file past any run of whitespace, leaving the file pointer
     * on the first non-whitespace byte, or at end of file if only whitespace
     * remained.
     *
     * @param aPdfFile file positioned where whitespace may start
     * @throws Exception if reading or seeking fails
     */
    static void skipSpaces(RandomAccessFile aPdfFile) throws Exception {
        while(true) {
            int by = aPdfFile.read();
            if(by == -1) {
                // read() did not advance at end of file, so there is nothing
                // to un-read. Seeking back here would land on the last
                // whitespace byte and make callers process it again and again.
                return;
            }
            if(!isWhitespace((char)by)) {
                aPdfFile.seek(aPdfFile.getFilePointer() - 1L);
                return;
            }
        }
    }

    static class PdfTokenReader {
        private ArrayList<PdfTokenObj> tokenBuffer = new ArrayList<PdfTokenObj>();
        public PdfXref xref; // Descobri que preciso do xref em mais lugares, então
        // criei esta propriedade para ter fácil acesso ao xref.
        public RandomAccessFile pdfFile;
        /**
         * Creates a reader over the given file.
         *
         * @param aPdfFile the file tokens are read from
         */
        public PdfTokenReader(RandomAccessFile aPdfFile) {
            this.pdfFile = aPdfFile;
        }
        /**
         * Pushes a token back onto the end of the token buffer.
         *
         * @param token the token to push back
         */
        public void unreadToken(PdfTokenObj token) {
            this.tokenBuffer.add(token);
        }
        /**
         * Reads a word and turns it into a keyword token whose type is the
         * word itself.
         *
         * @throws Exception if the word is not one of the known keywords
         */
        private PdfTokenObj trataWord() throws Exception {
            final String[] keywords = {"obj", "endobj", "stream", "endstream",
                    "true", "false", "null", "xref", "trailer"};
            String word = readWord();
            int i = 0;
            while(i < keywords.length) {
                if(word.equals(keywords[i])) {
                    return new PdfTokenObj(word, null);
                }
                ++i;
            }
            throw new Exception(
                "Erro no PDF no byte " + pdfFile.getFilePointer()
                + " lendo " + word);
        }
        /**
         * Returns the next token, skipping whitespace and comments.
         * Tokens pushed back with unreadToken are returned first, most
         * recent first.
         *
         * @return the next token
         * @throws EOFException if the file ends before a token starts
         * @throws Exception on an unknown word or an I/O failure
         */
        public PdfTokenObj nextToken() throws Exception {
            if(!tokenBuffer.isEmpty()) {
                return tokenBuffer.remove(tokenBuffer.size() - 1);
            }
            int by;
            while(true) {
                by = pdfFile.read();
                if(by == -1) {
                    // read() does not advance at end of file; seeking back and
                    // re-reading would return the last byte over and over.
                    throw new EOFException("Fim de arquivo inesperado procurando token");
                }
                if(by == '%') {
                    skipComment();
                } else if(!isWhitespace((char)by)) {
                    break;
                }
                // whitespace: just keep reading
            }
            if(isDigit((char)by) || ".+-".indexOf((char)by) != -1) {
                pdfFile.seek(pdfFile.getFilePointer() - 1L);
                return new PdfTokenObj("NUMBER", readNumber());
            }
            switch((char)by) {
                case '<':
                    by = pdfFile.read();
                    if(by == '<') {
                        return new PdfTokenObj("<<", null);
                    }
                    if(by != -1) {
                        pdfFile.seek(pdfFile.getFilePointer() - 1L);
                    }
                    return new PdfTokenObj("HX_STRING", readHxString());
                case '>':
                    by = pdfFile.read();
                    if(by == '>') {
                        return new PdfTokenObj(">>", null);
                    }
                    if(by != -1) {
                        pdfFile.seek(pdfFile.getFilePointer() - 1L);
                    }
                    // Should not happen: readHxString consumes the '>'.
                    return new PdfTokenObj(">", null);
                case '(':
                    return new PdfTokenObj("STRING", readString());
                case '/':
                    return new PdfTokenObj("NAME", readName());
                case '[':
                    return new PdfTokenObj("[", null);
                case ']':
                    return new PdfTokenObj("]", null);
                case '{':
                    return new PdfTokenObj("CODE", readCode());
                case 'R':
                    // A lone 'R' (followed by a delimiter or EOF) is the
                    // reference operator; otherwise it starts a word.
                    by = pdfFile.read();
                    if(by == -1) {
                        return new PdfTokenObj("R", null);
                    }
                    pdfFile.seek(pdfFile.getFilePointer() - 1L); // un-read the look-ahead
                    if(isDelimiter((char)by)) {
                        return new PdfTokenObj("R", null);
                    }
                    pdfFile.seek(pdfFile.getFilePointer() - 1L); // back onto the 'R'
                    return trataWord();
                default:
                    pdfFile.seek(pdfFile.getFilePointer() - 1L);
                    return trataWord();
            }
        }
        /**
         * Reads bytes up to (not including) the next delimiter or end of file.
         *
         * @return the bytes read, as text; empty if a delimiter comes first
         */
        private String readWord() throws Exception {
            StringBuilder result = new StringBuilder();
            int by = pdfFile.read();
            // Stop at end of file too; otherwise -1 would be appended forever.
            while(by != -1 && !isDelimiter((char)by)) {
                result.append((char)by);
                by = pdfFile.read();
            }
            if(by != -1) {
                pdfFile.seek(pdfFile.getFilePointer() - 1L);
            }
            return result.toString();
        }
        /**
         * Reads a run of digits and {@code . + -} characters.
         *
         * @return the characters read, as text
         */
        private String readNumber() throws Exception {
            StringBuilder result = new StringBuilder();
            int by = pdfFile.read();
            while(by != -1 && (isDigit((char)by) || ".+-".indexOf((char)by) != -1)) {
                result.append((char)by);
                by = pdfFile.read();
            }
            // At end of file nothing was consumed, so there is nothing to
            // un-read; seeking back would make the last digit be read again.
            if(by != -1) {
                pdfFile.seek(pdfFile.getFilePointer() - 1L);
            }
            return result.toString();
        }
        /**
         * Skips the rest of a comment: everything up to and including the end
         * of the line (CR, LF or CRLF), or up to end of file.
         */
        private void skipComment() throws Exception {
            while(true) {
                int by = pdfFile.read();
                if(by == '\n' || by == -1) {
                    return;
                }
                if(by == '\r') {
                    // A CR ends the line on its own; also swallow the LF of a CRLF.
                    by = pdfFile.read();
                    if(by != '\n' && by != -1) {
                        pdfFile.seek(pdfFile.getFilePointer() - 1L);
                    }
                    return;
                }
            }
        }
        /**
         * Reads a brace-delimited code block as a single token, because its
         * content is of no interest here (it is really made of several tokens).
         * The opening brace has already been consumed; nested braces are kept
         * in the result.
         *
         * @return the text between the outer braces
         * @throws EOFException if the closing brace is missing
         */
        private String readCode() throws Exception {
            int depth = 0;
            StringBuilder result = new StringBuilder();
            while(true) {
                int by = pdfFile.read();
                if(by == -1) {
                    throw new EOFException("Fim de arquivo inesperado lendo código");
                }
                if(by == '{') {
                    ++depth;
                    result.append('{');
                } else if(by == '}') {
                    if(--depth < 0) {
                        return result.toString();
                    }
                    result.append('}');
                } else {
                    result.append((char)by);
                }
            }
        }
        /**
         * Reads a name token; a name is scanned exactly like a plain word.
         *
         * @return whatever {@code readWord()} returns at the current position
         * @throws Exception if {@code readWord()} fails
         */
        private String readName() throws Exception {
            final String name = readWord();
            return name;
        }
        /**
         * Reads the raw content of a hexadecimal string up to the closing
         * {@code >}, which is consumed but not returned. The hex digits are not
         * decoded.
         *
         * @return the characters found before the closing {@code >}
         * @throws EOFException if the file ends before the closing {@code >}
         * @throws Exception if the underlying file cannot be read
         */
        private String readHxString() throws Exception {
            StringBuilder result = new StringBuilder();
            int by = pdfFile.read();
            while(by != '>') {
                if(by == -1) {
                    // Unterminated string: fail instead of looping forever.
                    throw new EOFException("Fim de arquivo inesperado dentro de string hexadecimal");
                }
                result.append((char)by);
                by = pdfFile.read();
            }
            return result.toString();
        }
        /**
         * Reads a literal (parenthesised) string up to its closing parenthesis.
         * <p>
         * Balanced nested parentheses are part of the string and are kept in the
         * result. The escapes {@code \n \r \t \b \f \( \) \\} are decoded. Octal
         * escapes are not decoded yet: the backslash and the first digit are kept
         * verbatim. Any other escaped byte is dropped.
         * NOTE(review): presumably called after the opening parenthesis was
         * consumed by the caller - confirm against the tokenizer.
         *
         * @return the decoded string content, without the outer parentheses
         * @throws EOFException if the file ends before the closing parenthesis
         * @throws Exception if the underlying file cannot be read
         */
        private String readString() throws Exception {
            int parLevel = 0;
            boolean escaped = false;
            StringBuilder result = new StringBuilder();
            int by;
            while(true) {
                by = pdfFile.read();
                if(by == -1) {
                    // Unterminated string: fail instead of looping forever.
                    throw new EOFException("Fim de arquivo inesperado dentro de string literal");
                }
                if(escaped) {
                    switch((char)by) {
                        case 'n':  result.append('\n'); break;
                        case 'r':  result.append('\r'); break;
                        case 't':  result.append('\t'); break;
                        case 'b':  result.append('\b'); break;
                        case 'f':  result.append('\f'); break;
                        case '(':  result.append('(');  break;
                        case ')':  result.append(')');  break;
                        case '\\': result.append('\\'); break;
                        default:
                            if(isDigit((char)by)) {
                                // Octal escapes are not supported yet
                                result.append('\\').append((char)by);
                            }
                            break;
                    }
                    escaped = false;
                } else if(by == '\\') {
                    escaped = true;
                } else if(by == '(') {
                    ++parLevel;
                    result.append('(');
                } else if(by == ')') {
                    --parLevel;
                    if(parLevel < 0) {
                        return result.toString();
                    }
                    // A nested closing parenthesis is string content, just like
                    // the opening one appended above.
                    result.append(')');
                } else {
                    result.append((char)by);
                }
            }
        }
    }

    /**
     * Verifies that a token type is the one the parser requires at this point.
     *
     * @param tok1 the token type that was actually found
     * @param tok2 the token type that was required
     * @throws Exception if the two differ; the message names both
     */
    static void expect(String tok1, String tok2) throws Exception {
        boolean matches = tok1.equals(tok2);
        if(matches) {
            return;
        }
        String message = "Encontrou " + tok1 + " quando esperava " + tok2;
        throw new Exception(message);
    }

    /** One entry of the cross-reference table, as filled in by {@code PdfXref.addObj}. */
    static class PdfXrefItem {
        /** Entry type byte copied from the xref line; {@code 'n'} is the only type that can be loaded. */
        byte type;
        /** Object number this entry describes. */
        long id;
        /** Byte offset of the object in the file; only set for entries of type {@code 'n'}. */
        long bytePos;
        /** Generation number read from the xref line. */
        long genNr;
    }

    /**
     * Parses the integer that starts at {@code pos} inside {@code str}.
     * <p>
     * The number extends over every following digit and every {@code .},
     * {@code +} or {@code -} character, and ends at the first other character
     * or at the end of the string.
     *
     * @param str text containing the number
     * @param pos index of the first character of the number
     * @return the parsed value
     * @throws NumberFormatException if the selected characters are not a valid long
     * @throws StringIndexOutOfBoundsException if {@code pos} lies outside {@code str}
     */
    static long readNumber(String str, int pos) {
        int endPos = pos;
        // Bounded scan: a number that runs to the very end of the string is
        // valid and must not index past the last character.
        while(endPos < str.length()) {
            char ch = str.charAt(endPos);
            if(".+-".indexOf(ch) == -1 && !isDigit(ch)) {
                break;
            }
            ++endPos;
        }
        return Long.parseLong(str.substring(pos, endPos));
    }

    /**
     * Cross-reference table: maps an (object number, generation) pair to the
     * entry read from one or more {@code xref} sections.
     */
    static class PdfXref {
        private HashMap<PdfReferenceObj, PdfXrefItem> objs = new HashMap<PdfReferenceObj, PdfXrefItem>();

        /**
         * Parses one {@code xref} section at the reader's current position.
         * <p>
         * After the {@code xref} keyword comes a sequence of subsections, each
         * introduced by two numbers (first object id and entry count) and
         * followed by 20 bytes per entry. Parsing continues while the next
         * non-space byte is a digit.
         *
         * @param tokenReader reader positioned at the {@code xref} keyword
         * @throws Exception on unexpected tokens or I/O failure
         */
        public void parseXref(PdfTokenReader tokenReader) throws Exception {
            expect(tokenReader.nextToken().t, "xref");
            int lookahead;
            do {
                PdfTokenObj firstIdToken = tokenReader.nextToken();
                expect(firstIdToken.t, "NUMBER");
                long firstId = Long.parseLong(firstIdToken.value);

                PdfTokenObj countToken = tokenReader.nextToken();
                expect(countToken.t, "NUMBER");
                int entryCount = Integer.parseInt(countToken.value);

                skipSpaces(tokenReader.pdfFile);
                if(entryCount != 0) {
                    byte[] rawEntries = new byte[20 * entryCount];
                    tokenReader.pdfFile.readFully(rawEntries);
                    addBlock(firstId, entryCount, rawEntries);
                }

                // Peek at the next byte to see whether another subsection follows.
                skipSpaces(tokenReader.pdfFile);
                lookahead = tokenReader.pdfFile.read();
                tokenReader.pdfFile.seek(tokenReader.pdfFile.getFilePointer() - 1L);
            } while(isDigit((char)lookahead));
        }

        /**
         * Registers a whole subsection of consecutive entries.
         *
         * @param startId object number of the first entry
         * @param nrOfLines number of 20-byte entries in {@code bytesXref}
         * @param bytesXref raw bytes of the subsection
         * @throws Exception if an entry cannot be decoded
         */
        public void addBlock(long startId, int nrOfLines, byte[] bytesXref) throws Exception {
            byte[] entry = new byte[20];
            for(int line = 0, offset = 0; line < nrOfLines; ++line, offset += 20) {
                System.arraycopy(bytesXref, offset, entry, 0, 20);
                addObj(startId + line, entry);
            }
        }

        /**
         * Decodes one 20-byte entry and stores it unless an entry with the same
         * object number and generation is already present.
         *
         * @param id object number of the entry
         * @param bytesLine the 20 raw bytes of the entry
         * @throws Exception if the entry cannot be decoded
         */
        public void addObj(long id, byte[] bytesLine) throws Exception {
            String text = new String(bytesLine, "ISO-8859-1");
            byte kind = bytesLine[17];
            PdfXrefItem entry = new PdfXrefItem();
            entry.type = kind;
            entry.id = id;
            if(kind == 'n') {
                // Only entries of type 'n' get a byte offset.
                entry.bytePos = readNumber(text, 0);
            }
            entry.genNr = readNumber(text, 11);
            PdfReferenceObj key = new PdfReferenceObj(entry.id, entry.genNr);
            if(objs.containsKey(key)) {
                return;
            }
            objs.put(key, entry);
        }

        /**
         * Looks up the entry for a reference.
         *
         * @param reference object number and generation to look up
         * @return the stored entry, or {@code null} when none was registered
         */
        public PdfXrefItem getObj(PdfReferenceObj reference) {
            PdfXrefItem found = objs.get(reference);
            return found;
        }
    }

    /**
     * Parses the body of an indirect object and checks that it is closed by
     * {@code endobj}.
     *
     * @param tokenReader reader positioned at the start of the object body
     * @return the parsed body
     * @throws Exception if parsing fails or {@code endobj} is missing
     */
    private static PdfObject objhelper(PdfTokenReader tokenReader) throws Exception {
        final PdfObject body = parseObject(tokenReader);
        expect(tokenReader.nextToken().t, "endobj");
        return body;
    }

    /**
     * Parses the next object from the token stream.
     * <p>
     * Handles arrays, dictionaries (optionally followed by a stream), strings,
     * numbers, indirect references ({@code n g R}) and indirect object
     * definitions ({@code n g obj ... endobj}). Any other token is returned
     * unchanged.
     *
     * @param tokenReader source of tokens
     * @return the parsed object
     * @throws Exception on malformed input or I/O failure
     */
    static PdfObject parseObject(PdfTokenReader tokenReader) throws Exception {
        PdfTokenObj first = tokenReader.nextToken();
        String kind = first.t;

        if("[".equals(kind)) {
            return parseArray(tokenReader);
        }
        if("obj".equals(kind)) {
            return objhelper(tokenReader);
        }
        if("STRING".equals(kind) || "HX_STRING".equals(kind)) {
            return new PdfStrObj(first.value);
        }
        if("<<".equals(kind)) {
            PdfDictObj dictionary = parseDict(tokenReader);
            // A dictionary immediately followed by "stream" is a stream object.
            PdfTokenObj afterDict = tokenReader.nextToken();
            if(!"stream".equals(afterDict.t)) {
                tokenReader.unreadToken(afterDict);
                return dictionary;
            }
            return readStream(tokenReader, dictionary.dict);
        }
        if(!"NUMBER".equals(kind)) {
            return first;
        }

        // A number may stand alone or start "n g obj" / "n g R".
        PdfTokenObj second = tokenReader.nextToken();
        if(!"NUMBER".equals(second.t)) {
            tokenReader.unreadToken(second);
            return new PdfNumberObj(Double.parseDouble(first.value));
        }
        PdfTokenObj third = tokenReader.nextToken();
        if("obj".equals(third.t)) {
            return objhelper(tokenReader);
        }
        if("R".equals(third.t)) {
            return new PdfReferenceObj(Long.parseLong(first.value), Long.parseLong(second.value));
        }
        // Just two consecutive numbers: give back the look-ahead tokens.
        tokenReader.unreadToken(third);
        tokenReader.unreadToken(second);
        return new PdfNumberObj(Double.parseDouble(first.value));
    }

    /**
     * Parses dictionary entries up to and including the closing {@code >>}.
     * Each entry is a key token followed by a value object; the map is keyed
     * by the key token's value.
     * NOTE(review): presumably the caller has already consumed the opening
     * {@code <<} - confirm at the call sites.
     *
     * @param tokenReader source of tokens
     * @return the dictionary that was read
     * @throws Exception if {@code >>} appears where a value was required, or
     *         on any nested parse failure
     */
    static PdfDictObj parseDict(PdfTokenReader tokenReader) throws Exception {
        Map<String, PdfObject> entries = new HashMap<String, PdfObject>();
        for(PdfTokenObj key = tokenReader.nextToken(); !">>".equals(key.t); key = tokenReader.nextToken()) {
            PdfObject value = parseObject(tokenReader);
            boolean closesDict = value instanceof PdfTokenObj && ">>".equals(((PdfTokenObj) value).t);
            if(closesDict) {
                throw new Exception(">> inesperado em pos = " + tokenReader.pdfFile.getFilePointer());
            }
            entries.put(key.value, value);
        }
        return new PdfDictObj(entries);
    }

    /**
     * Parses array elements up to and including the closing {@code ]}.
     * NOTE(review): presumably the caller has already consumed the opening
     * {@code [} - confirm at the call sites.
     *
     * @param tokenReader source of tokens
     * @return the array that was read
     * @throws Exception on any nested parse failure
     */
    static PdfArrayObj parseArray(PdfTokenReader tokenReader) throws Exception {
        List<PdfObject> elements = new ArrayList<PdfObject>();
        PdfTokenObj lookahead = tokenReader.nextToken();
        while(!lookahead.t.equals("]")) {
            // Not the end yet: hand the token back and parse a full element.
            tokenReader.unreadToken(lookahead);
            elements.add(parseObject(tokenReader));
            lookahead = tokenReader.nextToken();
        }
        return new PdfArrayObj(elements);
    }

    /**
     * Skips over the data of a stream object and returns a placeholder for it.
     * <p>
     * The stream data is not actually read: the file position is advanced by
     * {@code /Length} bytes and the returned object carries a zero-filled
     * array of that size. {@code /Length} may be a direct number or an
     * indirect reference, which is resolved through the reader's xref table.
     *
     * @param tokenReader reader positioned right after the {@code stream} keyword
     * @param dict the stream dictionary
     * @return a stream object holding {@code dict} and the placeholder bytes
     * @throws Exception if {@code /Length} is missing, not a number or
     *         negative, if {@code endstream} does not follow the data, or on
     *         I/O failure
     */
    static PdfStreamObj readStream(PdfTokenReader tokenReader, Map<String, PdfObject> dict) throws Exception {
        // Skip the end-of-line after "stream": CR LF or LF.
        int by = tokenReader.pdfFile.read();
        if(by == '\r') {
            by = tokenReader.pdfFile.read();
            if(by != '\n' && by != -1) {
                // CR alone: the byte after it is already stream data.
                tokenReader.pdfFile.seek(tokenReader.pdfFile.getFilePointer() - 1L);
            }
        } else if(by != '\n' && by != -1) {
            // No end-of-line at all: give the byte back. Nothing to give
            // back at end of file, where read() consumed nothing.
            tokenReader.pdfFile.seek(tokenReader.pdfFile.getFilePointer() - 1L);
        }
        PdfObject lenObj = dict.get("Length");
        if(lenObj instanceof PdfReferenceObj) {
            // Resolving the reference moves the file pointer; restore it.
            long savedPos = tokenReader.pdfFile.getFilePointer();
            lenObj = findIndirectObj(tokenReader, tokenReader.xref, (PdfReferenceObj) lenObj);
            tokenReader.pdfFile.seek(savedPos);
        }
        if(!(lenObj instanceof PdfNumberObj)) {
            throw new Exception("Stream sem /Length numérico em pos = " + tokenReader.pdfFile.getFilePointer());
        }
        int len = (int) ((PdfNumberObj) lenObj).number;
        if(len < 0) {
            throw new Exception("Stream com /Length negativo em pos = " + tokenReader.pdfFile.getFilePointer());
        }
        byte[] bytes = new byte[len];
        // Pretend to read the stream: just jump over its data.
        tokenReader.pdfFile.seek(tokenReader.pdfFile.getFilePointer() + len);
        PdfTokenObj token = tokenReader.nextToken();
        expect(token.t, "endstream");
        return new PdfStreamObj(dict, bytes);
    }

    /**
     * Loads the object described by an xref entry.
     *
     * @param tokenReader reader over the PDF file; its file position is moved
     * @param xrefItem entry giving the object's byte offset
     * @return the object parsed at that offset
     * @throws Exception if the entry is not of type {@code 'n'}, or if
     *         parsing fails
     */
    static PdfObject readIndirectObj(PdfTokenReader tokenReader, PdfXrefItem xrefItem) throws Exception {
        if(xrefItem.type != 'n') {
            throw new Exception("Procurando objeto inválido " + xrefItem.id);
        }
        tokenReader.pdfFile.seek(xrefItem.bytePos);
        return parseObject(tokenReader);
    }

    /**
     * Resolves an indirect reference through the xref table and loads the
     * object it points to.
     *
     * @param tokenReader reader over the PDF file; its file position is moved
     * @param xref table used to locate the object
     * @param reference object number and generation to resolve
     * @return the referenced object
     * @throws Exception if {@code reference} is null, is not present in
     *         {@code xref}, or the object cannot be read
     */
    static PdfObject findIndirectObj(PdfTokenReader tokenReader, PdfXref xref, PdfReferenceObj reference) throws Exception {
        if(reference == null) {
            throw new Exception("Referência nula ao procurar objeto indireto");
        }
        PdfXrefItem xrefItem = xref.getObj(reference);
        if(xrefItem == null) {
            // getObj returns null for unknown references; report that clearly
            // instead of failing later with a NullPointerException.
            throw new Exception("Objeto " + reference.objNum + " " + reference.genNum + " não está na tabela xref");
        }
        return readIndirectObj(tokenReader, xrefItem);
    }

    /**
     * Parses a {@code trailer} keyword followed by its dictionary.
     *
     * @param tokenReader reader positioned at the {@code trailer} keyword
     * @return the trailer dictionary
     * @throws Exception if {@code trailer} or {@code <<} is missing, or the
     *         dictionary is malformed
     */
    static PdfObject parseTrailer(PdfTokenReader tokenReader) throws Exception {
        expect(tokenReader.nextToken().t, "trailer");
        expect(tokenReader.nextToken().t, "<<");
        return parseDict(tokenReader);
    }

    /**
     * Obtains the page count from the first object of the file: if that
     * dictionary has a {@code Linearized} key, its {@code N} entry is
     * returned.
     *
     * @param tokenReader reader used to parse the first object
     * @param pdfFile file that is rewound to offset 0 before parsing
     * @return the value of {@code N}, truncated to an int
     * @throws Exception if the first object has no {@code Linearized} key or
     *         no {@code N} entry, or if parsing fails
     */
    static int linearized(PdfTokenReader tokenReader, RandomAccessFile pdfFile) throws Exception {
        pdfFile.seek(0L);
        Map<String, PdfObject> header = ((PdfDictObj) parseObject(tokenReader)).dict;
        if(header.containsKey("Linearized")) {
            PdfNumberObj pageCount = (PdfNumberObj) header.get("N");
            if(pageCount != null) {
                return (int) pageCount.number;
            }
        }
        throw new Exception("Não achou o número de páginas (/Linearized)");
    }

    /**
     * Returns the number of pages of a PDF file.
     * <p>
     * Steps: check the {@code %PDF-} signature; find {@code startxref} in
     * the last 400 bytes; read every xref section and trailer, following
     * {@code /Prev}; load the catalog from the first trailer with a
     * {@code /Root}; read {@code /Count} from the catalog's {@code /Pages}
     * object. When the {@code startxref} offset is 0 or does not point at an
     * {@code xref} keyword, the count is taken from the linearization
     * dictionary instead.
     *
     * @param fileName path of the PDF file
     * @return the page count
     * @throws Exception if the file is not a PDF, is malformed, or cannot be
     *         read; the file position of the failure is printed to stderr
     */
    static int getNumberOfPages(String fileName) throws Exception {
        final int bufferSize = 5;
        RandomAccessFile pdfFile = new RandomAccessFile(fileName, "r");
        try {
            byte[] comecoPdf = new byte[bufferSize];
            pdfFile.read(comecoPdf);
            byte[] pdfSig = {'%', 'P', 'D', 'F', '-'};
            for(int i = 0; i < comecoPdf.length && i < pdfSig.length; ++i){
                if(pdfSig[i] != comecoPdf[i])
                    throw new Exception("Não é um arquivo PDF");
            }
            long endBufferSize = 400L;
            if(pdfFile.length() < endBufferSize)
                endBufferSize = pdfFile.length();
            pdfFile.seek(pdfFile.length() - endBufferSize);
            byte[] startxref = new byte[(int)endBufferSize];
            pdfFile.readFully(startxref);
            String strStartxref = new String(startxref, "ISO-8859-1");
            Pattern pat = Pattern.compile("^.*startxref\\s+(\\d+)\\s+%%EOF.*$", Pattern.DOTALL);
            Matcher regExMatch = pat.matcher(strStartxref);
            if(!regExMatch.matches()) {
                throw new Exception("Nao achou o número de páginas (startxref)");
            }
            PdfTokenReader tokenReader = new PdfTokenReader(pdfFile);
            long pos = Long.parseLong(regExMatch.group(1));
            if(pos == 0L) {
                return linearized(tokenReader, pdfFile);
            }
            pdfFile.seek(pos);
            PdfTokenObj token = tokenReader.nextToken();
            if(token.t.equals("xref")) {
                tokenReader.unreadToken(token);
                ArrayList<Map<String, PdfObject>> trailers = new ArrayList<Map<String, PdfObject>>();
                PdfXref xref = new PdfXref();
                tokenReader.xref = xref;
                // Offsets of the xref sections already parsed; a corrupt file
                // whose /Prev chain loops back must not hang the program.
                Set<Long> visitedXrefs = new HashSet<Long>();
                visitedXrefs.add(pos);
                while(true) {
                    xref.parseXref(tokenReader);
                    Map<String, PdfObject> trailer = ((PdfDictObj) parseTrailer(tokenReader)).dict;
                    trailers.add(trailer);
                    PdfObject prev = trailer.get("Prev");
                    if(prev == null) {
                        break;
                    }
                    long prevPos = (long) ((PdfNumberObj) prev).number;
                    if(!visitedXrefs.add(prevPos)) {
                        throw new Exception("Cadeia de /Prev circular em pos = " + prevPos);
                    }
                    pdfFile.seek(prevPos);
                }
                PdfObject catalog = null;
                for(Map<String, PdfObject> tr : trailers) {
                    PdfObject root = tr.get("Root");
                    if(root != null) {
                        PdfReferenceObj refRoot = (PdfReferenceObj) root;
                        catalog = findIndirectObj(tokenReader, xref, refRoot);
                        break;
                    }
                }
                if(catalog == null) {
                    throw new Exception("Não achou o número de páginas (catalog)");
                }
                PdfDictObj catalogDict = (PdfDictObj) catalog;
                PdfObject pagesObj = catalogDict.dict.get("Pages");
                if(!(pagesObj instanceof PdfReferenceObj)) {
                    throw new Exception("Não achou o número de páginas (Pages)");
                }
                PdfDictObj pages = (PdfDictObj) findIndirectObj(tokenReader, xref, (PdfReferenceObj) pagesObj);
                PdfObject count = pages.dict.get("Count");
                if(count instanceof PdfReferenceObj) {
                    // /Count may itself be stored as an indirect object.
                    count = findIndirectObj(tokenReader, xref, (PdfReferenceObj) count);
                }
                if(!(count instanceof PdfNumberObj)) {
                    throw new Exception("Não achou o número de páginas (Count)");
                }
                return (int) ((PdfNumberObj) count).number;
            } else {
                return linearized(tokenReader, pdfFile);
            }
        } catch(Exception e) {
            err.printf("Ocorreu erro na posição %d\n", pdfFile.getFilePointer());
            throw e;
        } finally {
            pdfFile.close();
        }
    }

    /**
     * Command-line entry point: prints the page count of every PDF file given
     * as an argument. A failure on one file is reported and processing
     * continues with the next; a summary is printed when more than one file
     * was given.
     *
     * @param args paths of the PDF files to inspect
     * @throws Exception declared for compatibility; per-file errors are
     *         caught and reported
     */
    public static void main(String[] args) throws Exception {
        if(args.length < 1) {
            out.println("Uso: PdfPageCount <nome-arquivo.pdf>");
            return;
        }
        int okCount = 0;
        int failCount = 0;
        for(int i = 0; i < args.length; ++i) {
            String fileName = args[i];
            try {
                int pageCount = getNumberOfPages(fileName);
                out.printf("Número de páginas de %s: %d\n", fileName, pageCount);
                ++okCount;
            } catch(Exception e) {
                out.println(e);
                out.printf("Ocorreu um erro ao processar %s. Continuando...\n", fileName);
                ++failCount;
            }
        }
        if(args.length > 1) {
            out.printf("Fim do processo. %d sucesso(s), %d falha(s).", okCount, failCount);
        }
    }
}
Anúncios

2 pensamentos sobre “Java: PdfPageCount.java”

  1. Comparação de desempenho lendo PDFs « Visions of hope

  2. Comparação de desempenho (2) « Visions of hope

Deixe um comentário

Preencha os seus dados abaixo ou clique em um ícone para log in:

Logotipo do WordPress.com

Você está comentando utilizando sua conta WordPress.com. Sair / Alterar )

Imagem do Twitter

Você está comentando utilizando sua conta Twitter. Sair / Alterar )

Foto do Facebook

Você está comentando utilizando sua conta Facebook. Sair / Alterar )

Foto do Google+

Você está comentando utilizando sua conta Google+. Sair / Alterar )

Conectando a %s