Python 3: PdfPageCount.py

A leitura aqui na página fica um pouco ruim por causa do tamanho das letras e do comprimento das linhas. Sugiro copiar e colar o código em um editor com realce de sintaxe. Obs.: eu testei com vários arquivos PDF, mas é certo que ainda há erros à espreita. Considere-se avisado.

Versão usada nos testes de 2010-10-27:

#!/usr/bin/env python

# Este programa mostra na tela o número de páginas de um
# arquivo PDF passado na linha de comando.
# Este código fonte é de domínio público (PUBLIC DOMAIN)
# Desenvolvido em Python 3.1
# Marcus.

import sys
import re
import os
import string

def isDelimiter(by):
    """Tell whether *by* terminates a PDF word.

    *by* is a bytes object (normally a single byte). It is a delimiter when
    it is whitespace or when it occurs inside ``()<>[]{}/%``. An empty bytes
    object is reported as a delimiter too, because ``b'' in b'...'`` is true.
    """
    if by.isspace():
        return True
    return by in b'()<>[]{}/%'

def skipSpaces(pdf_file):
    """Advance *pdf_file* past any run of whitespace bytes.

    On return the file is positioned at the first non-whitespace byte, or at
    end of file when only whitespace remained.
    """
    while True:
        by = pdf_file.read(1)
        if not by:
            # End of file: nothing was consumed, so there is nothing to give
            # back. Seeking -1 here would re-expose the last whitespace byte
            # and make callers that loop on whitespace spin forever.
            return
        if not by.isspace():
            pdf_file.seek(-1, os.SEEK_CUR)
            return

class Token:
    """A lexical token: ``t`` holds its type tag, ``val`` its payload (or None)."""

    def __init__(self, type_, value):
        self.t, self.val = type_, value

    def __str__(self):
        # Both fields go through str(), so bytes show up in their repr form.
        inner = ', '.join(str(part) for part in (self.t, self.val))
        return 'Token(' + inner + ')'

class TokenReader:
    """Tokenizer over a seekable binary file object containing a PDF.

    nextToken() produces one Token at a time; unreadToken() pushes a token
    back so the parser can look ahead. The ``xref`` attribute starts as None
    and is assigned by the caller once a cross-reference table is available.

    Every reading method raises Exception on a premature end of file instead
    of looping forever on the empty reads that follow EOF.
    """

    # Single-letter escapes recognised after a backslash in a literal string.
    _ESCAPES = {b'n': b'\n', b'r': b'\r', b't': b'\t', b'b': b'\b', b'f': b'\f'}

    def __init__(self, pdf_file):
        self.pdf_file = pdf_file
        self.tok_buf = []
        self.xref = None

    def _unread(self, by):
        """Step back over *by*, unless it is empty (nothing was read at EOF)."""
        if by:
            self.pdf_file.seek(-1, os.SEEK_CUR)

    def _eofError(self, what):
        """Build the exception raised when the file ends inside *what*."""
        return Exception('Error at byte ' + str(self.pdf_file.tell()) +
                         ': unexpected end of file in ' + what)

    def unreadToken(self, token):
        """Push *token* back; the most recently pushed token is returned first."""
        self.tok_buf.append(token)

    def nextToken(self):
        """Return the next Token, skipping whitespace and comments.

        Raises Exception at end of file and for words that are not known
        keywords.
        """
        if self.tok_buf:
            return self.tok_buf.pop()

        def trataWord():
            word = self.readWord()
            if word in (b'obj', b'endobj', b'stream', b'endstream', b'true', b'false', b'null', b'xref', b'trailer'):
                return Token(word, None)
            raise Exception('Error at byte ' + str(self.pdf_file.tell()) + ': ' + str(word))

        while True:
            by = self.pdf_file.read(1)[:1]
            if by == b'%':
                self.skipComment()
            elif not by.isspace():
                # Whitespace is skipped simply by reading the next byte.
                break

        if not by:
            # b'' is "in" every bytes object, so without this check EOF would
            # be mistaken for the start of a number below.
            raise self._eofError('token stream')

        if by.isdigit() or by in b'-+.':
            self.pdf_file.seek(-1, os.SEEK_CUR)
            return Token(b'NUMBER', self.readNumber())
        elif by == b'<':
            by = self.pdf_file.read(1)[:1]
            if by == b'<':
                return Token(b'<<', None)
            self._unread(by)
            return Token(b'HX_STRING', self.readHxString())
        elif by == b'>':
            by = self.pdf_file.read(1)[:1]
            if by == b'>':
                return Token(b'>>', None)
            self._unread(by)
            # Should not happen, because readHxString consumes the '>'.
            return Token(b'>', None)
        elif by == b'(':
            return Token(b'STRING', self.readString())
        elif by == b'/':
            return Token(b'NAME', self.readName())
        elif by == b'[':
            return Token(b'[', None)
        elif by == b']':
            return Token(b']', None)
        elif by == b'{':
            return Token(b'CODE', self.readCode())
        elif by == b'R':
            # peek tends to return more than was asked for, hence the slice.
            by = self.pdf_file.peek(1)[:1]
            if isDelimiter(by):
                return Token(b'R', None)
            else:
                self.pdf_file.seek(-1, os.SEEK_CUR)
                return trataWord()
        else:
            self.pdf_file.seek(-1, os.SEEK_CUR)
            return trataWord()

    def readWord(self):
        """Read bytes up to (not including) the next delimiter or EOF."""
        result = []
        by = self.pdf_file.read(1)[:1]
        while not isDelimiter(by):
            result.append(by)
            by = self.pdf_file.read(1)[:1]
        self._unread(by)
        return b''.join(result)

    def readNumber(self):
        """Read a run of digits, '.', '+' and '-' and return it as a float."""
        digits = []
        by = self.pdf_file.read(1)[:1]
        # The explicit truth test stops at EOF: b'' is "in" b'.+-'.
        while by and (by.isdigit() or by in b'.+-'):
            digits.append(by)
            by = self.pdf_file.read(1)[:1]
        self._unread(by)
        return float(b''.join(digits))

    def skipComment(self):
        """Consume the rest of the line (through CR, LF or CRLF) or up to EOF."""
        while True:
            by = self.pdf_file.read(1)[:1]
            if not by:
                return
            if by == b'\r':
                by = self.pdf_file.read(1)[:1]
                if by != b'\n':
                    self._unread(by)
                return
            elif by == b'\n':
                return

    def readCode(self):
        """Read a ``{ ... }`` block whose opening brace was already consumed.

        The code is treated as one single token because its content is of no
        interest here, although it is of course made of several tokens.
        Returns the raw bytes between the braces.
        """
        par_level = 0
        result = []
        while True:
            by = self.pdf_file.read(1)[:1]
            if not by:
                raise self._eofError('code block')
            if by == b'{':
                par_level += 1
                result.append(by)
            elif by == b'}':
                par_level -= 1
                if par_level < 0:
                    return b''.join(result)
                else:
                    result.append(by)
            else:
                result.append(by)

    def readName(self):
        """Read a name whose leading '/' was already consumed."""
        return self.readWord()

    def readHxString(self):
        """Read a hex string up to and including its closing '>'.

        Returns the raw bytes before the '>'; they are not decoded.
        """
        result = []
        by = self.pdf_file.read(1)[:1]
        while by != b'>':
            if not by:
                raise self._eofError('hex string')
            result.append(by)
            by = self.pdf_file.read(1)[:1]
        return b''.join(result)

    def readString(self):
        """Read a literal string whose opening '(' was already consumed.

        Balanced nested parentheses are kept in the result. Backslash escapes
        are decoded, except octal escapes, which are kept verbatim (backslash
        plus digit) because they are not supported yet.
        """
        par_level = 0
        escaped = False
        result = []
        while True:
            by = self.pdf_file.read(1)[:1]
            if not by:
                raise self._eofError('string')
            if escaped:
                if by in self._ESCAPES:
                    result.append(self._ESCAPES[by])
                elif by.isdigit():
                    # Not supported yet
                    result.append(b'\\' + by)
                elif by not in (b'\n', b'\r'):
                    # Covers \\, \( and \), and drops the backslash in front
                    # of any other byte. A backslash before an end-of-line is
                    # a line continuation and contributes nothing.
                    result.append(by)
                escaped = False
            elif by == b'\\':
                escaped = True
            elif by == b'(':
                par_level += 1
                result.append(b'(')
            elif by == b')':
                par_level -= 1
                if par_level < 0:
                    return b''.join(result)
                # Closing paren of a nested group is part of the string.
                result.append(b')')
            else:
                result.append(by)

class XrefItem:
    """One cross-reference entry; every field starts as None."""

    def __init__(self):
        self.type = self.id = self.byte_pos = self.gen_nr = None

    def __str__(self):
        fields = (
            ('type', self.type),
            ('id', self.id),
            ('byte_pos', self.byte_pos),
            ('gen_nr', self.gen_nr),
        )
        body = ', '.join(label + ' = ' + str(value) for label, value in fields)
        return 'XrefItem(' + body + ')'

class Xref:
    """Cross-reference table assembled from one or more ``xref`` sections."""

    def __init__(self):
        # Indexed by the tuple (id, generation_number).
        self.objs = {}

    def parseXref(self, token_reader):
        """Parse an ``xref`` section, leaving the file ready for the trailer."""
        expect(token_reader.nextToken().t, b'xref')
        more_subsections = True
        while more_subsections:
            first_tok = token_reader.nextToken()
            expect(first_tok.t, b'NUMBER')
            count_tok = token_reader.nextToken()
            expect(count_tok.t, b'NUMBER')
            skipSpaces(token_reader.pdf_file)
            if count_tok.val != 0:
                # Entries are read as fixed 20-byte records.
                total = int(count_tok.val)
                raw = token_reader.pdf_file.read(20 * total)
                self.addBlock(int(first_tok.val), total, raw)
            # A digit means another subsection header follows; anything else
            # is left untouched for the trailer parser.
            more_subsections = token_reader.pdf_file.peek(1)[:1].isdigit()

    def addBlock(self, start_id, nr_of_lines, str_block):
        """Register *nr_of_lines* consecutive 20-byte entries from *str_block*."""
        for offset in range(nr_of_lines):
            begin = 20 * offset
            self.addObj(offset + start_id, str_block[begin:begin + 20])

    def addObj(self, id, str_line):
        """Register one entry line under (id, generation) unless already known."""
        entry = XrefItem()
        entry.type = str_line[17:18]
        entry.id = id
        if entry.type == b'n':
            entry.byte_pos = int(str_line[:10])
        entry.gen_nr = int(str_line[11:16])
        # According to the author's tests an existing entry must not be
        # overwritten: the first one inserted is the one that stays, even if
        # following the trailer << /Prev nnnn >> chain finds redefinitions.
        self.objs.setdefault((entry.id, entry.gen_nr), entry)

    def getObj(self, reference):
        """Return the entry for *reference* (uses its obj_num and gen_num)."""
        key = (reference.obj_num, reference.gen_num)
        return self.objs[key]

    def __str__(self):
        return dictToStr(self.objs)

def dictToStr(d):
    """Render *d* as ``{key: value,\\nkey: value}`` with keys in sorted order."""
    rendered = [str(key) + ': ' + str(d[key]) for key in sorted(d)]
    return '{' + ',\n'.join(rendered) + '}'

def parseTrailer(token_reader):
    """Consume ``trailer <<`` and return the trailer dictionary that follows."""
    for wanted in (b'trailer', b'<<'):
        expect(token_reader.nextToken().t, wanted)
    return parseDict(token_reader)

def expect(tok1, tok2):
    """Raise Exception when the token type found (*tok1*) is not *tok2*."""
    if tok1 != tok2:
        found, wanted = str(tok1), str(tok2)
        raise Exception(''.join(['Encontrou ', found, ' quando esperava ', wanted]))

def readIndirectObj(token_reader, xref_item):
    """Parse the object that *xref_item* points to.

    Returns None for entries whose type is not ``b'n'``; otherwise seeks to
    the entry's byte position and returns whatever parseObject produces.
    """
    if xref_item.type != b'n':
        return None
    token_reader.pdf_file.seek(xref_item.byte_pos, os.SEEK_SET)
    return parseObject(token_reader)

# May return a Token (for objects made of a single token), a dict, a list or
# a Reference; a stream comes back as its dictionary. A bit confusing --
# improve if possible.
def parseObject(token_reader):
    """Parse one PDF object starting at the reader's current position."""

    def obj():
        # Body of an indirect object: the object itself, then 'endobj'.
        inner = parseObject(token_reader)
        expect(token_reader.nextToken().t, b'endobj')
        return inner

    first = token_reader.nextToken()
    kind = first.t

    if kind == b'[':
        return parseArray(token_reader)

    if kind == b'obj':
        return obj()

    if kind == b'<<':
        dictionary = parseDict(token_reader)
        lookahead = token_reader.nextToken()
        if lookahead.t == b'stream':
            return readStream(token_reader, dictionary)
        token_reader.unreadToken(lookahead)
        return dictionary

    if kind != b'NUMBER':
        return first

    # A number may stand alone, open an indirect object ("n g obj") or be an
    # indirect reference ("n g R").
    second = token_reader.nextToken()
    if second.t != b'NUMBER':
        token_reader.unreadToken(second)
        return first

    third = token_reader.nextToken()
    if third.t == b'obj':
        return obj()
    if third.t == b'R':
        return Reference(first.val, second.val)

    # Plain number: push the look-ahead back, last one read first.
    token_reader.unreadToken(third)
    token_reader.unreadToken(second)
    return first

def parseArray(token_reader):
    """Parse array elements up to the closing ``]`` and return them as a list.

    The opening ``[`` must already have been consumed.
    """
    items = []
    tok = token_reader.nextToken()
    while tok.t != b']':
        # Not the end yet: hand the token back so parseObject sees it.
        token_reader.unreadToken(tok)
        items.append(parseObject(token_reader))
        tok = token_reader.nextToken()
    return items

def readStream(token_reader, dictionary):
    """Skip over a stream body and return the stream's *dictionary*.

    Called right after the ``stream`` keyword. The stream data itself is not
    read: the file position is advanced by the dictionary's /Length (resolved
    through the xref table when it is an indirect reference) and the
    ``endstream`` keyword is then required.

    Raises Exception (via expect) when ``endstream`` does not follow.
    """
    pdf_file = token_reader.pdf_file

    # Consume the end-of-line after 'stream': CRLF or LF. Any other byte
    # belongs to the stream data and is given back.
    by = pdf_file.read(1)[:1]
    if by == b'\r':
        by = pdf_file.read(1)[:1]
        if by and by != b'\n':
            pdf_file.seek(-1, os.SEEK_CUR)
    elif by and by != b'\n':
        pdf_file.seek(-1, os.SEEK_CUR)

    len_obj = dictionary[b'Length']
    if isinstance(len_obj, Reference):
        # Resolving the reference moves the file position; restore it.
        saved_pos = pdf_file.tell()
        len_obj = findIndirectObject(token_reader, len_obj)
        pdf_file.seek(saved_pos, os.SEEK_SET)
    pdf_file.seek(int(len_obj.val), os.SEEK_CUR)
    token = token_reader.nextToken()
    expect(token.t, b'endstream')
    return dictionary  # We only pretended to read the stream

def parseDict(token_reader):
    """Parse dictionary entries up to the closing ``>>`` and return a dict.

    The opening ``<<`` must already have been consumed. Keys are the ``val``
    of each key token; values are whatever parseObject returns.
    """
    entries = {}
    key_tok = token_reader.nextToken()
    while key_tok.t != b'>>':
        value = parseObject(token_reader)
        entries[key_tok.val] = value
        # A '>>' where a value was expected means a key has no value.
        if isinstance(value, Token) and value.t == b'>>':
            raise Exception('>> inesperado em pos = ' + str(token_reader.pdf_file.tell()))
        key_tok = token_reader.nextToken()
    return entries

class Reference:
    """Indirect reference: an object number plus a generation number."""

    def __init__(self, obj_num, gen_num):
        self.obj_num, self.gen_num = obj_num, gen_num

    def __str__(self):
        numbers = ', '.join(str(n) for n in (self.obj_num, self.gen_num))
        return 'Reference(' + numbers + ')'

def findIndirectObject(token_reader, ref):
    """Look *ref* up in the reader's xref table and parse the object found."""
    return readIndirectObj(token_reader, token_reader.xref.getObj(ref))

def getNumberOfPages(file_name):
    """Return the number of pages of the PDF file *file_name*.

    The count comes from /Count of the page tree root, reached through the
    xref table and the trailer's /Root. When ``startxref`` is 0 or does not
    point at an ``xref`` keyword, the /N entry of a /Linearized first object
    is used instead.

    Raises Exception when the file does not start with ``%PDF-``, when no
    ``startxref ... %%EOF`` is found in the last 400 bytes, or when the page
    count cannot be located.
    """
    with open(file_name, 'rb') as pdf_file:
        comeco_pdf = pdf_file.read(5)
        if not comeco_pdf.startswith(b'%PDF-'):
            raise Exception('Não é um arquivo PDF')

        end_buffer_size = min(400, os.stat(file_name).st_size)

        pdf_file.seek(-end_buffer_size, os.SEEK_END)
        startxref = str(pdf_file.read(end_buffer_size), 'iso-8859-1')
        match_startxref = re.search(r'startxref\s+(\d+)\s+%%EOF\s*', startxref)
        if match_startxref is None:
            raise Exception('Não achou o número de páginas (startxref)')

        token_reader = TokenReader(pdf_file)

        def linearized():
            pdf_file.seek(0, os.SEEK_SET)
            first_obj = parseObject(token_reader)
            # parseObject may return a Token or a list; only a dict can carry
            # /Linearized.
            if isinstance(first_obj, dict) and b'Linearized' in first_obj:
                return int(first_obj[b'N'].val)
            raise Exception('Não achou o número de páginas (/Linearized)')

        pos = int(match_startxref.group(1))
        if pos == 0:
            return linearized()

        pdf_file.seek(pos, os.SEEK_SET)
        token = token_reader.nextToken()
        if token.t != b'xref':
            return linearized()
        token_reader.unreadToken(token)

        trailers = []
        xref = Xref()
        token_reader.xref = xref
        visited = {pos}  # xref offsets already parsed, to survive /Prev cycles
        while True:
            xref.parseXref(token_reader)  # leaves the file ready for the trailer
            trailer = parseTrailer(token_reader)
            trailers.append(trailer)
            if b'Prev' not in trailer:
                break
            prev_pos = int(trailer[b'Prev'].val)
            if prev_pos in visited:
                # Circular /Prev chain: every section is already loaded.
                break
            visited.add(prev_pos)
            pdf_file.seek(prev_pos, os.SEEK_SET)

        # The first trailer that has /Root provides the catalog.
        catalog = None
        for tr in trailers:
            if b'Root' in tr:
                catalog = findIndirectObject(token_reader, tr[b'Root'])
                break
        if catalog is None:
            raise Exception('Não achou o número de páginas (/Root)')

        pages = findIndirectObject(token_reader, catalog[b'Pages'])
        if pages is None:
            raise Exception('Não achou o número de páginas (/Pages)')
        return int(pages[b'Count'].val)

if __name__ == '__main__':
    # Command-line entry point: print the page count of every file given,
    # carrying on after a failure, and print a summary when more than one
    # file was requested.
    file_names = sys.argv[1:]
    if not file_names:
        print('Uso:', sys.argv[0], '<nome-arquivo.pdf>')
        sys.exit(1)
    sucessos = 0
    falhas = 0
    for file_name in file_names:
        try:
            page_count = getNumberOfPages(file_name)
            print('Número de páginas de', file_name + ':', page_count)
        except Exception as e:
            print(str(e))
            print('Ocorreu um erro ao processar {0}. Continuando...'.format(file_name))
            falhas += 1
        else:
            sucessos += 1
    if len(file_names) > 1:
        print('Fim do processo. {0} sucesso(s), {1} falha(s).'.format(sucessos, falhas))

Versão usada nos testes de 2010-09-19:

#!/usr/bin/env python

# Este programa mostra na tela o número de páginas de um
# arquivo PDF passado na linha de comando.
# Este código fonte é de domínio público (PUBLIC DOMAIN)
# Desenvolvido em Python 3.1
# Marcus.


import sys
import re
import os
import string

def isDelimiter(char):
    """Tell whether the one-character string *char* terminates a PDF word.

    Whitespace and each of ``( ) < > [ ] { } / %`` are delimiters.
    """
    if char.isspace():
        return True
    return char in tuple('()<>[]{}/%')

def skipSpaces(pdf_file):
    """Advance *pdf_file* past any run of whitespace.

    *pdf_file* must support ``peek``. On return it is positioned at the first
    non-whitespace byte, or at end of file when only whitespace remained.
    """
    while True:
        by = pdf_file.peek(1)[:1]
        # peek returns b'' at end of file; indexing it would raise IndexError.
        if not by or not chr(by[0]).isspace():
            return
        pdf_file.seek(1, os.SEEK_CUR)

class Token:
    """A lexical token: ``t`` is its type (a str), ``val`` its payload or None."""

    def __init__(self, type_, value):
        self.t, self.val = type_, value

    def __str__(self):
        # The type is joined as-is, so it has to be a str.
        return ''.join(['Token(', self.t, ', ', str(self.val), ')'])

class TokenReader:
    """Tokenizer over a binary file object that supports ``peek`` and ``seek``.

    Bytes are inspected with peek and consumed with a one-byte relative seek.
    nextToken() produces one Token at a time (types and values are str);
    unreadToken() pushes a token back so the parser can look ahead. The
    ``xref`` attribute starts as None and is assigned by the caller.
    """

    # Single-letter escapes recognised after a backslash in a literal string.
    _ESCAPES = {'n': '\n', 'r': '\r', 't': '\t', 'b': '\b', 'f': '\f'}

    def __init__(self, pdf_file):
        self.pdf_file = pdf_file
        self.tok_buf = []
        self.xref = None

    def unreadToken(self, token):
        """Push *token* back; the most recently pushed token is returned first."""
        self.tok_buf.append(token)

    def nextToken(self):
        """Return the next Token, skipping whitespace and comments.

        Raises Exception for words that are not known keywords.
        """
        if bool(self.tok_buf):
            return self.tok_buf.pop()

        while True:
            by = self.pdf_file.peek(1)
            ch = chr(by[0])
            if ch == '%':
                self.skipComment()
            elif ch.isspace():
                skipSpaces(self.pdf_file)
            else:
                break

        if ch.isdigit() or ch in '-+.':
            return Token('NUMBER', self.readNumber())
        elif ch == '<':
            self.pdf_file.seek(1, os.SEEK_CUR) # discard '<'
            by = self.pdf_file.peek(1)
            if by[0] == ord('<'):
                self.pdf_file.seek(1, os.SEEK_CUR) # discard '<'
                return Token('<<', None)
            return Token('HX_STRING', self.readHxString())
        elif ch == '>':
            self.pdf_file.seek(1, os.SEEK_CUR) # discard '>'
            by = self.pdf_file.peek(1)
            if by[0] == ord('>'):
                self.pdf_file.seek(1, os.SEEK_CUR) # discard '>'
                return Token('>>', None)
            # Should not happen, because readHxString consumes the '>'.
            return Token('>', None)
        elif ch == '(':
            self.pdf_file.seek(1, os.SEEK_CUR)
            return Token('STRING', self.readString())
        elif ch == '/':
            self.pdf_file.seek(1, os.SEEK_CUR)
            return Token('NAME', self.readName())
        elif ch == '[':
            self.pdf_file.seek(1, os.SEEK_CUR)
            return Token('[', None)
        elif ch == ']':
            self.pdf_file.seek(1, os.SEEK_CUR)
            return Token(']', None)
        elif ch == '{':
            self.pdf_file.seek(1, os.SEEK_CUR)
            return Token('CODE', self.readCode())
        elif ch == 'R':
            # TODO: still missing a check that the next char is a delimiter.
            self.pdf_file.seek(1, os.SEEK_CUR)
            return Token('R', None)
        else:
            word = self.readWord()
            for t in ('obj', 'endobj', 'stream', 'endstream', 'true', 'false', 'null', 'xref', 'trailer'):
                if word == t:
                    return Token(t, None)
            raise Exception('Error at byte ' + str(self.pdf_file.tell()) + ': ' + word)

    def readWord(self):
        """Read characters up to (not including) the next delimiter."""
        result = []
        char = chr(self.pdf_file.peek(1)[0])
        while not isDelimiter(char):
            result.append(char)
            self.pdf_file.seek(1, os.SEEK_CUR)
            char = chr(self.pdf_file.peek(1)[0])
        return ''.join(result)

    def readNumber(self):
        """Read a run of digits, '.', '+' and '-'.

        Returns a float when the run contains a '.', otherwise an int.
        """
        digits = []
        is_float = False
        char = chr(self.pdf_file.peek(1)[0])
        while char in string.digits + '.+-':
            if char == '.':
                is_float = True
            digits.append(char)
            self.pdf_file.seek(1, os.SEEK_CUR)
            char = chr(self.pdf_file.peek(1)[0])
        if is_float:
            return float(''.join(digits))
        else:
            return int(''.join(digits))

    def skipComment(self):
        """Consume the rest of the current line, through CR, LF or CRLF."""
        while True:
            char = chr(self.pdf_file.peek(1)[0])
            if char == '\r':
                self.pdf_file.seek(1, os.SEEK_CUR)
                char = chr(self.pdf_file.peek(1)[0])
                if char == '\n':
                    self.pdf_file.seek(1, os.SEEK_CUR)
                return
            elif char == '\n':
                self.pdf_file.seek(1, os.SEEK_CUR)
                return
            else:
                self.pdf_file.seek(1, os.SEEK_CUR)

    def readCode(self):
        """Read a ``{ ... }`` block whose opening brace was already consumed.

        Returns the text before the matching '}', which is left unconsumed.
        """
        par_level = 0
        result = []
        while True:
            char = chr(self.pdf_file.peek(1)[0])
            if char == '{':
                par_level += 1
                result.append(char)
            elif char == '}':
                par_level -= 1
                if par_level < 0:
                    return ''.join(result)
                else:
                    result.append(char)
            else:
                result.append(char)
            self.pdf_file.seek(1, os.SEEK_CUR)

    def readName(self):
        """Read a name whose leading '/' was already consumed."""
        return self.readWord()

    def readHxString(self):
        """Read a hex string up to and including its closing '>'.

        Returns the raw text before the '>'; it is not decoded.
        """
        result = []
        char = chr(self.pdf_file.peek(1)[0])
        while char != '>':
            result.append(char)
            self.pdf_file.seek(1, os.SEEK_CUR)
            char = chr(self.pdf_file.peek(1)[0])
        self.pdf_file.seek(1, os.SEEK_CUR)
        return ''.join(result)

    def readString(self):
        """Read a literal string whose opening '(' was already consumed.

        Balanced nested parentheses are kept in the result. Backslash escapes
        are decoded, except octal escapes, which are kept verbatim (backslash
        plus digit) because they are not supported yet.
        """
        par_level = 0
        escaped = False
        result = []
        while True:
            char = chr(self.pdf_file.peek(1)[0])
            if escaped:
                if char in self._ESCAPES:
                    result.append(self._ESCAPES[char])
                elif char.isdigit():
                    # Not supported yet
                    result.append('\\' + char)
                elif char not in '\r\n':
                    # Covers \\, \( and \), and drops the backslash in front
                    # of any other character. A backslash before an
                    # end-of-line is a line continuation and adds nothing.
                    result.append(char)
                escaped = False
            elif char == '\\':
                escaped = True
            elif char == '(':
                par_level += 1
                result.append('(')
            elif char == ')':
                par_level -= 1
                if par_level < 0:
                    self.pdf_file.seek(1, os.SEEK_CUR)
                    return ''.join(result)
                # Closing paren of a nested group is part of the string.
                result.append(')')
            else:
                result.append(char)
            self.pdf_file.seek(1, os.SEEK_CUR)

class XrefItem:
    """One cross-reference entry; every field starts as None."""

    def __init__(self):
        for field in ('type', 'id', 'byte_pos', 'gen_nr'):
            setattr(self, field, None)

    def __str__(self):
        pieces = [
            'type = ' + str(self.type),
            'id = ' + str(self.id),
            'byte_pos = ' + str(self.byte_pos),
            'gen_nr = ' + str(self.gen_nr),
        ]
        return 'XrefItem(' + ', '.join(pieces) + ')'


def readNumber(string_, position):
    """Parse the number that starts at index *position* of *string_*.

    The number is the longest run of digits, '.', '+' and '-' from that
    index; it may extend to the very end of the string.

    Returns a tuple ``(end, value)``: *end* is the index just past the
    number, *value* is a float when the run contains a '.', else an int.
    """
    number_chars = string.digits + '.+-'
    end = position
    is_float = False
    # The length check lets a number end exactly at the end of the string
    # instead of raising IndexError.
    while end < len(string_) and string_[end] in number_chars:
        if string_[end] == '.':
            is_float = True
        end += 1
    if is_float:
        return end, float(string_[position : end])
    else:
        return end, int(string_[position : end])

class Xref:
    """Cross-reference table assembled from one or more ``xref`` sections."""

    def __init__(self):
        # Indexed by the tuple (id, generation_number).
        self.objs = {}

    def parseXref(self, token_reader):
        """Parse an ``xref`` section, leaving the file ready for the trailer."""
        expect(token_reader.nextToken().t, 'xref')
        while True:
            first_tok = token_reader.nextToken()
            expect(first_tok.t, 'NUMBER')
            count_tok = token_reader.nextToken()
            expect(count_tok.t, 'NUMBER')
            skipSpaces(token_reader.pdf_file)
            # Entries are read as fixed 20-byte records.
            raw = token_reader.pdf_file.read(20 * int(count_tok.val))
            self.addBlock(int(first_tok.val), int(count_tok.val), raw)
            skipSpaces(token_reader.pdf_file)
            upcoming = chr(token_reader.pdf_file.peek(1)[0])
            if not upcoming.isdigit():
                # Anything but a digit is left untouched for the trailer.
                return

    def addBlock(self, start_id, nr_of_lines, str_block):
        """Register *nr_of_lines* consecutive 20-byte entries from *str_block*."""
        for offset in range(nr_of_lines):
            begin = 20 * offset
            self.addObj(offset + start_id, str_block[begin:begin + 20])

    def addObj(self, id, str_line):
        """Register one entry line under (id, generation) unless already known."""
        # All chars will be ASCII anyway, so the bytes are simply mapped to
        # chars; 'ascii' is avoided because it rejects bytes > 127.
        text = str(str_line, 'iso-8859-1')
        entry = XrefItem()
        entry.type = text[17]
        entry.id = id
        if entry.type == 'n':
            entry.byte_pos = readNumber(text, 0)[1]
        entry.gen_nr = readNumber(text, 11)[1]
        # According to the author's tests an existing entry must not be
        # overwritten: the first one inserted is the one that stays, even if
        # following the trailer << /Prev nnnn >> chain finds redefinitions.
        self.objs.setdefault((entry.id, entry.gen_nr), entry)

    def getObj(self, reference):
        """Return the entry for *reference* (uses its obj_num and gen_num)."""
        key = (reference.obj_num, reference.gen_num)
        return self.objs[key]

    def __str__(self):
        return dictToStr(self.objs)

def dictToStr(d):
    """Render *d* as ``{key: value,\\nkey: value}`` with keys in sorted order."""
    body = ',\n'.join(str(key) + ': ' + str(d[key]) for key in sorted(d))
    return '{' + body + '}'


def parseTrailer(token_reader):
    """Consume ``trailer <<`` and return the trailer dictionary that follows."""
    for wanted in ('trailer', '<<'):
        expect(token_reader.nextToken().t, wanted)
    return parseDict(token_reader)

def expect(tok1, tok2):
    """Raise Exception when the token type found (*tok1*) is not *tok2*.

    Both arguments must be str, since they are joined into the message.
    """
    if tok1 != tok2:
        raise Exception(''.join(['Encontrou ', tok1, ' quando esperava ', tok2]))

def readIndirectObj(token_reader, xref_item):
    """Parse the object that *xref_item* points to.

    Returns None for entries whose type is not ``'n'``; otherwise seeks to
    the entry's byte position and returns whatever parseObject produces.
    """
    if xref_item.type != 'n':
        return None
    token_reader.pdf_file.seek(xref_item.byte_pos, os.SEEK_SET)
    return parseObject(token_reader)

# May return a Token (for objects made of a single token), a dict, a list or
# a Reference; a stream comes back as its dictionary. A bit confusing --
# improve if possible.
def parseObject(token_reader):
    """Parse one PDF object starting at the reader's current position."""

    def obj():
        # Body of an indirect object: the object itself, then 'endobj'.
        inner = parseObject(token_reader)
        expect(token_reader.nextToken().t, 'endobj')
        return inner

    first = token_reader.nextToken()
    kind = first.t

    if kind == '[':
        return parseArray(token_reader)

    if kind == 'obj':
        return obj()

    if kind == '<<':
        dictionary = parseDict(token_reader)
        lookahead = token_reader.nextToken()
        if lookahead.t == 'stream':
            return readStream(token_reader, dictionary)
        token_reader.unreadToken(lookahead)
        return dictionary

    if kind != 'NUMBER':
        return first

    # A number may stand alone, open an indirect object ("n g obj") or be an
    # indirect reference ("n g R").
    second = token_reader.nextToken()
    if second.t != 'NUMBER':
        token_reader.unreadToken(second)
        return first

    third = token_reader.nextToken()
    if third.t == 'obj':
        return obj()
    if third.t == 'R':
        return Reference(first.val, second.val)

    # Plain number: push the look-ahead back, last one read first.
    token_reader.unreadToken(third)
    token_reader.unreadToken(second)
    return first

def parseArray(token_reader):
    """Parse array elements up to the closing ``]`` and return them as a list.

    The opening ``[`` must already have been consumed.
    """
    items = []
    tok = token_reader.nextToken()
    while tok.t != ']':
        # Not the end yet: hand the token back so parseObject sees it.
        token_reader.unreadToken(tok)
        items.append(parseObject(token_reader))
        tok = token_reader.nextToken()
    return items

def readStream(token_reader, dictionary):
    """Skip over a stream body and return the stream's *dictionary*.

    Called right after the ``stream`` keyword. The stream data is not read:
    the position is advanced by the dictionary's Length (resolved through the
    xref table when it is a Reference) and ``endstream`` is then required.
    """
    pdf_file = token_reader.pdf_file

    # Bytes to skip after 'stream': two when a CR is next (taken as CRLF),
    # one for a bare LF, none otherwise.
    upcoming = chr(pdf_file.peek(1)[0])
    eol_width = {'\r': 2, '\n': 1}.get(upcoming, 0)
    if eol_width:
        pdf_file.seek(eol_width, os.SEEK_CUR)

    len_obj = dictionary['Length']
    if isinstance(len_obj, Reference):
        # Resolving the reference moves the file position; restore it.
        resume_at = pdf_file.tell()
        len_obj = findIndirectObject(token_reader, len_obj)
        pdf_file.seek(resume_at, os.SEEK_SET)
    pdf_file.seek(len_obj.val, os.SEEK_CUR)
    expect(token_reader.nextToken().t, 'endstream')
    return dictionary  # We only pretended to read the stream

def parseDict(token_reader):
    """Parse dictionary entries up to the closing ``>>`` and return a dict.

    The opening ``<<`` must already have been consumed. Keys are the ``val``
    of each key token; values are whatever parseObject returns.
    """
    entries = {}
    key_tok = token_reader.nextToken()
    while key_tok.t != '>>':
        value = parseObject(token_reader)
        entries[key_tok.val] = value
        # A '>>' where a value was expected means a key has no value.
        if isinstance(value, Token) and value.t == '>>':
            raise Exception('>> inesperado em pos = ' + str(token_reader.pdf_file.tell()))
        key_tok = token_reader.nextToken()
    return entries

class Reference:
    """Indirect reference: an object number plus a generation number."""

    def __init__(self, obj_num, gen_num):
        self.obj_num, self.gen_num = obj_num, gen_num

    def __str__(self):
        numbers = ', '.join(map(str, (self.obj_num, self.gen_num)))
        return 'Reference(' + numbers + ')'

def findIndirectObject(token_reader, ref):
    """Look *ref* up in the reader's xref table and parse the object found."""
    return readIndirectObj(token_reader, token_reader.xref.getObj(ref))

def get_number_of_pages(file_name):
    """Return the number of pages of the PDF file *file_name*.

    The count comes from 'Count' of the page tree root, reached through the
    xref table and the trailer's 'Root'. When ``startxref`` is 0 or does not
    point at an ``xref`` keyword, the 'N' entry of a 'Linearized' first
    object is used instead.

    Raises Exception when the file does not start with ``%PDF-``, when no
    ``startxref ... %%EOF`` is found in the last 400 bytes, or when the page
    count cannot be located.
    """
    with open(file_name, 'rb') as pdf_file:
        comeco_pdf = pdf_file.read(5)
        if not comeco_pdf.startswith(b'%PDF-'):
            raise Exception('Não é um arquivo PDF')

        end_buffer_size = min(400, os.stat(file_name).st_size)

        pdf_file.seek(-end_buffer_size, os.SEEK_END)
        startxref = str(pdf_file.read(end_buffer_size), 'iso-8859-1')
        match_startxref = re.search(r'startxref\s+(\d+)\s+%%EOF\s*', startxref)
        if match_startxref is None:
            raise Exception('Não achou o número de páginas (startxref)')

        token_reader = TokenReader(pdf_file)

        def linearized():
            pdf_file.seek(0, os.SEEK_SET)
            first_obj = parseObject(token_reader)
            # parseObject may return a Token or a list; only a dict can carry
            # the Linearized key.
            if isinstance(first_obj, dict) and 'Linearized' in first_obj:
                return int(first_obj['N'].val)
            raise Exception('Não achou o número de páginas (/Linearized)')

        pos = int(match_startxref.group(1))
        if pos == 0:
            return linearized()

        pdf_file.seek(pos, os.SEEK_SET)
        token = token_reader.nextToken()
        if token.t != 'xref':
            return linearized()
        token_reader.unreadToken(token)

        trailers = []
        xref = Xref()
        token_reader.xref = xref
        visited = {pos}  # xref offsets already parsed, to survive Prev cycles
        while True:
            xref.parseXref(token_reader)  # leaves the file ready for the trailer
            trailer = parseTrailer(token_reader)
            trailers.append(trailer)
            if 'Prev' not in trailer:
                break
            prev_pos = int(trailer['Prev'].val)
            if prev_pos in visited:
                # Circular Prev chain: every section is already loaded.
                break
            visited.add(prev_pos)
            pdf_file.seek(prev_pos, os.SEEK_SET)

        # The first trailer that has Root provides the catalog.
        catalog = None
        for tr in trailers:
            if 'Root' in tr:
                catalog = findIndirectObject(token_reader, tr['Root'])
                break
        if catalog is None:
            raise Exception('Não achou o número de páginas (/Root)')

        pages = findIndirectObject(token_reader, catalog['Pages'])
        if pages is None:
            raise Exception('Não achou o número de páginas (/Pages)')
        return int(pages['Count'].val)

if __name__ == "__main__":
    # Command-line entry point: print the page count of every file given,
    # carrying on after a failure, and print a summary when more than one
    # file was requested.
    file_names = sys.argv[1:]
    if not file_names:
        print('Uso:', sys.argv[0], '<nome-arquivo.pdf>')
        sys.exit(1)
    sucessos = 0
    falhas = 0
    for file_name in file_names:
        try:
            page_count = get_number_of_pages(file_name)
            print('Número de páginas de', file_name + ':', page_count)
        except Exception as e:
            print(str(e))
            print('Ocorreu um erro ao processar {0}. Continuando...'.format(file_name))
            falhas += 1
        else:
            sucessos += 1
    if len(file_names) > 1:
        print("Fim do processo. {0} sucesso(s), {1} falha(s).".format(sucessos, falhas))
Anúncios

3 pensamentos sobre “Python 3: PdfPageCount.py”

  1. Python vs. PL/SQL vs. F# « Visions of hope

  2. Comparação de desempenho lendo PDFs « Visions of hope

  3. Comparação de desempenho (2) « Visions of hope

Deixe um comentário

Preencha os seus dados abaixo ou clique em um ícone para log in:

Logotipo do WordPress.com

Você está comentando utilizando sua conta WordPress.com. Sair / Alterar )

Imagem do Twitter

Você está comentando utilizando sua conta Twitter. Sair / Alterar )

Foto do Facebook

Você está comentando utilizando sua conta Facebook. Sair / Alterar )

Foto do Google+

Você está comentando utilizando sua conta Google+. Sair / Alterar )

Conectando a %s