Boo: PdfPageCount.boo

Está meio ruim de ler aqui na página por causa do tamanho das letras e do tamanho das linhas. Sugiro copiar e colar para um editor com sintaxe colorida. Obs.: eu testei com vários arquivos PDF, mas é certo que há erros à espreita. Considere-se avisado.

Versão usada nos testes de 2010-10-27:
BitBucket – PdfPageCount.boo v2010-10-27

/* Este programa mostra na tela o número de páginas de um
arquivo PDF passado na linha de comando.
Este código fonte é de domínio público (PUBLIC DOMAIN)
Desenvolvido em Boo 0.9.3
Marcus. */

namespace Pdf.BooPdfPageCount

import System
import System.IO
import System.Collections.Generic

# Tells whether ch terminates a bare word: true for any whitespace character
# and for each of the characters ( ) < > [ ] { } / %.
def isDelimiter(ch as char):
    if Char.IsWhiteSpace(ch):
        return true
    return ch in '()<>[]{}/%'

# Advances pdfFile past a run of whitespace bytes, leaving the stream
# positioned on the first non-whitespace byte (or at end of file).
def skipSpaces(pdfFile as FileStream):
    while true:
        by_ = pdfFile.ReadByte()
        if by_ < 0:
            # End of file: ReadByte consumed nothing, so there is no byte to
            # push back. Seeking here would un-read a byte we never read.
            break
        if not Char.IsWhiteSpace(cast(char, by_)):
            # Push the non-whitespace byte back for the next reader.
            pdfFile.Seek(-1L, SeekOrigin.Current)
            break

# Parser assertion: raises an Exception naming both values when the token
# kind that was found differs from the one that was expected.
def expect(found as string, expected as string):
    if found != expected:
        raise Exception("Encontrou ${found} quando esperava ${expected}")

# A lexical token produced by PdfTokenReader.NextToken.
class PdfToken:
    # Token kind. The tokenizer emits 'NUMBER', 'NAME', 'STRING', 'HX_STRING',
    # 'CODE', 'R', the punctuation '<<', '>>', '>', '[', ']', and the keywords
    # 'obj', 'endobj', 'stream', 'endstream', 'true', 'false', 'null', 'xref'
    # and 'trailer'.
    public t as string
    # Raw text of the token for the kinds that carry one (NUMBER, NAME, STRING,
    # HX_STRING, CODE); the tokenizer leaves it unset for the other kinds.
    public val as string

# Tokenizer for PDF syntax that reads bytes straight from a FileStream.
# Tokens can be pushed back with UnreadToken; they are handed out again by
# NextToken in last-in, first-out order.
class PdfTokenReader:
    # Underlying stream; parsers also seek on it directly.
    public pdfFile as FileStream
    # Cross-reference table used by the parser functions to resolve indirect
    # references; it is assigned by the caller, not by this class.
    public xref as PdfXref = null
    # Stack of pushed-back tokens.
    private tokenBuffer = []

    def constructor(aPdfFile):
        pdfFile = aPdfFile

    # Pushes a token back so that the next NextToken call returns it.
    def UnreadToken(token):
        tokenBuffer.Add(token)

    # Reads one byte, raising EndOfStreamException instead of returning -1.
    # context names what was being read and is used in the error message.
    private def ReadByteOrFail(context as string) as int:
        by = pdfFile.ReadByte()
        if by < 0:
            raise EndOfStreamException("Fim de arquivo inesperado ao ler ${context}")
        return by

    # Returns the next token, taking pushed-back tokens first. Comments and
    # whitespace between tokens are skipped.
    # Raises EndOfStreamException when the file ends before a token starts or
    # in the middle of a string/hex string/code block, and Exception when a
    # bare word is not one of the known keywords.
    def NextToken() as PdfToken:
        if tokenBuffer.Count != 0:
            return tokenBuffer.Pop()


        # Reads a bare word and turns it into a keyword token.
        def trataWord():
            word = ReadWord()
            if word in ('obj', 'endobj', 'stream', 'endstream',
                    'true', 'false', 'null', 'xref', 'trailer'):
                return PdfToken(t: word)
            raise Exception("Error at byte ${pdfFile.Position}, lendo ${word}")


        # Skip comments and whitespace until the first byte of a token.
        while true:
            by = ReadByteOrFail('token')
            ch = cast(char, by)
            if ch == char('%'):
                SkipComment()
            elif Char.IsWhiteSpace(ch):
                skipSpaces(pdfFile)
            else:
                break


        if Char.IsDigit(ch) or ch in '.+-':
            pdfFile.Seek(-1L, SeekOrigin.Current)
            return PdfToken(t: 'NUMBER', val: ReadNumber())
        elif ch == char('<'):
            by = pdfFile.ReadByte()
            if by == char('<'):
                return PdfToken(t: '<<')
            # Only push the lookahead byte back if one was actually read.
            if by >= 0:
                pdfFile.Seek(-1L, SeekOrigin.Current)
            return PdfToken(t: 'HX_STRING', val: ReadHxString())
        elif ch == char('>'):
            by = pdfFile.ReadByte()
            if by == char('>'):
                return PdfToken(t: '>>')
            if by >= 0:
                pdfFile.Seek(-1L, SeekOrigin.Current)
            # Should not happen, because ReadHxString consumes the '>'.
            return PdfToken(t: '>')
        elif ch == char('('):
            return PdfToken(t: 'STRING', val: ReadString())
        elif ch == char('/'):
            return PdfToken(t: 'NAME', val: ReadName())
        elif ch == char('['):
            return PdfToken(t: '[')
        elif ch == char(']'):
            return PdfToken(t: ']')
        elif ch == char('{'):
            return PdfToken(t: 'CODE', val: ReadCode())
        elif ch == char('R'):
            # 'R' is the reference operator only when it stands alone;
            # otherwise it is the first letter of a bare word.
            by = pdfFile.ReadByte()
            if by < 0:
                return PdfToken(t: 'R')
            pdfFile.Seek(-1L, SeekOrigin.Current)
            if isDelimiter(cast(char, by)):
                return PdfToken(t: 'R')
            else:
                # Push the 'R' itself back so ReadWord sees the whole word.
                pdfFile.Seek(-1L, SeekOrigin.Current)
                return trataWord()
        else:
            pdfFile.Seek(-1L, SeekOrigin.Current)
            return trataWord()

    # Reads characters up to (not including) the next delimiter or the end
    # of the file and returns them as a string.
    def ReadWord():
        result = List of char()
        while true:
            by = pdfFile.ReadByte()
            if by < 0:
                break
            ch = cast(char, by)
            if isDelimiter(ch):
                # Leave the delimiter for the next reader.
                pdfFile.Seek(-1L, SeekOrigin.Current)
                break
            result.Add(ch)
        return join(result, '')

    # Reads a literal string whose opening '(' was already consumed, up to
    # and including the matching ')'. Nested parentheses are kept in the
    # result; backslash escapes are decoded, except octal escapes, which are
    # kept verbatim as a backslash followed by the digit.
    def ReadString():
        parLevel = 0
        escaped = false
        result = List of char()
        while true:
            by = ReadByteOrFail('string')
            if escaped:
                if by == char('n'):
                    result.Add(char('\n'))
                elif by == char('r'):
                    result.Add(char('\r'))
                elif by == char('t'):
                    result.Add(char('\t'))
                elif by == char('b'):
                    result.Add(char('\b'))
                elif by == char('f'):
                    result.Add(char('\f'))
                elif Char.IsDigit(cast(char, by)):
                    # Octal escapes are not supported yet.
                    result.Add(char('\\'))
                    result.Add(cast(char, by))
                elif by == char('\n'):
                    # Backslash + end of line is a line continuation.
                    pass
                elif by == char('\r'):
                    # Line continuation with CR or CRLF: swallow the LF too.
                    nxt = pdfFile.ReadByte()
                    if nxt >= 0 and nxt != char('\n'):
                        pdfFile.Seek(-1L, SeekOrigin.Current)
                else:
                    # '\(' , '\)' , '\\' and any unknown escape: the backslash
                    # is dropped and the character itself is kept.
                    result.Add(cast(char, by))
                escaped = false
            elif by == char('\\'):
                escaped = true
            elif by == char('('):
                ++parLevel
                result.Add(char('('))
            elif by == char(')'):
                parLevel -= 1
                if parLevel < 0:
                    return join(result, '')
            else:
                result.Add(cast(char, by))

    # Reads a name whose leading '/' was already consumed.
    def ReadName():
        return ReadWord()

    # Reads a hexadecimal string whose opening '<' was already consumed and
    # returns the raw characters before the closing '>', which is consumed.
    def ReadHxString():
        result = List of char()
        by = ReadByteOrFail('hex string')
        while by != char('>'):
            result.Add(cast(char, by))
            by = ReadByteOrFail('hex string')
        return join(result, '')

    # Reads the longest run of digits and the characters . + - and returns
    # it as text; the caller parses it.
    def ReadNumber():
        result = List of char()
        while true:
            by = pdfFile.ReadByte()
            if by < 0:
                break
            ch = cast(char, by)
            if not (Char.IsDigit(ch) or ch in '.+-'):
                # Leave the terminating byte for the next reader.
                pdfFile.Seek(-1L, SeekOrigin.Current)
                break
            result.Add(ch)
        return join(result, '')

    # Skips the rest of a comment: everything up to and including the end of
    # line (CR, LF or CRLF), or up to the end of the file.
    def SkipComment():
        while true:
            by = pdfFile.ReadByte()
            if by < 0:
                # A comment may legitimately run to the end of the file.
                return
            if by == char('\r'):
                by = pdfFile.ReadByte()
                if by >= 0 and by != char('\n'):
                    pdfFile.Seek(-1L, SeekOrigin.Current)
                return
            elif by == char('\n'):
                return


    # Reads a brace-delimited block whose opening '{' was already consumed,
    # up to and including the matching '}'. Nested braces are kept in the
    # result.
    def ReadCode():
        parLevel = 0
        result = List of char()
        while true:
            by = ReadByteOrFail('code')
            if by == char('{'):
                ++parLevel
                result.Add(char('{'))
            elif by == char('}'):
                parLevel -= 1
                if parLevel < 0:
                    return join(result, '')
                else:
                    result.Add(cast(char, by))
            else:
                result.Add(cast(char, by))


# One entry of the cross-reference table, as filled in by PdfXref.AddObj.
class PdfXrefItem:
    # Byte at index 17 of the 20-byte xref line; compared against 'n' to
    # decide whether the entry has a usable byte position.
    public type as byte
    # Object number of the entry.
    public id as long
    # Offset of the object from the start of the file; only assigned when
    # type is 'n'.
    public bytePos as long
    # Generation number of the entry.
    public genNum as int


# Cross-reference table: maps (object number, generation number) to the
# PdfXrefItem that was registered first for that pair.
class PdfXref:
    private objs = {}

    # Parses one classic 'xref' section (all of its subsections) starting at
    # the current token position, and leaves the stream positioned so that
    # the trailer can be read next.
    # Raises Exception when the expected tokens are not found or when the
    # file ends in the middle of a subsection.
    def ParseXref(tokenReader as PdfTokenReader):
        token = tokenReader.NextToken()
        expect(token.t, 'xref')
        while true:
            # Subsection header: first object number and number of entries.
            token = tokenReader.NextToken()
            expect(token.t, 'NUMBER')
            startId = long.Parse(token.val)
            token = tokenReader.NextToken()
            expect(token.t, 'NUMBER')
            nrOfLines = int.Parse(token.val)
            skipSpaces(tokenReader.pdfFile)
            if nrOfLines != 0:
                # Every entry is exactly 20 bytes long.
                expectedNrOfBytes = 20 * nrOfLines
                bytesXref = array(byte, expectedNrOfBytes)
                # Stream.Read may return fewer bytes than requested, so keep
                # reading until the buffer is full or the file ends.
                filled = 0
                while filled < expectedNrOfBytes:
                    justRead = tokenReader.pdfFile.Read(bytesXref, filled, expectedNrOfBytes - filled)
                    if justRead <= 0:
                        raise Exception("Tabela xref truncada: esperava ${expectedNrOfBytes} bytes, leu ${filled}")
                    filled += justRead
                self.AddBlock(startId, nrOfLines, bytesXref)
            # Peek one byte: a digit means another subsection follows.
            by = tokenReader.pdfFile.ReadByte()
            if by < 0:
                # End of file: nothing was read, so nothing to push back.
                return
            tokenReader.pdfFile.Seek(-1L, SeekOrigin.Current)
            if not Char.IsDigit(cast(char, by)):
                return # ready to read the trailer

    # Registers nrOfLines consecutive entries taken from strBlock, numbering
    # them from startId.
    private def AddBlock(startId as long, nrOfLines as int, strBlock as (byte)):
        for i in range(0, nrOfLines):
            AddObj(i + startId, strBlock[20*i : 20*(i+1)])

    # Decodes one 20-byte xref line and registers it under (id, generation)
    # unless that pair is already present.
    private def AddObj(id as long, bytesLine as (byte)):
        xrefItem = PdfXrefItem()
        xrefItem.type = bytesLine[17]
        charLine = array(char, c for c in bytesLine)
        strLine = string(charLine)
        xrefItem.id = id
        if xrefItem.type == char('n'):
            # The offset field has 10 digits, which does not fit in an int
            # for files larger than 2 GB; bytePos is a long, so parse a long.
            xrefItem.bytePos = long.Parse(strLine[:10])
        xrefItem.genNum = int.Parse(strLine[11:16])
        reference = (xrefItem.id, xrefItem.genNum)
        if reference not in objs:
            # In my tests, an entry that was already inserted must not be
            # overwritten. The first one in is the one that must stay in the
            # xref, even if following the trailer << /Prev nnnn >> chain we
            # find redefinitions.
            objs[reference] = xrefItem

    # Returns the entry registered for the reference's object and generation
    # numbers, as stored by AddObj.
    def getObj(reference as PdfReference):
        return objs[(reference.objNum, reference.genNum)]


# Parses "trailer << ... >>" at the current token position and returns the
# trailer dictionary. Raises Exception (via expect) when either the keyword
# or the opening '<<' is missing.
def parseTrailer(tokenReader as PdfTokenReader):
    expect(tokenReader.NextToken().t, 'trailer')
    expect(tokenReader.NextToken().t, '<<')
    return parseDict(tokenReader)


# Loads the object described by an xref entry. Entries whose type is not 'n'
# have no byte position to read from, so null is returned for them; for the
# others the stream is moved to the entry's offset and the object is parsed.
def readIndirectObj(tokenReader as PdfTokenReader, xrefItem as PdfXrefItem):
    if xrefItem.type != char('n'):
        return null
    tokenReader.pdfFile.Seek(xrefItem.bytePos, SeekOrigin.Begin)
    return parseObject(tokenReader)

# Reference to an indirect object ("objNum genNum R"), as built by
# parseObject and looked up through PdfXref.getObj.
struct PdfReference:
    # Object number (first number of the reference).
    public objNum as long
    # Generation number (second number of the reference).
    public genNum as int

# Parses one PDF object at the current token position and returns it:
#   - a list for an array, a Hash for a dictionary (or for a stream, whose
#     data is skipped),
#   - a PdfReference for "n g R",
#   - the content of an indirect object for "n g obj ... endobj" or
#     "obj ... endobj",
#   - the PdfToken itself for everything else (numbers, names, strings...).
def parseObject(tokenReader as PdfTokenReader) as object:
    # Body of an indirect object: one nested object followed by 'endobj'.
    def obj():
        inner = parseObject(tokenReader)
        expect(tokenReader.NextToken().t, 'endobj')
        return inner

    first = tokenReader.NextToken()
    kind = first.t

    if kind == '[':
        return parseArray(tokenReader)

    if kind == 'obj':
        return obj()

    if kind == '<<':
        dictionary = parseDict(tokenReader)
        following = tokenReader.NextToken()
        if following.t == 'stream':
            return readStream(tokenReader, dictionary)
        tokenReader.UnreadToken(following)
        return dictionary

    if kind != 'NUMBER':
        return first

    # A number may stand alone, or start an indirect object definition or a
    # reference to one; up to two tokens of lookahead decide which.
    second = tokenReader.NextToken()
    if second.t != 'NUMBER':
        tokenReader.UnreadToken(second)
        return first

    third = tokenReader.NextToken()
    if third.t == 'obj':
        return obj()
    if third.t == 'R':
        return PdfReference(objNum: long.Parse(first.val), genNum: int.Parse(second.val))

    # Plain number: push the lookahead back, last token first, so that the
    # tokens come out again in their original order.
    tokenReader.UnreadToken(third)
    tokenReader.UnreadToken(second)
    return first

# Parses the elements of an array whose opening '[' was already consumed and
# returns them as a list; the closing ']' is consumed.
def parseArray(tokenReader as PdfTokenReader):
    items = []
    token = tokenReader.NextToken()
    while token.t != ']':
        # Not the end yet: hand the token back and parse a whole element.
        tokenReader.UnreadToken(token)
        items.Add(parseObject(tokenReader))
        token = tokenReader.NextToken()
    return items

# Skips over the data of a stream whose 'stream' keyword was just consumed,
# checks that 'endstream' follows, and returns the stream dictionary. The
# stream data itself is not read.
# dictionary must contain 'Length', either as a direct number or as a
# reference to one. Raises Exception when 'Length' is missing or cannot be
# resolved, or when 'endstream' is not found after the data.
def readStream(tokenReader as PdfTokenReader, dictionary as Hash):
    # Consume the end-of-line after 'stream': CRLF or LF. A lone CR is
    # tolerated, in which case the byte after it already belongs to the data
    # and must not be consumed.
    by = tokenReader.pdfFile.ReadByte()
    if by == char('\r'):
        nxt = tokenReader.pdfFile.ReadByte()
        if nxt >= 0 and nxt != char('\n'):
            tokenReader.pdfFile.Seek(-1L, SeekOrigin.Current)
    elif by >= 0 and by != char('\n'):
        # No end-of-line at all: the byte read is data, push it back.
        tokenReader.pdfFile.Seek(-1L, SeekOrigin.Current)

    lenObj as duck = dictionary['Length']
    if lenObj is null:
        raise Exception('Stream sem /Length em pos = ${tokenReader.pdfFile.Position}')
    if lenObj isa PdfReference:
        # Resolving the reference moves the stream, so remember where the
        # data starts and come back afterwards.
        savedPos = tokenReader.pdfFile.Position
        lenObj = findIndirectObject(tokenReader, lenObj)
        tokenReader.pdfFile.Seek(savedPos, SeekOrigin.Begin)
        if lenObj is null:
            raise Exception('Não conseguiu resolver o /Length do stream')
    tokenReader.pdfFile.Seek(long.Parse(lenObj.val), SeekOrigin.Current)
    token = tokenReader.NextToken()
    expect(token.t, 'endstream')
    return dictionary # We only pretended to read the stream

# Parses the entries of a dictionary whose opening '<<' was already consumed
# and returns them as a Hash keyed by the key token's text; the closing '>>'
# is consumed. Raises Exception when a key is followed directly by '>>'
# instead of a value.
def parseDict(tokenReader as PdfTokenReader):
    entries = {}
    while true:
        keyToken = tokenReader.NextToken()
        if keyToken.t == '>>':
            return entries
        value = parseObject(tokenReader)
        entries[keyToken.val] = value
        # A '>>' in value position means the last key had no value.
        valueToken = value as PdfToken
        if valueToken is not null and valueToken.t == '>>':
            raise Exception('>> inesperado em pos = ${tokenReader.pdfFile.Position}')

# Resolves a reference: looks its entry up in the reader's xref table and
# loads the object that entry points to. Moves the stream position.
def findIndirectObject(tokenReader as PdfTokenReader, reference as PdfReference):
    return readIndirectObj(tokenReader, tokenReader.xref.getObj(reference))



# Returns the number of pages of the PDF file named fileName.
#
# The 'startxref' offset is taken from the last 400 bytes of the file. When
# it points to a classic 'xref' table, the table and every table chained
# through /Prev are loaded, and the count is read from /Count of the page
# tree root reached through the trailer's /Root catalog. Otherwise (offset 0
# or no 'xref' keyword there) the count is taken from /N of the first object
# of the file, which must be a /Linearized dictionary.
#
# Raises Exception when the file is not a PDF, when 'startxref' is not
# found, when the /Prev chain loops, or when the catalog, the page tree or
# the linearization dictionary cannot be found.
def getNumberOfPages(fileName) as int:
    using pdfFile = FileStream(fileName, FileMode.Open, FileAccess.Read):
        # Check the file signature.
        bytesComecoPdf = array(byte, 5)
        pdfFile.Read(bytesComecoPdf, 0, 5)
        charsComecoPdf = array(char, c for c in bytesComecoPdf)
        strComecoPdf = string(charsComecoPdf)
        if strComecoPdf != '%PDF-':
            raise Exception('Não é um arquivo PDF')

        # Look for 'startxref <offset> %%EOF' in the tail of the file.
        endBufferSize = FileInfo(fileName).Length
        endBufferSize = 400 if endBufferSize > 400

        pdfFile.Seek(-endBufferSize, SeekOrigin.End)
        bytesEndBuffer = array(byte, endBufferSize)
        pdfFile.Read(bytesEndBuffer, 0, endBufferSize)
        charsEndBuffer = array(char, c for c in bytesEndBuffer)
        startxref = string(charsEndBuffer)
        matchStartxref = /startxref\s+(\d+)\s+%%EOF\s*/.Match(startxref)
        if not matchStartxref.Success:
            raise Exception('Não achou o número de páginas (startxref)')
        tokenReader = PdfTokenReader(pdfFile)

        # Fallback: read the page count from the linearization dictionary,
        # the first object of the file.
        def linearized() as int:
            pdfFile.Seek(0, SeekOrigin.Begin)
            firstObj = cast(Hash, parseObject(tokenReader))
            if firstObj.ContainsKey('Linearized'):
                return int.Parse(cast(PdfToken, firstObj['N']).val)
            else:
                raise Exception('Não achou o número de páginas (/Linearized)')

        pos = long.Parse(matchStartxref.Groups[1].Value)
        if pos == 0:
            return linearized()
        else:
            pdfFile.Seek(pos, SeekOrigin.Begin)

            token = tokenReader.NextToken()
            if token.t == 'xref':
                tokenReader.UnreadToken(token)

                trailers = []
                xref = PdfXref()
                tokenReader.xref = xref
                # Offsets of the xref sections already loaded, so that a
                # /Prev chain that loops back cannot run forever.
                visitedXrefs = {}
                visitedXrefs[pos] = true
                while true:
                    xref.ParseXref(tokenReader) # leaves the stream at the trailer
                    trailer = parseTrailer(tokenReader)
                    trailers.Add(trailer)
                    if 'Prev' in trailer:
                        prevPos = long.Parse(cast(PdfToken, trailer['Prev']).val)
                        if visitedXrefs.ContainsKey(prevPos):
                            raise Exception('Ciclo de /Prev na tabela xref')
                        visitedXrefs[prevPos] = true
                        pdfFile.Seek(prevPos, SeekOrigin.Begin)
                    else:
                        break

                catalog as Hash = null

                # The first trailer that has /Root (newest first) wins.
                for tr as Hash in trailers:
                    if tr.ContainsKey('Root'):
                        catalogNum = tr['Root']
                        catalog = findIndirectObject(tokenReader, catalogNum)
                        break

                if catalog is null:
                    raise Exception('Não achou o catálogo do documento (/Root)')

                pagesNum = catalog['Pages']
                pages as Hash = findIndirectObject(tokenReader, pagesNum)
                if pages is null:
                    raise Exception('Não achou a árvore de páginas (/Pages)')

                # /Count is usually a direct number, but any dictionary value
                # may also be given as a reference to an indirect object.
                count as duck = pages['Count']
                if count isa PdfReference:
                    count = findIndirectObject(tokenReader, count)
                return int.Parse(cast(PdfToken, count).val)
            else:
                return linearized()



# Entry point: prints the page count of every file named on the command
# line. A failure on one file is reported and processing moves on to the
# next. Returns 1 when no file name was given and 0 otherwise; a summary
# line is printed only when more than one file was given.
def Main(argv as (string)):
    if argv.Length == 0:
        print "Uso: BooPdfPageCount <nome-arquivo.pdf>"
        return 1

    sucessos = 0
    falhas = 0
    for fileName in argv:
        try:
            print "Número de páginas de ${fileName}: ${getNumberOfPages(fileName)}"
            sucessos += 1
        except e as Exception:
            print e.ToString()
            print "Ocorreu um erro ao processar ${fileName}. Continuando..."
            falhas += 1

    if argv.Length >= 2:
        print "Fim do processo. ${sucessos} sucesso(s), ${falhas} falha(s)."
    return 0
Anúncios

Um pensamento sobre “Boo: PdfPageCount.boo”

  1. Comparação de desempenho (2) « Visions of hope

Deixe um comentário

Preencha os seus dados abaixo ou clique em um ícone para log in:

Logotipo do WordPress.com

Você está comentando utilizando sua conta WordPress.com. Sair / Alterar )

Imagem do Twitter

Você está comentando utilizando sua conta Twitter. Sair / Alterar )

Foto do Facebook

Você está comentando utilizando sua conta Facebook. Sair / Alterar )

Foto do Google+

Você está comentando utilizando sua conta Google+. Sair / Alterar )

Conectando a %s