F#: PdfPageCount.fs

O código está meio ruim de ler aqui na página, por causa do tamanho das letras e do comprimento das linhas. Sugiro copiar e colar em um editor com realce de sintaxe. Obs.: eu testei com vários arquivos PDF, mas é certo que ainda há erros à espreita. Considere-se avisado.

Versão usada nos testes de 2010-10-27:

module Pdf.PdfPageCount

(*
Este programa mostra na tela o número de páginas de um
arquivo PDF passado na linha de comando.
Este código fonte é de domínio público (PUBLIC DOMAIN)
Desenvolvido em F# 2.0
Marcus.
*)


open System
open System.IO
open System.Collections.Generic
open System.Text.RegularExpressions

// Tells whether `ch` ends a PDF word: true for the delimiter characters
// ( ) < > [ ] { } / % and for anything Char.IsWhiteSpace accepts.
let isDelimiter ch =
    match ch with
    | '(' | ')' | '<' | '>' | '[' | ']' | '{' | '}' | '/' | '%' -> true
    | other -> Char.IsWhiteSpace(other)

// Identifies an indirect object by object number and generation number;
// used as the lookup key of the cross-reference table.
type PdfReference = {objNum: int64; genNum: int64}

// A lexical token: `t` is the token type and `value` its text (null for
// tokens that carry no text).
// Storing the token type as a string is not type-safe at all...
// It is a trait inherited from the original Python 3 implementation.
type PdfToken = {t: string; value: string}

// The parsed representation of a PDF object as produced by parseObject.
type PdfObject =
    | PdfNumber of float // To be more accurate, int and float should be kept as distinct cases
    | PdfString of string
    | PdfHxString of string
    | PdfDict of Dictionary<string, PdfObject>
    | PdfArray of ResizeArray<PdfObject>
    | PdfReferenceObj of PdfReference
    | PdfTokenObj of PdfToken
    | PdfStream of Dictionary<string, PdfObject> * byte array

// Advances `aPdfFile` past consecutive white-space bytes (as judged by
// Char.IsWhiteSpace). On return the stream is positioned on the first
// non-white-space byte, or at end of file when only white space remained.
let rec skipSpaces (aPdfFile: FileStream) =
    let by = aPdfFile.ReadByte()
    if by = -1 then
        // End of file: ReadByte consumed nothing, so there is nothing to un-read.
        // Seeking back here would re-expose the last white-space byte and make
        // callers that loop on white space spin forever.
        ()
    elif Char.IsWhiteSpace(char by) then
        skipSpaces aPdfFile
    else
        // Not white space: step back so the caller sees this byte again.
        aPdfFile.Seek(-1L, SeekOrigin.Current) |> ignore

// Checks that the token type actually found (`tok1`) is the expected one
// (`tok2`); raises a failure describing both when they differ.
let expect tok1 tok2 =
    match tok1 = tok2 with
    | true -> ()
    | false -> failwith("Encontrou " + tok1 + " quando esperava " + tok2)

// Tokenizer over a PDF file stream.
// NextToken returns the next lexical token, taking pushed-back tokens first;
// the Read*/Skip* members consume raw bytes directly from the stream.
// Every read treats end of file (-1 from ReadByte) explicitly: nothing is
// un-read when nothing was consumed, and no loop waits for a terminator
// that can never arrive.
type PdfTokenReader(aPdfFile: FileStream) = class
    // Tokens pushed back by UnreadToken; the most recently pushed comes out first.
    let mutable tokenBuffer = []
    let mutable xref: PdfXref = null

    // Steps back over the byte just returned by ReadByte. At end of file (-1)
    // nothing was consumed, so the position is left untouched.
    let unread (by: int) =
        if by <> -1 then
            aPdfFile.Seek(-1L, SeekOrigin.Current) |> ignore

    // True for bytes that may be part of a numeric token: digits, '.', '+', '-'.
    let isNumberByte (by: int) =
        by <> -1 && (Char.IsDigit(char by) || by = int '.' || by = int '+' || by = int '-')

    // The underlying file stream.
    member self.PdfFile = aPdfFile
    // Cross-reference table associated with this reader (null until assigned).
    member self.Xref with get() = xref and set(value) = xref <- value
    // Pushes a token back so that the next NextToken call returns it.
    member self.UnreadToken token = tokenBuffer <- token :: tokenBuffer

    // Returns the next token. White space and '%' comments before it are skipped.
    // Fails when the end of the file is reached or an unknown word is found.
    member self.NextToken() :PdfToken =
        match tokenBuffer with
        | h :: t ->
            tokenBuffer <- t
            h
        | [] ->
            // Skip any run of white space and comments preceding the token.
            let mutable skipping = true
            while skipping do
                let by = aPdfFile.ReadByte()
                if by = int '%' then
                    self.SkipComment()
                elif by = -1 || not (Char.IsWhiteSpace(char by)) then
                    unread by
                    skipping <- false
            // Reads a bare word and accepts it only if it is a known keyword.
            let trataWord() =
                let word = self.ReadWord()
                match word with
                | "obj" | "endobj" | "stream" | "endstream" | "true" | "false" | "null" | "xref" | "trailer" ->
                    {t = word; value = null}
                | _ ->
                    failwith(sprintf "Erro no PDF no byte %d, lendo %s" aPdfFile.Position word)
            let by = aPdfFile.ReadByte()
            if by = -1 then
                failwith(sprintf "Fim de arquivo inesperado no byte %d" aPdfFile.Position)
            elif isNumberByte by then
                unread by
                {t = "NUMBER"; value = self.ReadNumber()}
            elif by = int '<' then
                let by2 = aPdfFile.ReadByte()
                if by2 = int '<' then
                    {t = "<<"; value = null}
                else
                    unread by2
                    {t = "HX_STRING"; value = self.ReadHxString()}
            elif by = int '>' then
                let by2 = aPdfFile.ReadByte()
                if by2 = int '>' then
                    {t = ">>"; value = null}
                else
                    unread by2
                    {t = ">"; value = null} // Should not happen: ReadHxString consumes the closing '>'
            elif by = int '(' then {t = "STRING"; value = self.ReadString()}
            elif by = int '/' then {t = "NAME"; value = self.ReadName()}
            elif by = int '[' then {t = "["; value = null}
            elif by = int ']' then {t = "]"; value = null}
            elif by = int '{' then {t = "CODE"; value = self.ReadCode()}
            elif by = int 'R' then
                // A lone 'R' (followed by a delimiter or end of file) is the
                // reference operator; otherwise it starts an ordinary word.
                let by2 = aPdfFile.ReadByte()
                unread by2
                if by2 = -1 || isDelimiter (char by2) then
                    {t = "R"; value = null}
                else
                    unread by // step back over the 'R' so the whole word is read
                    trataWord()
            else
                unread by
                trataWord()

    // Reads bytes up to (not including) the next delimiter or end of file.
    member self.ReadWord() =
        let result = new ResizeArray<char>()
        let mutable by = aPdfFile.ReadByte()
        while by <> -1 && not(isDelimiter(char by)) do
            result.Add(char by)
            by <- aPdfFile.ReadByte()
        unread by
        String.Join("", result)

    // Reads a run of digits, '.', '+' and '-' and returns it as text.
    member self.ReadNumber() :string =
        let result = new ResizeArray<char>()
        let mutable by = aPdfFile.ReadByte()
        while isNumberByte by do
            result.Add(char by)
            by <- aPdfFile.ReadByte()
        unread by
        String.Join("", result)

    // Skips the remainder of a comment: everything up to and including the
    // end-of-line marker (LF, CR or CR LF), or up to end of file.
    member self.SkipComment() =
        let mutable finished = false
        while not finished do
            let by = aPdfFile.ReadByte()
            if by = -1 || by = int '\n' then
                finished <- true
            elif by = int '\r' then
                // Treat CR LF as a single end-of-line marker.
                let next = aPdfFile.ReadByte()
                if next <> int '\n' then unread next
                finished <- true

    // Reads a brace-delimited block whose opening '{' was already consumed,
    // honouring nested braces. Stops at the matching '}' or at end of file.
    member self.ReadCode() =
        let mutable parLevel = 0
        let result = new ResizeArray<char>()
        let mutable finished = false
        while not finished do
            let by = aPdfFile.ReadByte()
            if by = -1 then finished <- true
            else
                if by = int '{' then
                    parLevel <- parLevel + 1
                    result.Add('{')
                elif by = int '}' then
                    parLevel <- parLevel - 1
                    if parLevel < 0 then
                        finished <- true
                    else
                        result.Add('}')
                else
                    result.Add(char by)
        String.Join("", result)

    // Reads a name whose leading '/' was already consumed.
    member self.ReadName() = self.ReadWord()

    // Reads a hexadecimal string whose opening '<' was already consumed;
    // consumes the closing '>' and returns the raw text between them.
    member self.ReadHxString() =
        let result = new ResizeArray<char>()
        let mutable by = aPdfFile.ReadByte()
        while by <> -1 && by <> int '>' do
            result.Add(char by)
            by <- aPdfFile.ReadByte()
        String.Join("", result)

    // Reads a literal string whose opening '(' was already consumed, honouring
    // balanced nested parentheses and backslash escapes. Stops at the matching
    // ')' or at end of file.
    member self.ReadString() =
        let mutable parLevel = 0
        let mutable escaped = false
        let result = new ResizeArray<char>()
        let mutable finished = false
        while not finished do
            let by = aPdfFile.ReadByte()
            if by = -1 then
                // Unterminated string: stop instead of appending forever.
                finished <- true
            elif escaped then
                match char by with
                | 'n' -> result.Add('\n')
                | 'r' -> result.Add('\r')
                | 't' -> result.Add('\t')
                | 'b' -> result.Add('\b')
                | 'f' -> result.Add('\f')
                | '\n' -> () // backslash + end of line: line continuation
                | '\r' ->
                    // backslash + CR or CR LF: line continuation
                    let next = aPdfFile.ReadByte()
                    if next <> int '\n' then unread next
                | d when Char.IsDigit(d) ->
                    // Octal escapes (\ddd) are not decoded yet; kept verbatim.
                    result.Add('\\')
                    result.Add(d)
                | other ->
                    // Covers \\ \( \) and unknown escapes, where the backslash
                    // is ignored and the character itself is kept.
                    result.Add(other)
                escaped <- false
            elif by = int '\\' then
                escaped <- true
            elif by = int '(' then
                parLevel <- parLevel + 1
                result.Add('(')
            elif by = int ')' then
                parLevel <- parLevel - 1
                if parLevel < 0 then
                    finished <- true
                else
                    result.Add(')')
            else
                result.Add(char by)
        String.Join("", result)
end
// One entry of the cross-reference table, as built by PdfXref.AddObj.
and PdfXrefItem = {
    // Entry type character copied from the xref line; readIndirectObj only accepts 'n'.
    type_: char
    // Object number.
    id: int64
    // Byte offset of the object in the file (0 for entries whose type is not 'n').
    bytePos: int64
    // Generation number.
    genNr: int64
}
// Cross-reference table: maps (object number, generation number) to the
// entry describing where the object lives in the file.
and [<AllowNullLiteral>]PdfXref() = class
    let objs = new Dictionary<PdfReference, PdfXrefItem>()

    // Reads exactly `count` bytes from `file`. Stream.Read is allowed to
    // return fewer bytes than requested, so it is called until the buffer
    // is full or the stream ends.
    let readExactly (file: FileStream) (count: int) =
        let buffer : byte[] = Array.zeroCreate count
        let mutable total = 0
        let mutable atEnd = false
        while total < count && not atEnd do
            let n = file.Read(buffer, total, count - total)
            if n <= 0 then atEnd <- true else total <- total + n
        if total <> count then
            failwith "Não conseguiu ler o xref"
        buffer

    // Parses a cross-reference section starting at the "xref" keyword and
    // registers all of its subsections. On return the stream is positioned
    // on the first non-digit token after the table (normally "trailer").
    member self.ParseXref(tokenReader: PdfTokenReader) =
        let token = tokenReader.NextToken()
        expect token.t "xref"
        let file = tokenReader.PdfFile
        let mutable moreSubsections = true
        while moreSubsections do
            // Subsection header: first object number and number of entries.
            let token = tokenReader.NextToken()
            expect token.t "NUMBER"
            let startId = int token.value

            let token = tokenReader.NextToken()
            expect token.t "NUMBER"
            let nrOfLines = int token.value
            if nrOfLines < 0 then
                failwith "Não conseguiu ler o xref"

            // Each entry occupies exactly 20 bytes.
            skipSpaces file
            let bytesXref = readExactly file (20 * nrOfLines)
            self.AddBlock(startId, nrOfLines, bytesXref)
            skipSpaces file
            // Peek: a digit means another subsection follows.
            let by = file.ReadByte()
            if by <> -1 then
                file.Seek(-1L, SeekOrigin.Current) |> ignore
            moreSubsections <- by <> -1 && Char.IsDigit(char by)

    // Registers `nrOfLines` consecutive 20-byte entries, numbered from `startId`.
    member self.AddBlock(startId: int, nrOfLines: int, bytesXref: byte[]) =
        for i=0 to nrOfLines-1 do
            self.AddObj(int64(i) + int64(startId), bytesXref.[20*i .. (20*(i+1) - 1)])

    // Registers one 20-byte entry: offset in columns 0-9, generation in
    // columns 11-15, type character in column 17. An entry already present
    // for the same (object, generation) key is kept, so tables parsed first
    // take precedence.
    member self.AddObj(id: int64, bytesLine: byte[]) =
        let strLine = new String(Array.map char bytesLine)
        let entryType = strLine.[17]
        let xrefItem = {
            type_ = entryType
            id = id
            bytePos = if entryType = 'n' then int64 strLine.[0..9] else 0L
            genNr = int64 strLine.[11..15]
        }
        let key = {objNum = xrefItem.id; genNum = xrefItem.genNr}
        if not(objs.ContainsKey key) then
            objs.[key] <- xrefItem

    // Returns the entry for `reference`.
    // Raises KeyNotFoundException (naming the object) when it is not in the table.
    member self.GetObj(reference: PdfReference) =
        match objs.TryGetValue(reference) with
        | true, item -> item
        | _ ->
            raise (KeyNotFoundException(sprintf "Objeto %d %d não encontrado no xref" reference.objNum reference.genNum))
end

// Parses one PDF object starting at the reader's next token.
// A number may turn out to be a plain number, the header of an indirect
// object ("N G obj ... endobj") or an indirect reference ("N G R"), so up to
// two further tokens are inspected and pushed back when not needed.
// Tokens that start no known object are returned wrapped in PdfTokenObj.
let rec parseObject(tokenReader: PdfTokenReader) :PdfObject =
    // Parses the body of an indirect object and consumes the closing "endobj".
    let parseIndirectBody() =
        let body = parseObject(tokenReader)
        expect (tokenReader.NextToken().t) "endobj"
        body
    let first = tokenReader.NextToken()
    match first.t with
    | "NUMBER" ->
        let second = tokenReader.NextToken()
        match second.t with
        | "NUMBER" ->
            let third = tokenReader.NextToken()
            match third.t with
            | "obj" -> parseIndirectBody()
            | "R" -> PdfReferenceObj({objNum = int64 first.value; genNum = int64 second.value})
            | _ ->
                // Just a number followed by other tokens: give both back.
                tokenReader.UnreadToken(third)
                tokenReader.UnreadToken(second)
                PdfNumber(float first.value)
        | _ ->
            tokenReader.UnreadToken(second)
            PdfNumber(float first.value)
    | "[" ->
        parseArray(tokenReader)
    | "<<" ->
        // A dictionary immediately followed by "stream" is a stream object.
        let dictionary = parseDict(tokenReader)
        let following = tokenReader.NextToken()
        match following.t with
        | "stream" -> readStream(tokenReader, dictionary)
        | _ ->
            tokenReader.UnreadToken(following)
            PdfDict(dictionary)
    | "obj" -> parseIndirectBody()
    | "STRING" -> PdfString(first.value)
    | "HX_STRING" -> PdfHxString(first.value)
    | _ -> PdfTokenObj(first)
// Parses the entries of a dictionary whose opening "<<" was already consumed,
// up to and including the closing ">>". Each entry is a name token followed
// by one object. Fails when a key is not a name or when ">>" appears where a
// value was expected.
and parseDict(tokenReader: PdfTokenReader) =
    let result = new Dictionary<string, PdfObject>()
    let mutable insideDict = true
    while insideDict do
        let keyToken = tokenReader.NextToken()
        if keyToken.t = ">>" then
            insideDict <- false
        else
            // Only names are valid keys; other tokens may carry a null value,
            // which the Dictionary would reject with an unrelated error.
            if keyToken.t <> "NAME" then
                failwith(sprintf "Chave de dicionário inválida (%s) em pos = %d" keyToken.t tokenReader.PdfFile.Position)
            match parseObject tokenReader with
            | PdfTokenObj({t = ">>"; value = _}) ->
                failwith(">> inesperado em pos = " + (string tokenReader.PdfFile.Position))
            | objeto ->
                result.[keyToken.value] <- objeto
    result
// Parses the elements of an array whose opening "[" was already consumed,
// up to and including the closing "]", and returns them as a PdfArray.
and parseArray(tokenReader: PdfTokenReader) =
    let items = new ResizeArray<PdfObject>()
    let mutable closed = false
    while not closed do
        let token = tokenReader.NextToken()
        if token.t = "]" then
            closed <- true
        else
            // Not the end yet: hand the token back and parse a full element.
            tokenReader.UnreadToken(token)
            items.Add(parseObject(tokenReader))
    PdfArray(items)
// Skips over the data of a stream object whose dictionary and "stream"
// keyword were already consumed, then consumes "endstream".
// The data is not loaded: the file position is simply advanced by /Length
// bytes, and the returned PdfStream carries null in place of the bytes.
// /Length may be a direct number or a reference to an indirect number.
and readStream(tokenReader: PdfTokenReader, dictionary: Dictionary<string, PdfObject>) =
    let file = tokenReader.PdfFile
    // Consume the end-of-line marker that follows the "stream" keyword.
    let by = file.ReadByte()
    if by = int '\r' then
        // Normally CR LF; if the CR stands alone, the next byte already
        // belongs to the stream data and must not be swallowed.
        let next = file.ReadByte()
        if next <> -1 && next <> int '\n' then
            file.Seek(-1L, SeekOrigin.Current) |> ignore
    elif by = int '\n' || by = -1 then
        () // LF consumed, or end of file where nothing was read
    else
        file.Seek(-1L, SeekOrigin.Current) |> ignore
    if not (dictionary.ContainsKey("Length")) then
        raise (KeyNotFoundException("Erro: stream sem /Length. Pos = " + (string file.Position)))
    let lenObj =
        match dictionary.["Length"] with
        | PdfReferenceObj(reference) ->
            // Resolving the reference moves the file position; restore it afterwards.
            let savedPos = file.Position
            let resolved = findIndirectObj tokenReader reference
            file.Seek(savedPos, SeekOrigin.Begin) |> ignore
            resolved
        | direct -> direct
    let length =
        match lenObj with
        | PdfNumber(nr) -> int64 nr
        | _ -> failwith("Erro: /Length de stream não é um número. Pos = " + (string file.Position))
    file.Seek(length, SeekOrigin.Current) |> ignore
    let token = tokenReader.NextToken()
    expect token.t "endstream"
    PdfStream(dictionary, null)
// Seeks to the byte offset recorded in `xrefItem` and parses the object
// found there. Fails for entries whose type is not 'n'.
and readIndirectObj (tokenReader: PdfTokenReader) xrefItem =
    match xrefItem.type_ with
    | 'n' ->
        tokenReader.PdfFile.Seek(xrefItem.bytePos, SeekOrigin.Begin) |> ignore
        parseObject(tokenReader)
    | _ ->
        failwith(sprintf "Procurando objeto inválido: %A" xrefItem)
// Looks up the reference `catalogNum` in the reader's cross-reference table
// and parses the object it points to.
and findIndirectObj (tokenReader: PdfTokenReader) catalogNum =
    tokenReader.Xref.GetObj(catalogNum) |> readIndirectObj tokenReader


// Parses a trailer: the "trailer" keyword followed by a dictionary, which
// is returned.
let parseTrailer(tokenReader: PdfTokenReader) =
    for expected in ["trailer"; "<<"] do
        expect (tokenReader.NextToken().t) expected
    parseDict tokenReader

// Returns the number of pages of the PDF file `fileName`.
// Strategy: locate the last "startxref" near the end of the file, parse the
// classic cross-reference table(s) and trailer(s) reachable from it, then
// read /Root -> /Pages -> /Count. When no classic table is found at that
// offset, fall back to the /N entry of a linearization dictionary in the
// first object of the file.
// Fails when the file is not a PDF or the page count cannot be found.
let getNumberOfPages fileName :int =
    let bufferSize = 5
    use pdfFile = new FileStream(fileName, FileMode.Open, FileAccess.Read)
    let comecoPdf: byte array = Array.zeroCreate bufferSize
    pdfFile.Read(comecoPdf, 0, bufferSize) |> ignore
    if comecoPdf <> "%PDF-"B then
        failwith "Não é um arquivo PDF"
    // Look for "startxref <offset> %%EOF" in the last 400 bytes (or fewer).
    let endBufferSize = int(min (FileInfo(fileName).Length) 400L)
    pdfFile.Seek(-(int64 endBufferSize), SeekOrigin.End) |> ignore
    let startxref : byte array = Array.zeroCreate endBufferSize
    pdfFile.Read(startxref, 0, endBufferSize) |> ignore
    let strStartxref = new String(Array.map char startxref)
    // RightToLeft: when the tail holds more than one startxref (incremental
    // updates), the last one is the one that applies.
    let regExpMatch = Regex.Match(strStartxref, @"startxref\s+(\d+)\s+%%EOF", RegexOptions.RightToLeft)
    if not regExpMatch.Success then
        failwith "Não achou o número de páginas (startxref)"
    let tokenReader = new PdfTokenReader(pdfFile)
    // Fallback: take the page count from the linearization dictionary.
    let linearized() :int =
        pdfFile.Seek(0L, SeekOrigin.Begin) |> ignore
        let firstObj = parseObject(tokenReader)
        match firstObj with
        | PdfDict(d) when d.ContainsKey("Linearized") ->
            match d.["N"] with
            | PdfNumber(n) -> int n
            | _ -> failwith "Não achou o número de páginas (/N)"
        | _ -> failwith "Não achou o número de páginas (/Linearized)"
    let pos = int64 regExpMatch.Groups.[1].Value
    if pos = 0L then
        linearized()
    else
        pdfFile.Seek(pos, SeekOrigin.Begin) |> ignore
        let token = tokenReader.NextToken()
        if token.t = "xref" then
            tokenReader.UnreadToken(token)
            let trailers = new ResizeArray<Dictionary<string, PdfObject>>()
            let xref = new PdfXref()
            tokenReader.Xref <- xref
            // Follow the /Prev chain, newest table first. Offsets already
            // visited are remembered so a circular chain cannot loop forever.
            let visited = new HashSet<int64>()
            visited.Add(pos) |> ignore
            let mutable morePrev = true
            while morePrev do
                xref.ParseXref(tokenReader)
                let trailer = parseTrailer(tokenReader)
                trailers.Add(trailer)
                if trailer.ContainsKey("Prev") then
                    let prevPos =
                        match trailer.["Prev"] with
                        | PdfNumber(n) -> int64 n
                        | _ -> failwith "/Prev não é um número"
                    if not (visited.Add(prevPos)) then
                        failwith "Cadeia de /Prev circular no xref"
                    pdfFile.Seek(prevPos, SeekOrigin.Begin) |> ignore
                else
                    morePrev <- false

            // The catalog is taken from the first trailer (newest first) that has /Root.
            let rec findCatalog idx =
                if idx >= trailers.Count then
                    failwith "Não achou o número de páginas (catalog)"
                elif trailers.[idx].ContainsKey("Root") then
                    match trailers.[idx].["Root"] with
                    | PdfReferenceObj(catalogNum) ->
                        findIndirectObj tokenReader catalogNum
                    | _ ->
                        failwith "/Root não é uma referência"
                else
                    findCatalog (idx + 1)
            // Follows an indirect reference; direct objects pass through unchanged.
            let resolve obj =
                match obj with
                | PdfReferenceObj(reference) -> findIndirectObj tokenReader reference
                | direct -> direct
            let catalog = findCatalog 0
            let pagesRef =
                match catalog with
                | PdfDict(d) -> d.["Pages"]
                | _ -> failwith "catalog não é um dict"
            match pagesRef with
            | PdfReferenceObj(reference) ->
                match findIndirectObj tokenReader reference with
                | PdfDict(d) ->
                    // /Count may itself be stored as an indirect object.
                    match resolve (d.["Count"]) with
                    | PdfNumber(n) -> int n
                    | _ -> failwith "Não achou o número de páginas. /Count não é um número."
                | _ -> failwith "pages não é um dict"
            | _ -> failwith "/Pages não é uma referência"
        else
            linearized()

// Program entry point: prints the page count of every PDF named on the
// command line. A failure on one file is reported and processing continues
// with the next. Returns 1 when no file name was given, 0 otherwise.
[<EntryPoint>]
let main args =
    match Array.length args with
    | 0 ->
        printfn "Uso: PdfPageCount <nome-arquivo.pdf>"
        1
    | total ->
        // Processes one file and returns the updated (successes, failures) tally.
        let countOne (ok, failed) fileName =
            try
                printfn "Número de páginas de %s: %d" fileName (getNumberOfPages fileName)
                (ok + 1, failed)
            with e ->
                printfn "%s" e.Message
                printfn "Ocorreu um erro ao processar %s. Continuando..." fileName
                (ok, failed + 1)
        let sucessos, falhas = Array.fold countOne (0, 0) args
        // The summary is only printed when more than one file was given.
        if total > 1 then
            printfn "Fim do processo. %d sucesso(s), %d falha(s)." sucessos falhas
        0

Versão usada nos testes de 2010-09-19:

module Pdf.PdfPageCount

(*
Este programa mostra na tela o número de páginas de um
arquivo PDF passado na linha de comando.
Este código fonte é de domínio público (PUBLIC DOMAIN)
Desenvolvido em F# 2.0
Marcus.
*)

open System
open System.IO
open System.Collections.Generic
open System.Text.RegularExpressions

// Tells whether `ch` ends a PDF word: true for the delimiter characters
// ( ) < > [ ] { } / % and for anything Char.IsWhiteSpace accepts.
let isDelimiter ch =
    match ch with
    | '(' | ')' | '<' | '>' | '[' | ']' | '{' | '}' | '/' | '%' -> true
    | other -> Char.IsWhiteSpace(other)

// Identifies an indirect object by object number and generation number;
// used as the lookup key of the cross-reference table.
type PdfReference = {objNum: int64; genNum: int64}

// A lexical token: `t` is the token type and `value` its text (null for
// tokens that carry no text).
// Storing the token type as a string is not type-safe at all...
// It is a trait inherited from the original Python 3 implementation.
type PdfToken = {t: string; value: string}

// The parsed representation of a PDF object as produced by parseObject.
type PdfObject =
    | PdfNumber of float // To be more accurate, int and float should be kept as distinct cases
    | PdfString of string
    | PdfHxString of string
    | PdfDict of Dictionary<string, PdfObject>
    | PdfArray of ResizeArray<PdfObject>
    | PdfReferenceObj of PdfReference
    | PdfTokenObj of PdfToken
    | PdfStream of Dictionary<string, PdfObject> * byte array

// Advances `aPdfFile` past consecutive white-space bytes (as judged by
// Char.IsWhiteSpace). On return the stream is positioned on the first
// non-white-space byte, or at end of file when only white space remained.
let rec skipSpaces (aPdfFile: FileStream) =
    let by = aPdfFile.ReadByte()
    if by = -1 then
        // End of file: ReadByte consumed nothing, so there is nothing to un-read.
        // Seeking back here would re-expose the last white-space byte and make
        // callers that loop on white space spin forever.
        ()
    elif Char.IsWhiteSpace(char by) then
        skipSpaces aPdfFile
    else
        // Not white space: step back so the caller sees this byte again.
        aPdfFile.Seek(-1L, SeekOrigin.Current) |> ignore

// Checks that the token type actually found (`tok1`) is the expected one
// (`tok2`); raises a failure describing both when they differ.
let expect tok1 tok2 =
    match tok1 = tok2 with
    | true -> ()
    | false -> failwith("Encontrou " + tok1 + " quando esperava " + tok2)

// Parses the number that starts at index `pos` of `str` and returns it as
// int64. The number extends over the longest run of digits, '.', '+' and
// '-' and may reach the very end of the string.
let readNumber64 (str: string) (pos: int) :int64 =
    let isNumberChar c = c = '.' || c = '+' || c = '-' || Char.IsDigit(c)
    let mutable endPos = pos
    // The bounds check comes first so that a number ending the string does
    // not index past its last character.
    while endPos < str.Length && isNumberChar (str.[endPos]) do
        endPos <- endPos + 1
    int64 str.[pos .. endPos-1]

// Tokenizer over a PDF file stream.
// NextToken returns the next lexical token, taking pushed-back tokens first;
// the Read*/Skip* members consume raw bytes directly from the stream.
// Every read treats end of file (-1 from ReadByte) explicitly: nothing is
// un-read when nothing was consumed, and no loop waits for a terminator
// that can never arrive.
type PdfTokenReader(aPdfFile: FileStream) = class
    // Tokens pushed back by UnreadToken; the most recently pushed comes out first.
    let mutable tokenBuffer = []
    let mutable xref: PdfXref = null

    // Steps back over the byte just returned by ReadByte. At end of file (-1)
    // nothing was consumed, so the position is left untouched.
    let unread (by: int) =
        if by <> -1 then
            aPdfFile.Seek(-1L, SeekOrigin.Current) |> ignore

    // True for bytes that may be part of a numeric token: digits, '.', '+', '-'.
    let isNumberByte (by: int) =
        by <> -1 && (Char.IsDigit(char by) || by = int '.' || by = int '+' || by = int '-')

    // The underlying file stream.
    member self.PdfFile = aPdfFile
    // Cross-reference table associated with this reader (null until assigned).
    member self.Xref with get() = xref and set(value) = xref <- value
    // Pushes a token back so that the next NextToken call returns it.
    member self.UnreadToken token = tokenBuffer <- token :: tokenBuffer

    // Returns the next token. White space and '%' comments before it are skipped.
    // Fails when the end of the file is reached or an unknown word is found.
    member self.NextToken() :PdfToken =
        match tokenBuffer with
        | h :: t ->
            tokenBuffer <- t
            h
        | [] ->
            // Skip any run of white space and comments preceding the token.
            let mutable skipping = true
            while skipping do
                let by = aPdfFile.ReadByte()
                if by = int '%' then
                    self.SkipComment()
                elif by = -1 || not (Char.IsWhiteSpace(char by)) then
                    unread by
                    skipping <- false
            // Reads a bare word and accepts it only if it is a known keyword.
            let trataWord() =
                let word = self.ReadWord()
                match word with
                | "obj" | "endobj" | "stream" | "endstream" | "true" | "false" | "null" | "xref" | "trailer" ->
                    {t = word; value = null}
                | _ ->
                    failwith(sprintf "Erro no PDF no byte %d, lendo %s" aPdfFile.Position word)
            let by = aPdfFile.ReadByte()
            if by = -1 then
                failwith(sprintf "Fim de arquivo inesperado no byte %d" aPdfFile.Position)
            elif isNumberByte by then
                unread by
                {t = "NUMBER"; value = self.ReadNumber()}
            elif by = int '<' then
                let by2 = aPdfFile.ReadByte()
                if by2 = int '<' then
                    {t = "<<"; value = null}
                else
                    unread by2
                    {t = "HX_STRING"; value = self.ReadHxString()}
            elif by = int '>' then
                let by2 = aPdfFile.ReadByte()
                if by2 = int '>' then
                    {t = ">>"; value = null}
                else
                    unread by2
                    {t = ">"; value = null} // Should not happen: ReadHxString consumes the closing '>'
            elif by = int '(' then {t = "STRING"; value = self.ReadString()}
            elif by = int '/' then {t = "NAME"; value = self.ReadName()}
            elif by = int '[' then {t = "["; value = null}
            elif by = int ']' then {t = "]"; value = null}
            elif by = int '{' then {t = "CODE"; value = self.ReadCode()}
            elif by = int 'R' then
                // A lone 'R' (followed by a delimiter or end of file) is the
                // reference operator; otherwise it starts an ordinary word.
                let by2 = aPdfFile.ReadByte()
                unread by2
                if by2 = -1 || isDelimiter (char by2) then
                    {t = "R"; value = null}
                else
                    unread by // step back over the 'R' so the whole word is read
                    trataWord()
            else
                unread by
                trataWord()

    // Reads bytes up to (not including) the next delimiter or end of file.
    member self.ReadWord() =
        let result = new ResizeArray<char>()
        let mutable by = aPdfFile.ReadByte()
        while by <> -1 && not(isDelimiter(char by)) do
            result.Add(char by)
            by <- aPdfFile.ReadByte()
        unread by
        String.Join("", result)

    // Reads a run of digits, '.', '+' and '-' and returns it as text.
    member self.ReadNumber() :string =
        let result = new ResizeArray<char>()
        let mutable by = aPdfFile.ReadByte()
        while isNumberByte by do
            result.Add(char by)
            by <- aPdfFile.ReadByte()
        unread by
        String.Join("", result)

    // Skips the remainder of a comment: everything up to and including the
    // end-of-line marker (LF, CR or CR LF), or up to end of file.
    member self.SkipComment() =
        let mutable finished = false
        while not finished do
            let by = aPdfFile.ReadByte()
            if by = -1 || by = int '\n' then
                finished <- true
            elif by = int '\r' then
                // Treat CR LF as a single end-of-line marker.
                let next = aPdfFile.ReadByte()
                if next <> int '\n' then unread next
                finished <- true

    // Reads a brace-delimited block whose opening '{' was already consumed,
    // honouring nested braces. Stops at the matching '}' or at end of file.
    member self.ReadCode() =
        let mutable parLevel = 0
        let result = new ResizeArray<char>()
        let mutable finished = false
        while not finished do
            let by = aPdfFile.ReadByte()
            if by = -1 then finished <- true
            else
                if by = int '{' then
                    parLevel <- parLevel + 1
                    result.Add('{')
                elif by = int '}' then
                    parLevel <- parLevel - 1
                    if parLevel < 0 then
                        finished <- true
                    else
                        result.Add('}')
                else
                    result.Add(char by)
        String.Join("", result)

    // Reads a name whose leading '/' was already consumed.
    member self.ReadName() = self.ReadWord()

    // Reads a hexadecimal string whose opening '<' was already consumed;
    // consumes the closing '>' and returns the raw text between them.
    member self.ReadHxString() =
        let result = new ResizeArray<char>()
        let mutable by = aPdfFile.ReadByte()
        while by <> -1 && by <> int '>' do
            result.Add(char by)
            by <- aPdfFile.ReadByte()
        String.Join("", result)

    // Reads a literal string whose opening '(' was already consumed, honouring
    // balanced nested parentheses and backslash escapes. Stops at the matching
    // ')' or at end of file.
    member self.ReadString() =
        let mutable parLevel = 0
        let mutable escaped = false
        let result = new ResizeArray<char>()
        let mutable finished = false
        while not finished do
            let by = aPdfFile.ReadByte()
            if by = -1 then
                // Unterminated string: stop instead of appending forever.
                finished <- true
            elif escaped then
                match char by with
                | 'n' -> result.Add('\n')
                | 'r' -> result.Add('\r')
                | 't' -> result.Add('\t')
                | 'b' -> result.Add('\b')
                | 'f' -> result.Add('\f')
                | '\n' -> () // backslash + end of line: line continuation
                | '\r' ->
                    // backslash + CR or CR LF: line continuation
                    let next = aPdfFile.ReadByte()
                    if next <> int '\n' then unread next
                | d when Char.IsDigit(d) ->
                    // Octal escapes (\ddd) are not decoded yet; kept verbatim.
                    result.Add('\\')
                    result.Add(d)
                | other ->
                    // Covers \\ \( \) and unknown escapes, where the backslash
                    // is ignored and the character itself is kept.
                    result.Add(other)
                escaped <- false
            elif by = int '\\' then
                escaped <- true
            elif by = int '(' then
                parLevel <- parLevel + 1
                result.Add('(')
            elif by = int ')' then
                parLevel <- parLevel - 1
                if parLevel < 0 then
                    finished <- true
                else
                    result.Add(')')
            else
                result.Add(char by)
        String.Join("", result)
end
// One entry of the cross-reference table, as built by PdfXref.AddObj.
and PdfXrefItem = {
    // Entry type character copied from the xref line ('n' entries carry a byte offset).
    type_: char
    // Object number.
    id: int64
    // Byte offset of the object in the file (0 for entries whose type is not 'n').
    bytePos: int64
    // Generation number.
    genNr: int64
}
// Cross-reference table: maps (object number, generation number) to the
// entry describing where the object lives in the file.
and [<AllowNullLiteral>]PdfXref() = class
    let objs = new Dictionary<PdfReference, PdfXrefItem>()

    // Reads exactly `count` bytes from `file`. Stream.Read is allowed to
    // return fewer bytes than requested, so it is called until the buffer
    // is full or the stream ends.
    let readExactly (file: FileStream) (count: int) =
        let buffer : byte[] = Array.zeroCreate count
        let mutable total = 0
        let mutable atEnd = false
        while total < count && not atEnd do
            let n = file.Read(buffer, total, count - total)
            if n <= 0 then atEnd <- true else total <- total + n
        if total <> count then
            failwith "Não conseguiu ler o xref"
        buffer

    // Parses a cross-reference section starting at the "xref" keyword and
    // registers all of its subsections. On return the stream is positioned
    // on the first non-digit token after the table (normally "trailer").
    member self.ParseXref(tokenReader: PdfTokenReader) =
        let token = tokenReader.NextToken()
        expect token.t "xref"
        let file = tokenReader.PdfFile
        let mutable moreSubsections = true
        while moreSubsections do
            // Subsection header: first object number and number of entries.
            let token = tokenReader.NextToken()
            expect token.t "NUMBER"
            let startId = int token.value

            let token = tokenReader.NextToken()
            expect token.t "NUMBER"
            let nrOfLines = int token.value
            if nrOfLines < 0 then
                failwith "Não conseguiu ler o xref"

            // Each entry occupies exactly 20 bytes.
            skipSpaces file
            let bytesXref = readExactly file (20 * nrOfLines)
            self.AddBlock(startId, nrOfLines, bytesXref)
            skipSpaces file
            // Peek: a digit means another subsection follows.
            let by = file.ReadByte()
            if by <> -1 then
                file.Seek(-1L, SeekOrigin.Current) |> ignore
            moreSubsections <- by <> -1 && Char.IsDigit(char by)

    // Registers `nrOfLines` consecutive 20-byte entries, numbered from `startId`.
    member self.AddBlock(startId: int, nrOfLines: int, bytesXref: byte[]) =
        for i=0 to nrOfLines-1 do
            self.AddObj(int64(i) + int64(startId), bytesXref.[20*i .. (20*(i+1) - 1)])

    // Registers one 20-byte entry: offset starting at column 0, generation
    // starting at column 11, type character in column 17. An entry already
    // present for the same (object, generation) key is kept, so tables
    // parsed first take precedence.
    member self.AddObj(id: int64, bytesLine: byte[]) =
        let strLine = new String(Array.map char bytesLine)
        let entryType = strLine.[17]
        let xrefItem = {
            type_ = entryType
            id = id
            bytePos = if entryType = 'n' then (readNumber64 strLine 0) else 0L
            genNr = (readNumber64 strLine 11)
        }
        let key = {objNum = xrefItem.id; genNum = xrefItem.genNr}
        if not(objs.ContainsKey key) then
            objs.[key] <- xrefItem

    // Returns the entry for `reference`.
    // Raises KeyNotFoundException (naming the object) when it is not in the table.
    member self.GetObj(reference: PdfReference) =
        match objs.TryGetValue(reference) with
        | true, item -> item
        | _ ->
            raise (KeyNotFoundException(sprintf "Objeto %d %d não encontrado no xref" reference.objNum reference.genNum))
end

// Parses the next PDF object from the token stream and returns it.
// Handles numbers, indirect object definitions ("N G obj ... endobj"),
// indirect references ("N G R"), arrays, dictionaries (optionally followed by
// a stream), literal and hex strings. Any other token is returned wrapped in
// PdfTokenObj.
let rec parseObject(tokenReader: PdfTokenReader) :PdfObject =
    // Parses the body of an indirect object and consumes its "endobj".
    let parseIndirectBody() =
        let body = parseObject(tokenReader)
        let closing = tokenReader.NextToken()
        expect closing.t "endobj"
        body
    let first = tokenReader.NextToken()
    // The first token interpreted as a plain number.
    let plainNumber() = PdfNumber(float first.value)
    match first.t with
    | "NUMBER" ->
        // Could be a plain number, an indirect object, or a reference to an
        // indirect object; up to two more tokens are needed to decide.
        let second = tokenReader.NextToken()
        if second.t <> "NUMBER" then
            tokenReader.UnreadToken(second)
            plainNumber()
        else
            let third = tokenReader.NextToken()
            match third.t with
            | "obj" -> parseIndirectBody()
            | "R" -> PdfReferenceObj({objNum = int64 first.value; genNum = int64 second.value})
            | _ ->
                // Just a number followed by other things: push the look-ahead
                // tokens back, last one read first.
                tokenReader.UnreadToken(third)
                tokenReader.UnreadToken(second)
                plainNumber()
    | "[" ->
        parseArray(tokenReader)
    | "<<" ->
        let entries = parseDict(tokenReader)
        let following = tokenReader.NextToken()
        if following.t <> "stream" then
            tokenReader.UnreadToken(following)
            PdfDict(entries)
        else
            readStream(tokenReader, entries)
    | "obj" -> parseIndirectBody()
    | "STRING" -> PdfString(first.value)
    | "HX_STRING" -> PdfHxString(first.value)
    | _ -> PdfTokenObj(first)
// Parses dictionary entries up to and including the closing ">>" and returns
// them as a Dictionary. The opening "<<" must already have been consumed.
// The value of each key token is used as the dictionary key; a later
// duplicate key overwrites the earlier one.
// Fails if a ">>" shows up where a value was expected.
and parseDict(tokenReader: PdfTokenReader) =
    let entries = new Dictionary<string, PdfObject>()
    let mutable insideDict = true
    while insideDict do
        let keyToken = tokenReader.NextToken()
        if keyToken.t = ">>" then
            insideDict <- false
        else
            match parseObject tokenReader with
            | PdfTokenObj({t = ">>"; value = _}) ->
                failwith(">> inesperado em pos = " + (string tokenReader.PdfFile.Position))
            | parsed ->
                entries.[keyToken.value] <- parsed
    entries
// Parses array elements up to and including the closing "]" and returns them
// wrapped in PdfArray. The opening "[" must already have been consumed.
and parseArray(tokenReader: PdfTokenReader) =
    let items = new ResizeArray<PdfObject>()
    let mutable insideArray = true
    while insideArray do
        let next = tokenReader.NextToken()
        if next.t = "]" then
            insideArray <- false
        else
            // Not the end yet: give the token back so parseObject sees it.
            tokenReader.UnreadToken(next)
            items.Add(parseObject(tokenReader))
    PdfArray(items)
// Skips over the data of a stream object and returns a PdfStream carrying the
// stream dictionary. Must be called right after the "stream" keyword was read.
// The stream bytes are NOT loaded: the file position is simply advanced by
// /Length bytes, so the byte array of the result is null.
// Fails if /Length is missing or is not a number (directly or through an
// indirect reference), or if "endstream" does not follow the data.
and readStream(tokenReader: PdfTokenReader, dictionary: Dictionary<string, PdfObject>) =
    let pdfFile = tokenReader.PdfFile
    // Consume the end-of-line marker that follows the "stream" keyword.
    let by = pdfFile.ReadByte()
    if by = int '\r' then
        // Normally \r\n. Only swallow the second byte when it really is \n,
        // otherwise it is the first byte of the stream data and must stay.
        let next = pdfFile.ReadByte()
        if next >= 0 && next <> int '\n' then
            pdfFile.Seek(-1L, SeekOrigin.Current) |> ignore
    elif by = int '\n' then
        () // a lone \n is a complete end-of-line marker
    elif by >= 0 then
        // Not an end-of-line byte: put it back (nothing to put back at EOF).
        pdfFile.Seek(-1L, SeekOrigin.Current) |> ignore
    // Look up /Length, reporting a readable error when it is absent.
    let lengthEntry =
        match dictionary.TryGetValue("Length") with
        | true, found -> found
        | _ -> failwith("Erro: stream sem /Length. Pos = " + (string pdfFile.Position))
    // /Length may be an indirect reference; resolving it moves the file
    // position, so the position is saved and restored around the lookup.
    let lengthObj =
        match lengthEntry with
        | PdfReferenceObj(reference) ->
            let savedPos = pdfFile.Position
            let resolved = findIndirectObj tokenReader reference
            pdfFile.Seek(savedPos, SeekOrigin.Begin) |> ignore
            resolved
        | direct -> direct
    let length =
        match lengthObj with
        | PdfNumber(nr) -> int64 nr
        | _ -> failwith("Erro: /Length de stream não é um número. Pos = " + (string tokenReader.PdfFile.Position))
    // "Read" the stream bytes by jumping over them.
    pdfFile.Seek(length, SeekOrigin.Current) |> ignore
    let token = tokenReader.NextToken()
    expect token.t "endstream"
    PdfStream(dictionary, null) (* Strictly, the stream bytes should go where null is,
                                but as a simplification we only Seek past them
                                and never bring them into memory *)
// Seeks to the byte position stored in `xrefItem` and parses the object found
// there. Only entries of type 'n' are accepted; any other type fails.
and readIndirectObj (tokenReader: PdfTokenReader) xrefItem =
    if xrefItem.type_ <> 'n' then
        failwith(sprintf "Procurando objeto inválido: %A" xrefItem)
    tokenReader.PdfFile.Seek(xrefItem.bytePos, SeekOrigin.Begin) |> ignore
    parseObject(tokenReader)
// Looks up the reference `catalogNum` in the reader's xref table and parses
// the indirect object it points to.
and findIndirectObj (tokenReader: PdfTokenReader) catalogNum =
    tokenReader.Xref.GetObj(catalogNum)
    |> readIndirectObj tokenReader

// Parses a file trailer: expects the "trailer" keyword followed by "<<", then
// returns the entries of the trailer dictionary.
let parseTrailer(tokenReader: PdfTokenReader) =
    for expected in ["trailer"; "<<"] do
        let token = tokenReader.NextToken()
        expect token.t expected
    parseDict tokenReader

// Returns the number of pages of the PDF file `fileName`.
//
// Strategy:
//   1. check the "%PDF-" header;
//   2. find the last "startxref <offset> %%EOF" in the final 400 bytes;
//   3. if a classic "xref" table lives at that offset, parse it together with
//      every older table reachable through /Prev, then follow
//      trailer /Root -> catalog /Pages -> /Count;
//   4. otherwise (offset 0, or no "xref" keyword there) fall back to the /N
//      entry of the linearization dictionary in the first object of the file.
//
// Fails with a descriptive message when the file is not a PDF or when any of
// the structures above is missing or has an unexpected type.
let getNumberOfPages (fileName: string) :int =
    let bufferSize = 5
    use pdfFile = new FileStream(fileName, FileMode.Open, FileAccess.Read)
    let comecoPdf: byte array = Array.zeroCreate bufferSize
    pdfFile.Read(comecoPdf, 0, bufferSize) |> ignore
    if comecoPdf <> "%PDF-"B then
        failwith "Não é um arquivo PDF"
    // Use the length of the stream we already hold open.
    let endBufferSize = int(min (pdfFile.Length) 400L)
    pdfFile.Seek(-(int64 endBufferSize), SeekOrigin.End) |> ignore
    let startxref: byte array = Array.zeroCreate endBufferSize
    pdfFile.Read(startxref, 0, endBufferSize) |> ignore
    let strStartxref = new String(Array.map char startxref)
    // Search from the end: after incremental updates the tail may hold more
    // than one "startxref ... %%EOF", and only the last one is authoritative.
    let regExpMatch =
        Regex.Match(strStartxref, @"startxref\s+(\d+)\s+%%EOF", RegexOptions.RightToLeft)
    if not regExpMatch.Success then
        failwith "Não achou o número de páginas (startxref)"
    let tokenReader = new PdfTokenReader(pdfFile)
    // Fallback: read the page count from the linearization dictionary, which
    // is expected to be the first object of the file.
    let linearized() :int =
        pdfFile.Seek(0L, SeekOrigin.Begin) |> ignore
        let firstObj = parseObject(tokenReader)
        match firstObj with
        | PdfDict(d) when d.ContainsKey("Linearized") ->
            match d.["N"] with
            | PdfNumber(n) -> int n
            | _ -> failwith "Não achou o número de páginas (/N)"
        | _ -> failwith "Não achou o número de páginas (/Linearized)"
    let pos = int64 regExpMatch.Groups.[1].Value
    if pos = 0L then
        linearized()
    else
        pdfFile.Seek(pos, SeekOrigin.Begin) |> ignore
        let token = tokenReader.NextToken()
        if token.t = "xref" then
            tokenReader.UnreadToken(token)
            let trailers = new ResizeArray<Dictionary<string, PdfObject>>()
            let xref = new PdfXref()
            tokenReader.Xref <- xref
            // Offsets of the xref sections already parsed. Without this a
            // malformed file whose /Prev entries form a cycle would make the
            // loop below run forever.
            let visitedXrefs = new HashSet<int64>()
            visitedXrefs.Add(pos) |> ignore
            while (
                    xref.ParseXref(tokenReader)
                    let trailer = parseTrailer(tokenReader)
                    trailers.Add(trailer)
                    if trailer.ContainsKey("Prev") then
                        let prevPos =
                            match trailer.["Prev"] with
                            | PdfNumber(n) -> int64 n
                            | _ -> failwith "/Prev não é um número"
                        if visitedXrefs.Add(prevPos) then
                            // New section: go parse it too.
                            pdfFile.Seek(prevPos, SeekOrigin.Begin) |> ignore
                            true
                        else
                            // Already parsed: circular /Prev chain, stop here
                            // and work with the tables collected so far.
                            false
                    else
                        false) do ()

            // Returns the catalog object referenced by /Root of the first
            // trailer (newest first) that has such an entry.
            let rec findCatalog pos =
                if trailers.[pos].ContainsKey("Root") then
                    match trailers.[pos].["Root"] with
                    | PdfReferenceObj(catalogNum) ->
                        findIndirectObj tokenReader catalogNum
                    | _ ->
                        failwith "/Root não é uma referência"
                else
                    let newPos = pos + 1
                    if newPos >= trailers.Count then
                        failwith "Não achou o número de páginas (catalog)"
                    findCatalog newPos
            let catalog = findCatalog 0
            let pagesRef = match catalog with
                            | PdfDict(d) -> d.["Pages"]
                            | _ -> failwith "catalog não é um dict"
            match pagesRef with
            | PdfReferenceObj(reference) ->
                let pages = findIndirectObj tokenReader reference
                match pages with
                | PdfDict(d) ->
                    let count = d.["Count"]
                    match count with
                    | PdfNumber(n) -> int n
                    | _ -> failwith "Não achou o número de páginas. /Count não é um número."
                | _ -> failwith "pages não é um dict"
            | _ -> failwith "/Pages não é uma referência"
        else
            // No classic xref table at that offset: try the linearization
            // dictionary instead.
            linearized()

// Program entry point. Prints the page count of every PDF file named on the
// command line; a file that cannot be processed is reported and skipped.
// Exit code: 0 when every file was processed, 1 when no file name was given
// or when at least one file failed.
[<EntryPoint>]
let main (args: string[]) =
    (*printfn "%A" args // does not include the program name
    printfn "%A" (System.Environment.GetCommandLineArgs()) // includes the program name (with its path)
    Console.ReadKey() |> ignore
    if true then 0 else*)
    if args.Length = 0 then
        printfn "Uso: PdfPageCount <nome-arquivo.pdf>"
        1
    else
        let mutable sucessos = 0
        let mutable falhas = 0
        for fileName in args do
            try
                printfn "Número de páginas de %s: %d" fileName (getNumberOfPages fileName)
                sucessos <- sucessos + 1
            with e ->
                printfn "%s" e.Message
                printfn "Ocorreu um erro ao processar %s. Continuando..." fileName
                falhas <- falhas + 1
        if args.Length > 1 then
            printfn "Fim do processo. %d sucesso(s), %d falha(s)." sucessos falhas
        // Let callers (scripts, build steps) detect that something went wrong.
        if falhas > 0 then 1 else 0
Anúncios

3 pensamentos sobre “F#: PdfPageCount.fs”

  1. Python vs. PL/SQL vs. F# « Visions of hope

  2. Comparação de desempenho lendo PDFs « Visions of hope

  3. Comparação de desempenho (2) « Visions of hope

Deixe um comentário

Preencha os seus dados abaixo ou clique em um ícone para log in:

Logotipo do WordPress.com

Você está comentando utilizando sua conta WordPress.com. Sair / Alterar )

Imagem do Twitter

Você está comentando utilizando sua conta Twitter. Sair / Alterar )

Foto do Facebook

Você está comentando utilizando sua conta Facebook. Sair / Alterar )

Foto do Google+

Você está comentando utilizando sua conta Google+. Sair / Alterar )

Conectando a %s