Está meio ruim de ler aqui na página por causa do tamanho das letras e do tamanho das linhas. Sugiro copiar e colar para um editor com sintaxe colorida. Obs.: eu testei com vários arquivos PDF, mas é certo que há erros à espreita. Considere-se avisado. Última atualização: 2010-09-19.
create or replace package pk_pdf as
  -- Returns the page count of the PDF document held in `pdf`.
  -- Returns NULL when the blob does not start with '%PDF-' or when any error
  -- is raised while parsing (the error text is written to dbms_output).
  function number_of_pages(pdf in blob) return integer;
end pk_pdf;
/
create or replace package body pk_pdf as

  ------------------------------------------------------------------------
  -- Types
  ------------------------------------------------------------------------

  -- Indirect reference "obj_num gen_num R".
  type Pdf_Reference is record (
    obj_num number,
    gen_num number);

  -- Generic parsed object. `t` is the kind tag ('NUMBER', 'STRING',
  -- 'HX_STRING', 'BOOLEAN', 'NULL', 'REFERENCE', 'DICTIONARY', 'STREAM',
  -- 'ARRAY' or '?'). `dict` / `array_` are handles (indexes) into the
  -- reader's dict_refs / array_refs tables.
  type Pdf_Object is record (
    t      varchar2(32767),
    num    number,
    str    varchar2(32767),
    dict   pls_integer,
    array_ pls_integer,
    refer  Pdf_Reference);

  type Dictionary_T     is table of Pdf_Object index by varchar2(32767);
  type Dictionary_Ref_T is table of Dictionary_T index by pls_integer;
  type Pdf_Array        is varray(125000) of Pdf_Object;
  type Array_Ref_T      is table of Pdf_Array index by pls_integer;
  type Trailer_Array    is varray(125000) of Dictionary_T;

  -- Lexical token: `t` is the token kind, `val` its text (when it has one).
  type Pdf_Token is record (
    t   varchar2(32767),
    val varchar2(32767));
  type Pdf_Token_Array is varray(20) of Pdf_Token;

  -- One cross-reference entry; type_ is 'n' (in use) or 'f' (free).
  type Pdf_Xref_Item is record (
    type_    varchar2(1),
    id_      number,
    byte_pos number,
    gen_num  number);
  -- Keyed by '<object number> <generation number>'.
  type Pdf_Xref_Table is table of Pdf_Xref_Item index by varchar2(20);
  type pdf_xref is record (
    objs Pdf_Xref_Table);

  -- Tokenizer state: the document, the current LOB position, a LIFO buffer
  -- of pushed-back tokens, and the tables that own parsed dictionaries,
  -- arrays and the cross-reference data.
  type Pdf_Token_Reader is record (
    pdf        blob,
    position   number,
    tok_buf    Pdf_Token_Array,
    dict_refs  Dictionary_Ref_T,
    array_refs Array_Ref_T,
    xref       pdf_xref);

  ------------------------------------------------------------------------
  -- Pdf_Object "setters"
  ------------------------------------------------------------------------

  -- Turns `self` into an indirect reference.
  procedure obj_set_ref(self in out nocopy Pdf_Object, refer Pdf_Reference) is
  begin
    self.t := 'REFERENCE';
    self.refer := refer;
  end;

  -- Stores `dict` in `dict_refs` and makes `self` a DICTIONARY pointing to it.
  procedure obj_set_dict(self in out nocopy Pdf_Object, dict Dictionary_T, dict_refs in out nocopy Dictionary_Ref_T) is
    idx pls_integer;
  begin
    self.t := 'DICTIONARY';
    if dict_refs.last is null then
      idx := 1;
    else
      idx := dict_refs.last + 1;
    end if;
    dict_refs(idx) := dict;
    self.dict := idx;
  end;

  -- Stores the stream dictionary in `dict_refs` and makes `self` a STREAM
  -- pointing to it. The stream bytes themselves are not kept for now.
  procedure obj_set_stream(self in out nocopy Pdf_Object, stream blob, dict Dictionary_T, dict_refs in out nocopy Dictionary_Ref_T) is
    idx pls_integer;
  begin
    self.t := 'STREAM';
    if dict_refs.last is null then
      idx := 1;
    else
      idx := dict_refs.last + 1;
    end if;
    dict_refs(idx) := dict;
    self.dict := idx;
    -- self.stream := stream; -- not used for the time being
  end;

  -- Stores `array_` in `array_refs` and makes `self` an ARRAY pointing to it.
  procedure obj_set_array(self in out nocopy Pdf_Object, array_ Pdf_Array, array_refs in out nocopy Array_Ref_T) is
    idx pls_integer;
  begin
    self.t := 'ARRAY';
    if array_refs.last is null then
      idx := 1;
    else
      idx := array_refs.last + 1;
    end if;
    array_refs(idx) := array_;
    -- The handle belongs in array_ (it indexes array_refs, not dict_refs).
    self.array_ := idx;
  end;

  -- Builds a scalar object from a token. Token kinds that are not
  -- recognised here become a '?' object carrying the token text.
  procedure obj_set_token(self in out nocopy Pdf_Object, token Pdf_Token) is
  begin
    case token.t
      when 'NUMBER' then
        self.t := token.t;
        self.num := to_number(token.val); -- no int/float distinction
      when 'STRING' then
        self.t := token.t;
        self.str := token.val;
      when 'HX_STRING' then
        self.t := token.t;
        self.str := token.val;
      when 'true' then
        self.t := 'BOOLEAN';
        self.str := 'true';
      when 'false' then
        self.t := 'BOOLEAN';
        self.str := 'false';
      when 'null' then
        self.t := 'NULL';
      else
        self.t := '?';
        self.str := token.val;
    end case;
  end;

  procedure pdf_token_init(self in out nocopy Pdf_Token, type_ varchar2, value_ varchar2) is
  begin
    self.t := type_;
    self.val := value_;
  end;

  ------------------------------------------------------------------------
  -- Tokenizer
  ------------------------------------------------------------------------

  -- Returns the single byte stored at LOB position `pos` as a character.
  -- dbms_lob.read raises NO_DATA_FOUND when `pos` is past the end of the LOB.
  function byte_at(self in Pdf_Token_Reader, pos number) return varchar2 is
    amount integer := 1;
    byt    raw(1);
  begin
    dbms_lob.read(self.pdf, amount, pos, byt);
    return utl_raw.cast_to_varchar2(byt);
  end;

  procedure skip_comment(self in out nocopy Pdf_Token_Reader);
  procedure skip_spaces(self in out nocopy Pdf_Token_Reader);
  function read_number(self in out nocopy Pdf_Token_Reader) return varchar2;
  function read_hx_string(self in out nocopy Pdf_Token_Reader) return varchar2;
  function read_string(self in out nocopy Pdf_Token_Reader) return varchar2;
  function read_name(self in out nocopy Pdf_Token_Reader) return varchar2;
  function read_word(self in out nocopy Pdf_Token_Reader) return varchar2;
  function read_code(self in out nocopy Pdf_Token_Reader) return varchar2;
  function is_digit(ch varchar2) return boolean;
  function is_space(ch varchar2) return boolean;
  function is_delimiter(ch varchar2) return boolean;

  -- Prepares `self` to read `pdf` starting at LOB position `position`,
  -- with an empty push-back buffer.
  procedure pdf_token_reader_init(self in out nocopy Pdf_Token_Reader, pdf blob, position number) is
  begin
    self.pdf := pdf;
    self.position := position;
    self.tok_buf := Pdf_Token_Array();
  end;

  -- Returns the next token: a pushed-back one if any, otherwise the next
  -- one read from the LOB (comments and white space are skipped).
  -- Raises -20001 for an unknown keyword; NO_DATA_FOUND at end of LOB.
  function next_token(self in out nocopy Pdf_Token_Reader) return Pdf_Token is
    result Pdf_Token;
    byt    raw(1);
    ch     varchar2(1);

    -- Reads a bare word and accepts it only if it is a known keyword.
    function trata_word return Pdf_Token is
      word   varchar2(32767);
      result Pdf_Token;
    begin
      word := read_word(self);
      if word in ('obj', 'endobj', 'stream', 'endstream', 'true', 'false', 'null', 'xref', 'trailer') then
        pdf_token_init(result, word, null);
        return result;
      end if;
      -- '||' (not '+') is the string concatenation operator.
      raise_application_error(-20001, 'Erro no PDF no byte ' || self.position || ', lendo: ' || word);
    end;
  begin
    -- Pushed-back tokens are served first, last pushed first.
    if self.tok_buf.last is not null then
      result := self.tok_buf(self.tok_buf.last);
      self.tok_buf.trim;
      return result;
    end if;

    -- Skip comments and white space; on exit `ch` is the byte at self.position.
    loop
      ch := byte_at(self, self.position);
      if ch = '%' then
        skip_comment(self);
      elsif is_space(ch) then
        skip_spaces(self);
      else
        exit;
      end if;
    end loop;

    if is_digit(ch) or ch in ('.', '+', '-') then
      pdf_token_init(result, 'NUMBER', read_number(self));
      return result;
    elsif ch = '<' then
      self.position := self.position + 1;
      ch := byte_at(self, self.position);
      if ch = '<' then
        self.position := self.position + 1;
        pdf_token_init(result, '<<', null);
        return result;
      end if;
      pdf_token_init(result, 'HX_STRING', read_hx_string(self));
      return result;
    elsif ch = '>' then
      self.position := self.position + 1;
      ch := byte_at(self, self.position);
      if ch = '>' then
        self.position := self.position + 1;
        pdf_token_init(result, '>>', null);
        return result;
      end if;
      -- Should not happen, because read_hx_string consumes the '>'.
      pdf_token_init(result, '>', null);
      return result;
    elsif ch = '(' then
      self.position := self.position + 1;
      pdf_token_init(result, 'STRING', read_string(self));
      return result;
    elsif ch = '/' then
      self.position := self.position + 1;
      pdf_token_init(result, 'NAME', read_name(self));
      return result;
    elsif ch = '[' then
      self.position := self.position + 1;
      pdf_token_init(result, '[', null);
      return result;
    elsif ch = ']' then
      self.position := self.position + 1;
      pdf_token_init(result, ']', null);
      return result;
    elsif ch = '{' then
      self.position := self.position + 1;
      pdf_token_init(result, 'CODE', read_code(self));
      return result;
    elsif ch = 'R' then
      -- A lone 'R' (followed by a delimiter or the end of the LOB) is the
      -- reference operator. dbms_lob.substr is used for the look-ahead
      -- because it yields NULL instead of raising at the end of the LOB.
      byt := dbms_lob.substr(self.pdf, 1, self.position + 1);
      ch := utl_raw.cast_to_varchar2(byt);
      if ch is null or is_delimiter(ch) then
        self.position := self.position + 1;
        pdf_token_init(result, 'R', null);
        return result;
      else
        return trata_word;
      end if;
    else
      return trata_word;
    end if;
  end;

  -- Pushes `token` back so that the next call to next_token returns it.
  procedure unread_token(self in out nocopy Pdf_Token_Reader, token Pdf_Token) is
  begin
    self.tok_buf.extend;
    self.tok_buf(self.tok_buf.last) := token;
  end;

  -- Advances past a comment, including its terminating CR, LF or CR LF.
  procedure skip_comment(self in out nocopy Pdf_Token_Reader) is
    ch varchar2(1);
  begin
    loop
      ch := byte_at(self, self.position);
      self.position := self.position + 1;
      if ch = chr(13) then
        if byte_at(self, self.position) = chr(10) then
          self.position := self.position + 1;
        end if;
        return;
      elsif ch = chr(10) then
        return;
      end if;
    end loop;
  end;

  -- Advances to the first byte that is not white space.
  procedure skip_spaces(self in out nocopy Pdf_Token_Reader) is
  begin
    while is_space(byte_at(self, self.position)) loop
      self.position := self.position + 1;
    end loop;
  end;

  -- Reads a run of digits, '.', '+' and '-' and returns it as text.
  function read_number(self in out nocopy Pdf_Token_Reader) return varchar2 is
    ch     varchar2(1);
    result varchar2(32767) := '';
  begin
    loop
      ch := byte_at(self, self.position);
      exit when not is_digit(ch) and ch not in ('.', '+', '-');
      result := result || ch;
      self.position := self.position + 1;
    end loop;
    return result;
  end;

  -- Reads the number that starts at character `pos` of `str`.
  function read_number(str varchar2, pos pls_integer) return number is
    len pls_integer := 0;
  begin
    while substr(str, pos + len, 1) in ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', '+', '-') loop
      len := len + 1;
    end loop;
    return to_number(substr(str, pos, len));
  end;

  -- Reads a hexadecimal string body; the opening '<' was already consumed.
  -- Returns the text between the angle brackets and leaves the position
  -- just after the closing '>'.
  function read_hx_string(self in out nocopy Pdf_Token_Reader) return varchar2 is
    ch     varchar2(1);
    result varchar2(32767) := '';
  begin
    loop
      ch := byte_at(self, self.position);
      -- Consume every byte, including the closing '>'. Leaving the '>'
      -- behind would make "<AB>>>" tokenize as '>>' followed by a stray '>'.
      self.position := self.position + 1;
      exit when ch = '>';
      result := result || ch;
    end loop;
    return result;
  end;

  -- Reads a literal string body; the opening '(' was already consumed.
  -- Handles balanced parentheses and backslash escapes (octal escapes are
  -- kept verbatim). Leaves the position just after the closing ')'.
  function read_string(self in out nocopy Pdf_Token_Reader) return varchar2 is
    par_level pls_integer := 0;
    escaped   boolean := false;
    ch        varchar2(1);
    result    varchar2(32767);
  begin
    loop
      ch := byte_at(self, self.position);
      if escaped then
        if ch = 'n' then
          result := result || chr(10);
        elsif ch = 'r' then
          result := result || chr(13);
        elsif ch = 't' then
          result := result || chr(9);
        elsif ch = 'b' then
          result := result || chr(8);
        elsif ch = 'f' then
          result := result || chr(12);
        elsif is_digit(ch) then
          -- Octal escapes are not supported yet: kept as written.
          result := result || '\' || ch;
        elsif ch = '(' then
          result := result || '(';
        elsif ch = ')' then
          result := result || ')';
        elsif ch = '\' then
          result := result || '\';
        elsif ch = chr(10) then
          null; -- backslash + end of line: line continuation
        elsif ch = chr(13) then
          -- backslash + CR [LF]: line continuation
          if byte_at(self, self.position + 1) = chr(10) then
            self.position := self.position + 1;
          end if;
        else
          -- Unknown escape: the backslash is ignored, the character kept.
          result := result || ch;
        end if;
        escaped := false;
      elsif ch = '\' then
        escaped := true;
      elsif ch = '(' then
        par_level := par_level + 1;
        result := result || '(';
      elsif ch = ')' then
        par_level := par_level - 1;
        if par_level < 0 then
          self.position := self.position + 1;
          return result;
        end if;
      else
        result := result || ch;
      end if;
      self.position := self.position + 1;
    end loop;
  end;

  -- Reads a name; the leading '/' was already consumed.
  function read_name(self in out nocopy Pdf_Token_Reader) return varchar2 is
  begin
    return read_word(self);
  end;

  -- Reads bytes up to (not including) the next delimiter.
  function read_word(self in out nocopy Pdf_Token_Reader) return varchar2 is
    ch     varchar2(1);
    result varchar2(32767) := '';
  begin
    loop
      ch := byte_at(self, self.position);
      exit when is_delimiter(ch);
      result := result || ch;
      self.position := self.position + 1;
    end loop;
    return result;
  end;

  -- Reads a '{ ... }' block body; the opening '{' was already consumed.
  -- Returns everything up to the matching '}' (nested braces included) and
  -- leaves the position just after that closing brace.
  function read_code(self in out nocopy Pdf_Token_Reader) return varchar2 is
    ch        varchar2(1);
    par_level pls_integer := 0;
    result    varchar2(32767) := '';
  begin
    loop
      ch := byte_at(self, self.position);
      self.position := self.position + 1;
      if ch = '{' then
        par_level := par_level + 1;
      elsif ch = '}' then
        par_level := par_level - 1;
        if par_level < 0 then
          return result;
        end if;
      end if;
      result := result || ch;
    end loop;
  end;

  ------------------------------------------------------------------------
  -- Cross-reference table
  ------------------------------------------------------------------------

  procedure parse_xref(self in out nocopy pdf_xref, token_reader in out nocopy Pdf_Token_Reader);
  procedure add_block(self in out nocopy pdf_xref, start_id number, nr_of_lines number, str_block varchar2);
  procedure add_obj(self in out nocopy pdf_xref, id_ number, str_line varchar2);
  function get_obj(self in out nocopy pdf_xref, refer Pdf_Reference) return Pdf_Xref_Item;

  -- Parses one "xref" section (all of its subsections) into `self` and
  -- leaves `token_reader` positioned at the "trailer" keyword.
  -- Raises -20001 when the expected keyword or numbers are missing.
  procedure parse_xref(self in out nocopy pdf_xref, token_reader in out nocopy Pdf_Token_Reader) is
    -- dbms_lob.substr returns at most 32767 bytes from a BLOB, so the
    -- 20-byte entries are read in chunks of at most 1600 (32000 bytes).
    c_max_lines constant pls_integer := 1600;
    token       Pdf_Token;
    start_id    number;
    nr_of_lines number;
    chunk_lines number;
    str_xref    varchar2(32767);
    amount      integer;

    function get_number return number is
    begin
      token := next_token(token_reader);
      if token.t <> 'NUMBER' then
        raise_application_error(-20001, 'Não encontrado número no local esperado do PDF');
      end if;
      return to_number(token.val);
    end;
  begin
    token := next_token(token_reader);
    if token.t <> 'xref' then
      raise_application_error(-20001, 'Não encontrado xref no local esperado do PDF');
    end if;
    loop
      -- Subsection header: first object number and number of entries.
      start_id := get_number;
      nr_of_lines := get_number;
      skip_spaces(token_reader);
      while nr_of_lines > 0 loop
        chunk_lines := least(nr_of_lines, c_max_lines);
        amount := 20 * chunk_lines;
        str_xref := utl_raw.cast_to_varchar2(dbms_lob.substr(token_reader.pdf, amount, token_reader.position));
        token_reader.position := token_reader.position + amount;
        add_block(self, start_id, chunk_lines, str_xref);
        start_id := start_id + chunk_lines;
        nr_of_lines := nr_of_lines - chunk_lines;
      end loop;
      skip_spaces(token_reader);
      -- Another subsection starts with a digit; anything else ends the
      -- section and leaves the reader in place to read the trailer.
      if not is_digit(byte_at(token_reader, token_reader.position)) then
        return;
      end if;
    end loop;
  end;

  -- Registers `nr_of_lines` consecutive 20-byte entries found in
  -- `str_block`, numbering the objects from `start_id`.
  procedure add_block(self in out nocopy pdf_xref, start_id number, nr_of_lines number, str_block varchar2) is
  begin
    for i in 0..(nr_of_lines-1) loop
      add_obj(self, i + start_id, substr(str_block, 20*i + 1, 20));
    end loop;
  end;

  -- Registers a single entry ("oooooooooo ggggg t" + 2-byte end of line).
  -- An entry that is already present is kept: number_of_pages parses the
  -- most recent xref section first, so the first definition seen wins.
  procedure add_obj(self in out nocopy pdf_xref, id_ number, str_line varchar2) is
    xref_item Pdf_Xref_Item;
  begin
    xref_item.type_ := substr(str_line, 18, 1);
    if xref_item.type_ = 'n' then
      xref_item.byte_pos := read_number(str_line, 1);
    end if;
    xref_item.gen_num := read_number(str_line, 12);
    xref_item.id_ := id_;
    if not self.objs.exists(id_ || ' ' || xref_item.gen_num) then
      self.objs(id_ || ' ' || xref_item.gen_num) := xref_item;
    end if;
  end;

  -- Returns the entry for `refer`; raises NO_DATA_FOUND when it is unknown.
  function get_obj(self in out nocopy pdf_xref, refer Pdf_Reference) return Pdf_Xref_Item is
  begin
    return self.objs(refer.obj_num || ' ' || refer.gen_num);
  end;

  ------------------------------------------------------------------------
  -- Character classes
  ------------------------------------------------------------------------

  function is_digit(ch varchar2) return boolean is
  begin
    return ch in ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9');
  end;

  function is_space(ch varchar2) return boolean is
  begin
    return ch in (chr(0), chr(9), chr(10), chr(12), chr(13), ' '); -- NUL, \t, \n, \f, \r, ' '
  end;

  function is_delimiter(ch varchar2) return boolean is
  begin
    return is_space(ch) or ch in ('(', ')', '<', '>', '[', ']', '{', '}', '/', '%');
  end;

  ------------------------------------------------------------------------
  -- Object parser
  ------------------------------------------------------------------------

  -- Raises -20001 when token kind `t1` differs from the expected `t2`.
  procedure expect(t1 varchar2, t2 varchar2) is
  begin
    if t1 <> t2 then
      raise_application_error(-20001, 'Não encontrado token no local esperado do PDF (' || t1 || '<>' || t2 || ')');
    end if;
  end;

  function parse_object(token_reader in out nocopy Pdf_Token_Reader) return Pdf_Object;

  -- Parses array elements up to the closing ']' (the '[' was consumed).
  function parse_array(token_reader in out nocopy Pdf_Token_Reader, array_refs in out nocopy Array_Ref_T) return Pdf_Object is
    result Pdf_Object;
    token  Pdf_Token;
    array_ Pdf_Array := Pdf_Array();
  begin
    loop
      token := next_token(token_reader);
      if token.t = ']' then
        obj_set_array(result, array_, array_refs);
        return result;
      end if;
      unread_token(token_reader, token);
      array_.extend;
      array_(array_.last) := parse_object(token_reader);
    end loop;
  end;

  -- Parses "/Name value" pairs up to the closing '>>' (the '<<' was consumed).
  function parse_dict(token_reader in out nocopy Pdf_Token_Reader) return Dictionary_T is
    result Dictionary_T;
    token  Pdf_Token;
    obj    Pdf_Object;
  begin
    loop
      token := next_token(token_reader);
      if token.t = '>>' then
        return result;
      end if;
      expect(token.t, 'NAME');
      obj := parse_object(token_reader);
      if obj.t = '?' and obj.str = '>>' then
        raise_application_error(-20001, 'Token >> inesperado em pos = ' || token_reader.position);
      end if;
      result(token.val) := obj;
    end loop;
  end;

  function find_indirect_object(token_reader in out nocopy Pdf_Token_Reader, refer Pdf_Reference) return Pdf_Object;

  -- Skips over the data of a stream whose dictionary is `dict` (the
  -- "stream" keyword was consumed) and returns a STREAM object.
  -- Raises -20001 when /Length does not resolve to a number or when
  -- "endstream" is not found right after the data.
  function read_stream(token_reader in out nocopy Pdf_Token_Reader, dict Dictionary_T) return Pdf_Object is
    ch     varchar2(1);
    token  Pdf_Token;
    result Pdf_Object;
    len    Pdf_Object;
  begin
    ch := byte_at(token_reader, token_reader.position);
    if ch = chr(13) then
      token_reader.position := token_reader.position + 2; -- skip \r\n
    elsif ch = chr(10) then
      token_reader.position := token_reader.position + 1; -- skip \n
    end if;
    len := dict('Length');
    if len.t = 'REFERENCE' then
      -- Resolving the reference moves the reader; restore it afterwards.
      declare
        saved_pos number := token_reader.position;
      begin
        len := find_indirect_object(token_reader, len.refer);
        token_reader.position := saved_pos;
      end;
    end if;
    if len.t <> 'NUMBER' then
      raise_application_error(-20001, 'Erro: /Length de stream não é um número. Pos = ' || token_reader.position);
    end if;
    -- The stream is only skipped, since nothing here needs its content.
    -- If that changes, it could be stored in a blob inside Pdf_Object.
    token_reader.position := token_reader.position + len.num;
    token := next_token(token_reader);
    expect(token.t, 'endstream');
    obj_set_stream(result, null, dict, token_reader.dict_refs);
    return result;
  end;

  function new_pdf_reference(obj_num number, gen_num number) return Pdf_Reference is
    result Pdf_Reference;
  begin
    result.obj_num := obj_num;
    result.gen_num := gen_num;
    return result;
  end;

  -- Parses the next object at the reader's position: a scalar, a
  -- reference, an array, a dictionary, a stream, or a whole indirect
  -- object ("n g obj ... endobj"), in which case its content is returned.
  function parse_object(token_reader in out nocopy Pdf_Token_Reader) return Pdf_Object is
    token1 Pdf_Token;
    token2 Pdf_Token;
    token3 Pdf_Token;
    token4 Pdf_Token;
    dict   Dictionary_T;
    result Pdf_Object;

    -- Parses the content of an indirect object and its closing "endobj".
    function obj return Pdf_Object is
      token  Pdf_Token;
      result Pdf_Object;
    begin
      result := parse_object(token_reader);
      token := next_token(token_reader);
      expect(token.t, 'endobj');
      return result;
    end;
  begin
    token1 := next_token(token_reader);
    if token1.t = 'NUMBER' then
      -- May be a plain number, an indirect object, or a reference to one.
      token2 := next_token(token_reader);
      if token2.t = 'NUMBER' then
        token3 := next_token(token_reader);
        if token3.t = 'obj' then
          return obj();
        elsif token3.t = 'R' then
          obj_set_ref(result, new_pdf_reference(obj_num=>token1.val, gen_num=>token2.val));
          return result;
        else
          -- Push back in reverse order so they are read again in order.
          unread_token(token_reader, token3);
          unread_token(token_reader, token2);
        end if;
      else
        unread_token(token_reader, token2);
      end if;
      obj_set_token(result, token1);
      return result;
    elsif token1.t = '[' then
      return parse_array(token_reader, token_reader.array_refs);
    elsif token1.t = '<<' then
      dict := parse_dict(token_reader);
      token4 := next_token(token_reader);
      if token4.t = 'stream' then
        return read_stream(token_reader, dict);
      else
        unread_token(token_reader, token4);
      end if;
      obj_set_dict(result, dict, token_reader.dict_refs);
      return result;
    elsif token1.t = 'obj' then
      return obj();
    else
      obj_set_token(result, token1);
      return result;
    end if;
  end;

  -- Parses "trailer << ... >>" and returns the trailer dictionary.
  function parse_trailer(token_reader in out nocopy Pdf_Token_Reader) return Dictionary_T is
    token Pdf_Token;
  begin
    token := next_token(token_reader);
    expect(token.t, 'trailer');
    token := next_token(token_reader);
    expect(token.t, '<<');
    return parse_dict(token_reader);
  end;

  -- Moves the reader to the object described by `xref_item` and parses it.
  -- Entries that are not in use ('n') yield NULL.
  -- NOTE(review): byte_pos is the offset exactly as written in the file and
  -- is used as a 1-based LOB position, so parsing starts one byte before
  -- the object; that byte is normally an end-of-line, which next_token
  -- skips. The same holds for the startxref and /Prev offsets. Confirm
  -- before relying on files where that byte is not white space.
  function read_indirect_obj(token_reader in out nocopy Pdf_Token_Reader, xref_item Pdf_Xref_Item) return Pdf_Object is
  begin
    if xref_item.type_ = 'n' then
      token_reader.position := xref_item.byte_pos;
      return parse_object(token_reader);
    else
      return null;
    end if;
  end;

  -- Looks `refer` up in the reader's xref table and parses that object.
  function find_indirect_object(token_reader in out nocopy Pdf_Token_Reader, refer Pdf_Reference) return Pdf_Object is
    xref_item Pdf_Xref_Item;
  begin
    xref_item := get_obj(token_reader.xref, refer);
    return read_indirect_obj(token_reader, xref_item);
  end;

  ------------------------------------------------------------------------
  -- Public API
  ------------------------------------------------------------------------

  -- Returns the page count of `pdf`.
  -- Strategy: locate "startxref" in the last 400 bytes, parse the chain of
  -- xref sections and trailers (following /Prev), then read
  -- /Root -> /Pages -> /Count. When there is no usable classic xref table,
  -- fall back to /N of the linearization dictionary in the first object.
  -- Returns NULL when the blob is not a PDF or when any error occurs; the
  -- error text is written to dbms_output.
  function number_of_pages(pdf in blob) return integer is
    amount         integer := 5;
    position       number := 1;
    comeco_pdf_raw raw(5);
    comeco_pdf     varchar2(5);
    final_pdf_raw  raw(400);
    final_pdf      varchar2(400);
    startxref_str  varchar(100);
    startxref      integer;
    token          Pdf_Token;
    token_reader   Pdf_Token_Reader;
    trailer        Dictionary_T;
    trailers       Trailer_Array;
    catalog        Pdf_Object;
    len_pdf        constant number := dbms_lob.getlength(pdf);

    -- Page count taken from the linearization dictionary (first object).
    function linearized return integer is
      first_obj Pdf_Object;
    begin
      -- (Re)initialise the reader: this may be called before any other
      -- initialisation, and next_token needs a non-null push-back buffer.
      pdf_token_reader_init(token_reader, pdf, 1);
      first_obj := parse_object(token_reader);
      if token_reader.dict_refs(first_obj.dict).exists('Linearized') then
        return token_reader.dict_refs(first_obj.dict)('N').num;
      else
        raise_application_error(-20001, 'Não achou o número de páginas (/Linearized)');
      end if;
    end;
  begin
    if pdf is null then
      return null;
    end if;

    -- Header check.
    dbms_lob.read(pdf, amount, position, comeco_pdf_raw);
    comeco_pdf := utl_raw.cast_to_varchar2(comeco_pdf_raw);
    if comeco_pdf <> '%PDF-' then
      return null;
    end if;

    -- Read the tail of the file (at most 400 bytes) to find startxref.
    amount := 400;
    if len_pdf < amount then
      amount := len_pdf;
    end if;
    position := len_pdf - (amount - 1);
    dbms_lob.read(pdf, amount, position, final_pdf_raw);
    final_pdf := utl_raw.cast_to_varchar2(final_pdf_raw);
    declare
      startxref_pattern constant varchar2(50) := '.*startxref\s+(\d+)\s+%%EOF.*$';
    begin
      -- regexp_replace returns its input unchanged when nothing matches,
      -- so only extract the offset when the pattern is really there.
      if regexp_like(final_pdf, startxref_pattern, 'n') then
        startxref_str := regexp_replace(final_pdf, startxref_pattern, '\1', 1, 1, 'n');
      else
        startxref_str := null;
      end if;
    end;
    startxref := to_number(startxref_str);

    if startxref is null or startxref = 0 then
      return linearized;
    end if;

    position := startxref;
    pdf_token_reader_init(token_reader, pdf, position);
    token := next_token(token_reader);
    if token.t <> 'xref' then
      -- No classic xref table at that offset.
      return linearized;
    end if;
    unread_token(token_reader, token);

    -- Newest section first, then follow the /Prev chain.
    trailers := Trailer_Array();
    loop
      parse_xref(token_reader.xref, token_reader);
      trailer := parse_trailer(token_reader);
      trailers.extend;
      trailers(trailers.last) := trailer;
      if trailer.exists('Prev') then
        token_reader.position := trailer('Prev').num;
      else
        exit;
      end if;
    end loop;

    -- The catalog is the /Root of the first trailer that has one.
    for i in 1 .. trailers.last loop
      if trailers(i).exists('Root') then
        declare
          catalog_num Pdf_Reference;
          root        Pdf_Object;
        begin
          root := trailers(i)('Root');
          expect(root.t, 'REFERENCE');
          catalog_num := root.refer;
          catalog := find_indirect_object(token_reader, catalog_num);
          exit;
        end;
      end if;
    end loop;

    declare
      pages     Pdf_Object;
      pages_num Pdf_Reference;
      count_    Pdf_Object;
    begin
      pages := token_reader.dict_refs(catalog.dict)('Pages');
      expect(pages.t, 'REFERENCE');
      pages_num := pages.refer;
      pages := find_indirect_object(token_reader, pages_num);
      count_ := token_reader.dict_refs(pages.dict)('Count');
      expect(count_.t, 'NUMBER');
      return count_.num;
    end;
  exception
    -- Deliberately best-effort: any failure is reported and yields NULL.
    when dbms_lob.invalid_argval then
      dbms_output.put_line('INVALID_ARGVAL - ' || SQLERRM);
      return null;
    when no_data_found then
      dbms_output.put_line('NO_DATA_FOUND - ' || SQLERRM);
      return null;
    when others then
      dbms_output.put_line('Outro erro - ' || SQLERRM);
      return null;
  end;

end pk_pdf;
/
Anúncios
Python vs. PL/SQL vs. F# « Visions of hope