pyPdf: upgrade from upstream git: 4abdca42a7d8a4

bzr revid: p_christ@hol.gr-20110106111400-hqw1nu5wx1mict4t
This commit is contained in:
P. Christeas 2011-01-06 13:14:00 +02:00
parent 660565f56e
commit 73af237c8b
6 changed files with 1136 additions and 767 deletions

View File

@ -1,3 +1,2 @@
# -*- coding: utf-8 -*-
from pdf import PdfFileReader, PdfFileWriter
__all__ = ["pdf"]

View File

@ -1,253 +1,252 @@
# -*- coding: utf-8 -*-
# vim: sw=4:expandtab:foldmethod=marker
#
# Copyright (c) 2006, Mathieu Fenniak
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# * The name of the author may not be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
"""
Implementation of stream filters for PDF.
"""
__author__ = "Mathieu Fenniak"
__author_email__ = "biziqe@mathieu.fenniak.net"
from utils import PdfReadError
try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
try:
    import zlib

    def decompress(data):
        """Return *data* inflated with ``zlib.decompress``."""
        return zlib.decompress(data)

    def compress(data):
        """Return *data* deflated with ``zlib.compress``."""
        return zlib.compress(data)
except ImportError:
    # Unable to import zlib.  Attempt to use the System.IO.Compression
    # library from the .NET framework. (IronPython only)
    # NOTE(review): DeflateStream presumably handles raw deflate data only,
    # whereas zlib.compress adds a zlib header/trailer -- confirm that both
    # code paths accept the same input.
    import System
    from System import IO, Collections, Array

    def _string_to_bytearr(buf):
        """Copy the characters of *buf* into a new .NET byte array."""
        retval = Array.CreateInstance(System.Byte, len(buf))
        for i in range(len(buf)):
            retval[i] = ord(buf[i])
        return retval

    def _bytearr_to_string(arr):
        """Convert a .NET byte array into a Python string."""
        # join() instead of repeated "+=", which is quadratic on
        # interpreters without CPython's in-place concatenation trick.
        return "".join([chr(arr[i]) for i in range(arr.Length)])

    def _read_bytes(stream):
        """Read *stream* until exhausted and return a .NET byte array."""
        ms = IO.MemoryStream()
        try:
            buf = Array.CreateInstance(System.Byte, 2048)
            while True:
                count = stream.Read(buf, 0, buf.Length)
                if count == 0:
                    break
                ms.Write(buf, 0, count)
            return ms.ToArray()
        finally:
            # release the buffer even when Read() fails
            ms.Close()

    def decompress(data):
        """Return *data* inflated through a .NET DeflateStream."""
        raw = _string_to_bytearr(data)
        ms = IO.MemoryStream()
        ms.Write(raw, 0, raw.Length)
        ms.Position = 0 # fseek 0
        gz = IO.Compression.DeflateStream(ms, IO.Compression.CompressionMode.Decompress)
        try:
            return _bytearr_to_string(_read_bytes(gz))
        finally:
            gz.Close()

    def compress(data):
        """Return *data* deflated through a .NET DeflateStream."""
        raw = _string_to_bytearr(data)
        ms = IO.MemoryStream()
        try:
            # third argument True: leave ``ms`` open when ``gz`` is closed
            gz = IO.Compression.DeflateStream(ms, IO.Compression.CompressionMode.Compress, True)
            try:
                gz.Write(raw, 0, raw.Length)
            finally:
                # closing flushes the remaining compressed data into ``ms``
                gz.Close()
            ms.Position = 0 # fseek 0
            return _bytearr_to_string(ms.ToArray())
        finally:
            ms.Close()
class FlateDecode(object):
    """Decoder/encoder for the PDF ``/FlateDecode`` stream filter."""

    def decode(data, decodeParms):
        """Inflate *data* and undo the PNG row predictor, if one is declared.

        data        -- the compressed stream contents.
        decodeParms -- mapping holding the optional ``/Predictor`` and
                       ``/Columns`` entries, or a false value for none.

        Returns the decoded data.  Raises PdfReadError for a predictor or
        PNG row filter that is not implemented here, and when the inflated
        data is not a whole number of rows.
        """
        # struct converts between byte strings and integer tuples the same
        # way for Python 2 ``str`` and Python 3 ``bytes``.
        import struct
        data = decompress(data)
        predictor = 1
        if decodeParms:
            predictor = decodeParms.get("/Predictor", 1)
        if predictor == 1:
            # predictor 1 == no predictor
            return data
        if predictor < 10 or predictor > 15:
            # unsupported predictor
            raise PdfReadError("Unsupported flatedecode predictor %r" % predictor)
        # PNG prediction; a missing /Columns entry means one column.
        columns = decodeParms.get("/Columns", 1)
        # every row is prefixed with one filter-type byte
        rowlength = columns + 1
        if len(data) % rowlength != 0:
            raise PdfReadError(
                "FlateDecode data length %d is not a multiple of the row length %d"
                % (len(data), rowlength))
        rowformat = "%dB" % rowlength
        outformat = "%dB" % columns
        decoded = []
        prev_rowdata = (0,) * rowlength
        offset = 0
        # PNG prediction can vary from row to row
        while offset < len(data):
            rowdata = list(struct.unpack(rowformat, data[offset:offset + rowlength]))
            filterByte = rowdata[0]
            if filterByte == 0:
                # None: the row is stored as is
                pass
            elif filterByte == 1:
                # Sub: add the byte to the left (the first data byte has none)
                for i in range(2, rowlength):
                    rowdata[i] = (rowdata[i] + rowdata[i - 1]) % 256
            elif filterByte == 2:
                # Up: add the byte at the same position of the previous row
                for i in range(1, rowlength):
                    rowdata[i] = (rowdata[i] + prev_rowdata[i]) % 256
            else:
                # unsupported PNG filter
                raise PdfReadError("Unsupported PNG filter %r" % filterByte)
            prev_rowdata = rowdata
            # drop the filter-type byte from the output
            decoded.append(struct.pack(outformat, *rowdata[1:]))
            offset += rowlength
        # data[:0] is an empty string of the same type as the input
        return data[:0].join(decoded)
    decode = staticmethod(decode)

    def encode(data):
        """Return *data* deflated with the module-level ``compress``."""
        return compress(data)
    encode = staticmethod(encode)
class ASCIIHexDecode(object):
    """Decoder for the PDF ``/ASCIIHexDecode`` stream filter."""

    def decode(data, decodeParms=None):
        """Decode hexadecimal text into the characters it represents.

        Whitespace is skipped and decoding stops at the ``>`` end-of-data
        marker or, if the marker is missing, at the end of *data*.  A
        trailing unpaired digit is decoded as if it were followed by ``0``.
        *decodeParms* is accepted for interface symmetry and ignored.

        Raises ValueError (from ``int``) for a non-hexadecimal character.
        """
        decoded = []
        pair = ""
        for c in data:
            if c == ">":
                # end-of-data marker: ignore whatever follows
                break
            if c.isspace():
                continue
            pair += c
            if len(pair) == 2:
                decoded.append(chr(int(pair, 16)))
                pair = ""
        if pair:
            # odd number of digits: behave as if a "0" followed the last one
            decoded.append(chr(int(pair + "0", 16)))
        return "".join(decoded)
    decode = staticmethod(decode)
class ASCII85Decode(object):
def decode(data, decodeParms=None):
retval = ""
group = []
x = 0
hitEod = False
# remove all whitespace from data
data = [y for y in data if not (y in ' \n\r\t')]
while not hitEod:
c = data[x]
if len(retval) == 0 and c == "<" and data[x+1] == "~":
x += 2
continue
#elif c.isspace():
# x += 1
# continue
elif c == 'z':
assert len(group) == 0
retval += '\x00\x00\x00\x00'
continue
elif c == "~" and data[x+1] == ">":
if len(group) != 0:
# cannot have a final group of just 1 char
assert len(group) > 1
cnt = len(group) - 1
group += [ 85, 85, 85 ]
hitEod = cnt
else:
break
else:
c = ord(c) - 33
assert c >= 0 and c < 85
group += [ c ]
if len(group) >= 5:
b = group[0] * (85**4) + \
group[1] * (85**3) + \
group[2] * (85**2) + \
group[3] * 85 + \
group[4]
assert b < (2**32 - 1)
c4 = chr((b >> 0) % 256)
c3 = chr((b >> 8) % 256)
c2 = chr((b >> 16) % 256)
c1 = chr(b >> 24)
retval += (c1 + c2 + c3 + c4)
if hitEod:
retval = retval[:-4+hitEod]
group = []
x += 1
return retval
decode = staticmethod(decode)
def decodeStreamData(stream):
    """Run the raw data of *stream* through the filters it declares.

    The filters named by the stream's ``/Filter`` entry are applied in
    order to ``stream._data`` and the result is returned.

    Raises NotImplementedError for a filter that is not handled here, and
    for a ``/Crypt`` filter whose parameters carry ``/Name`` or ``/Type``.
    """
    from generic import NameObject
    filters = stream.get("/Filter", ())
    if len(filters) and not isinstance(filters[0], NameObject):
        # we have a single filter instance
        filters = (filters,)
    data = stream._data
    for filterType in filters:
        if filterType == "/FlateDecode":
            data = FlateDecode.decode(data, stream.get("/DecodeParms"))
        elif filterType == "/ASCIIHexDecode":
            data = ASCIIHexDecode.decode(data)
        elif filterType == "/ASCII85Decode":
            data = ASCII85Decode.decode(data)
        elif filterType == "/Crypt":
            # The parameter dictionary key is "/DecodeParms" (as read for
            # /FlateDecode above); "/DecodeParams" never exists, so the
            # check below could never trigger.
            decodeParams = stream.get("/DecodeParms", {})
            if "/Name" in decodeParams or "/Type" in decodeParams:
                raise NotImplementedError("/Crypt filter with /Name or /Type not supported yet")
        else:
            # unsupported filter
            raise NotImplementedError("unsupported filter %s" % filterType)
    return data
if __name__ == "__main__":
    # Self-test, executed only when the module is run as a script.
    hexDecoded = ASCIIHexDecode.decode('61\n626\n3>')
    assert "abc" == hexDecoded

    ascii85Test = """
<~9jqo^BlbD-BleB1DJ+*+F(f,q/0JhKF<GL>Cj@.4Gp$d7F!,L7@<6@)/0JDEF<G%<+EV:2F!,
O<DJ+*.@<*K0@<6L(Df-\\0Ec5e;DffZ(EZee.Bl.9pF"AGXBPCsi+DGm>@3BB/F*&OCAfu2/AKY
i(DIb:@FD,*)+C]U=@3BN#EcYf8ATD3s@q?d$AftVqCh[NqF<G:8+EV:.+Cf>-FD5W8ARlolDIa
l(DId<j@<?3r@:F%a+D58'ATD4$Bl@l3De:,-DJs`8ARoFb/0JMK@qB4^F!,R<AKZ&-DfTqBG%G
>uD.RTpAKYo'+CT/5+Cei#DII?(E,9)oF*2M7/c~>
"""
    ascii85_originalText="Man is distinguished, not only by his reason, but by this singular passion from other animals, which is a lust of the mind, that by a perseverance of delight in the continued and indefatigable generation of knowledge, exceeds the short vehemence of any carnal pleasure."
    ascii85Decoded = ASCII85Decode.decode(ascii85Test)
    assert ascii85Decoded == ascii85_originalText
# vim: sw=4:expandtab:foldmethod=marker
#
# Copyright (c) 2006, Mathieu Fenniak
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# * The name of the author may not be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
"""
Implementation of stream filters for PDF.
"""
__author__ = "Mathieu Fenniak"
__author_email__ = "biziqe@mathieu.fenniak.net"
from utils import PdfReadError
try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
try:
    import zlib

    def decompress(data):
        """Return *data* inflated with ``zlib.decompress``."""
        return zlib.decompress(data)

    def compress(data):
        """Return *data* deflated with ``zlib.compress``."""
        return zlib.compress(data)
except ImportError:
    # Unable to import zlib.  Attempt to use the System.IO.Compression
    # library from the .NET framework. (IronPython only)
    # NOTE(review): DeflateStream presumably handles raw deflate data only,
    # whereas zlib.compress adds a zlib header/trailer -- confirm that both
    # code paths accept the same input.
    import System
    from System import IO, Collections, Array

    def _string_to_bytearr(buf):
        """Copy the characters of *buf* into a new .NET byte array."""
        retval = Array.CreateInstance(System.Byte, len(buf))
        for i in range(len(buf)):
            retval[i] = ord(buf[i])
        return retval

    def _bytearr_to_string(arr):
        """Convert a .NET byte array into a Python string."""
        # join() instead of repeated "+=", which is quadratic on
        # interpreters without CPython's in-place concatenation trick.
        return "".join([chr(arr[i]) for i in range(arr.Length)])

    def _read_bytes(stream):
        """Read *stream* until exhausted and return a .NET byte array."""
        ms = IO.MemoryStream()
        try:
            buf = Array.CreateInstance(System.Byte, 2048)
            while True:
                count = stream.Read(buf, 0, buf.Length)
                if count == 0:
                    break
                ms.Write(buf, 0, count)
            return ms.ToArray()
        finally:
            # release the buffer even when Read() fails
            ms.Close()

    def decompress(data):
        """Return *data* inflated through a .NET DeflateStream."""
        raw = _string_to_bytearr(data)
        ms = IO.MemoryStream()
        ms.Write(raw, 0, raw.Length)
        ms.Position = 0 # fseek 0
        gz = IO.Compression.DeflateStream(ms, IO.Compression.CompressionMode.Decompress)
        try:
            return _bytearr_to_string(_read_bytes(gz))
        finally:
            gz.Close()

    def compress(data):
        """Return *data* deflated through a .NET DeflateStream."""
        raw = _string_to_bytearr(data)
        ms = IO.MemoryStream()
        try:
            # third argument True: leave ``ms`` open when ``gz`` is closed
            gz = IO.Compression.DeflateStream(ms, IO.Compression.CompressionMode.Compress, True)
            try:
                gz.Write(raw, 0, raw.Length)
            finally:
                # closing flushes the remaining compressed data into ``ms``
                gz.Close()
            ms.Position = 0 # fseek 0
            return _bytearr_to_string(ms.ToArray())
        finally:
            ms.Close()
class FlateDecode(object):
    """Decoder/encoder for the PDF ``/FlateDecode`` stream filter."""

    def decode(data, decodeParms):
        """Inflate *data* and undo the PNG row predictor, if one is declared.

        data        -- the compressed stream contents.
        decodeParms -- mapping holding the optional ``/Predictor`` and
                       ``/Columns`` entries, or a false value for none.

        Returns the decoded data.  Raises PdfReadError for a predictor or
        PNG row filter that is not implemented here, and when the inflated
        data is not a whole number of rows.
        """
        # struct converts between byte strings and integer tuples the same
        # way for Python 2 ``str`` and Python 3 ``bytes``.
        import struct
        data = decompress(data)
        predictor = 1
        if decodeParms:
            predictor = decodeParms.get("/Predictor", 1)
        if predictor == 1:
            # predictor 1 == no predictor
            return data
        if predictor < 10 or predictor > 15:
            # unsupported predictor
            raise PdfReadError("Unsupported flatedecode predictor %r" % predictor)
        # PNG prediction; a missing /Columns entry means one column.
        columns = decodeParms.get("/Columns", 1)
        # every row is prefixed with one filter-type byte
        rowlength = columns + 1
        if len(data) % rowlength != 0:
            raise PdfReadError(
                "FlateDecode data length %d is not a multiple of the row length %d"
                % (len(data), rowlength))
        rowformat = "%dB" % rowlength
        outformat = "%dB" % columns
        decoded = []
        prev_rowdata = (0,) * rowlength
        offset = 0
        # PNG prediction can vary from row to row
        while offset < len(data):
            rowdata = list(struct.unpack(rowformat, data[offset:offset + rowlength]))
            filterByte = rowdata[0]
            if filterByte == 0:
                # None: the row is stored as is
                pass
            elif filterByte == 1:
                # Sub: add the byte to the left (the first data byte has none)
                for i in range(2, rowlength):
                    rowdata[i] = (rowdata[i] + rowdata[i - 1]) % 256
            elif filterByte == 2:
                # Up: add the byte at the same position of the previous row
                for i in range(1, rowlength):
                    rowdata[i] = (rowdata[i] + prev_rowdata[i]) % 256
            else:
                # unsupported PNG filter
                raise PdfReadError("Unsupported PNG filter %r" % filterByte)
            prev_rowdata = rowdata
            # drop the filter-type byte from the output
            decoded.append(struct.pack(outformat, *rowdata[1:]))
            offset += rowlength
        # data[:0] is an empty string of the same type as the input
        return data[:0].join(decoded)
    decode = staticmethod(decode)

    def encode(data):
        """Return *data* deflated with the module-level ``compress``."""
        return compress(data)
    encode = staticmethod(encode)
class ASCIIHexDecode(object):
    """Decoder for the PDF ``/ASCIIHexDecode`` stream filter."""

    def decode(data, decodeParms=None):
        """Decode hexadecimal text into the characters it represents.

        Whitespace is skipped and decoding stops at the ``>`` end-of-data
        marker or, if the marker is missing, at the end of *data*.  A
        trailing unpaired digit is decoded as if it were followed by ``0``.
        *decodeParms* is accepted for interface symmetry and ignored.

        Raises ValueError (from ``int``) for a non-hexadecimal character.
        """
        decoded = []
        pair = ""
        for c in data:
            if c == ">":
                # end-of-data marker: ignore whatever follows
                break
            if c.isspace():
                continue
            pair += c
            if len(pair) == 2:
                decoded.append(chr(int(pair, 16)))
                pair = ""
        if pair:
            # odd number of digits: behave as if a "0" followed the last one
            decoded.append(chr(int(pair + "0", 16)))
        return "".join(decoded)
    decode = staticmethod(decode)
class ASCII85Decode(object):
def decode(data, decodeParms=None):
retval = ""
group = []
x = 0
hitEod = False
# remove all whitespace from data
data = [y for y in data if not (y in ' \n\r\t')]
while not hitEod:
c = data[x]
if len(retval) == 0 and c == "<" and data[x+1] == "~":
x += 2
continue
#elif c.isspace():
# x += 1
# continue
elif c == 'z':
assert len(group) == 0
retval += '\x00\x00\x00\x00'
continue
elif c == "~" and data[x+1] == ">":
if len(group) != 0:
# cannot have a final group of just 1 char
assert len(group) > 1
cnt = len(group) - 1
group += [ 85, 85, 85 ]
hitEod = cnt
else:
break
else:
c = ord(c) - 33
assert c >= 0 and c < 85
group += [ c ]
if len(group) >= 5:
b = group[0] * (85**4) + \
group[1] * (85**3) + \
group[2] * (85**2) + \
group[3] * 85 + \
group[4]
assert b < (2**32 - 1)
c4 = chr((b >> 0) % 256)
c3 = chr((b >> 8) % 256)
c2 = chr((b >> 16) % 256)
c1 = chr(b >> 24)
retval += (c1 + c2 + c3 + c4)
if hitEod:
retval = retval[:-4+hitEod]
group = []
x += 1
return retval
decode = staticmethod(decode)
def decodeStreamData(stream):
    """Run the raw data of *stream* through the filters it declares.

    The filters named by the stream's ``/Filter`` entry are applied in
    order to ``stream._data`` and the result is returned.

    Raises NotImplementedError for a filter that is not handled here, and
    for a ``/Crypt`` filter whose parameters carry ``/Name`` or ``/Type``.
    """
    from generic import NameObject
    filters = stream.get("/Filter", ())
    if len(filters) and not isinstance(filters[0], NameObject):
        # we have a single filter instance
        filters = (filters,)
    data = stream._data
    for filterType in filters:
        if filterType == "/FlateDecode":
            data = FlateDecode.decode(data, stream.get("/DecodeParms"))
        elif filterType == "/ASCIIHexDecode":
            data = ASCIIHexDecode.decode(data)
        elif filterType == "/ASCII85Decode":
            data = ASCII85Decode.decode(data)
        elif filterType == "/Crypt":
            # The parameter dictionary key is "/DecodeParms" (as read for
            # /FlateDecode above); "/DecodeParams" never exists, so the
            # check below could never trigger.
            decodeParams = stream.get("/DecodeParms", {})
            if "/Name" in decodeParams or "/Type" in decodeParams:
                raise NotImplementedError("/Crypt filter with /Name or /Type not supported yet")
        else:
            # unsupported filter
            raise NotImplementedError("unsupported filter %s" % filterType)
    return data
if __name__ == "__main__":
    # Self-test, executed only when the module is run as a script.
    hexDecoded = ASCIIHexDecode.decode('61\n626\n3>')
    assert "abc" == hexDecoded

    ascii85Test = """
<~9jqo^BlbD-BleB1DJ+*+F(f,q/0JhKF<GL>Cj@.4Gp$d7F!,L7@<6@)/0JDEF<G%<+EV:2F!,
O<DJ+*.@<*K0@<6L(Df-\\0Ec5e;DffZ(EZee.Bl.9pF"AGXBPCsi+DGm>@3BB/F*&OCAfu2/AKY
i(DIb:@FD,*)+C]U=@3BN#EcYf8ATD3s@q?d$AftVqCh[NqF<G:8+EV:.+Cf>-FD5W8ARlolDIa
l(DId<j@<?3r@:F%a+D58'ATD4$Bl@l3De:,-DJs`8ARoFb/0JMK@qB4^F!,R<AKZ&-DfTqBG%G
>uD.RTpAKYo'+CT/5+Cei#DII?(E,9)oF*2M7/c~>
"""
    ascii85_originalText="Man is distinguished, not only by his reason, but by this singular passion from other animals, which is a lust of the mind, that by a perseverance of delight in the continued and indefatigable generation of knowledge, exceeds the short vehemence of any carnal pleasure."
    ascii85Decoded = ASCII85Decode.decode(ascii85Test)
    assert ascii85Decoded == ascii85_originalText

View File

@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
# vim: sw=4:expandtab:foldmethod=marker
#
# Copyright (c) 2006, Mathieu Fenniak
@ -207,15 +206,18 @@ class FloatObject(decimal.Decimal, PdfObject):
def __new__(cls, value="0", context=None):
return decimal.Decimal.__new__(cls, str(value), context)
def __repr__(self):
return str(self)
if self == self.to_integral():
return str(self.quantize(decimal.Decimal(1)))
else:
# XXX: this adds useless extraneous zeros.
return "%.5f" % self
def writeToStream(self, stream, encryption_key):
stream.write(str(self))
stream.write(repr(self))
class NumberObject(int, PdfObject):
def __init__(self, value):
int.__init__(self)
self = value
int.__init__(value)
def writeToStream(self, stream, encryption_key):
stream.write(repr(self))
@ -301,7 +303,7 @@ def readStringFromStream(stream):
elif tok == "t":
tok = "\t"
elif tok == "b":
tok == "\b"
tok = "\b"
elif tok == "f":
tok = "\f"
elif tok == "(":
@ -311,7 +313,17 @@ def readStringFromStream(stream):
elif tok == "\\":
tok = "\\"
elif tok.isdigit():
tok += stream.read(2)
# "The number ddd may consist of one, two, or three
# octal digits; high-order overflow shall be ignored.
# Three octal digits shall be used, with leading zeros
# as needed, if the next character of the string is also
# a digit." (PDF reference 7.3.4.2, p 16)
for i in range(2):
ntok = stream.read(1)
if ntok.isdigit():
tok += ntok
else:
break
tok = chr(int(tok, base=8))
elif tok in "\n\r":
# This case is hit when a backslash followed by a line
@ -405,8 +417,7 @@ class NameObject(str, PdfObject):
delimiterCharacters = "(", ")", "<", ">", "[", "]", "{", "}", "/", "%"
def __init__(self, data):
str.__init__(self)
self = data
str.__init__(data)
def writeToStream(self, stream, encryption_key):
stream.write(self)
@ -710,6 +721,12 @@ class RectangleObject(ArrayObject):
def setUpperRight(self, value):
self[2], self[3] = [self.ensureIsNumber(x) for x in value]
def getWidth(self):
    """Return the upper-right x coordinate minus the lower-left x coordinate."""
    right = self.getUpperRight_x()
    left = self.getLowerLeft_x()
    return right - left
def getHeight(self):
    """Return the upper-right y coordinate minus the lower-left y coordinate."""
    # The height is measured along the y axis, so the lower-left *y* must be
    # subtracted (subtracting the lower-left x gave a wrong height for any
    # rectangle whose lower-left x and y differ).
    return self.getUpperRight_y() - self.getLowerLeft_y()
lowerLeft = property(getLowerLeft, setLowerLeft, None, None)
lowerRight = property(getLowerRight, setLowerRight, None, None)
upperLeft = property(getUpperLeft, setUpperLeft, None, None)

View File

@ -1,4 +1,5 @@
# -*- coding: utf-8 -*-
#
# vim: sw=4:expandtab:foldmethod=marker
#
# Copyright (c) 2006, Mathieu Fenniak
@ -39,7 +40,9 @@ It may be a solid base for future PDF file work in Python.
__author__ = "Mathieu Fenniak"
__author_email__ = "biziqe@mathieu.fenniak.net"
import math
import struct
from sys import version_info
try:
from cStringIO import StringIO
except ImportError:
@ -51,6 +54,14 @@ import warnings
from generic import *
from utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirtualList
if version_info < ( 2, 4 ):
from sets import ImmutableSet as frozenset
if version_info < ( 2, 5 ):
from md5 import md5
else:
from hashlib import md5
##
# This class supports writing PDF files out, given pages produced by another
# class (typically {@link #PdfFileReader PdfFileReader}).
@ -92,6 +103,21 @@ class PdfFileWriter(object):
raise ValueError("pdf must be self")
return self._objects[ido.idnum - 1]
##
# Common method for inserting or adding a page to this PDF file.
#
# @param page The page to add to the document. This argument should be
# an instance of {@link #PageObject PageObject}.
# @param action The function which will insert the page in the dictionary.
# Takes: page list, page to add.
def _addPage(self, page, action):
    """Register *page* with this writer and place it in the page tree.

    ``action`` is called with the ``/Kids`` list and the value returned by
    ``_addObject`` for the page, and decides where the page is put.
    """
    assert page["/Type"] == "/Page"
    page[NameObject("/Parent")] = self._pages
    pageRef = self._addObject(page)
    pageTree = self.getObject(self._pages)
    kids = pageTree["/Kids"]
    action(kids, pageRef)
    # keep the page count in step with the /Kids list
    newCount = pageTree["/Count"] + 1
    pageTree[NameObject("/Count")] = NumberObject(newCount)
##
# Adds a page to this PDF file. The page is usually acquired from a
# {@link #PdfFileReader PdfFileReader} instance.
@ -101,12 +127,64 @@ class PdfFileWriter(object):
# @param page The page to add to the document. This argument should be
# an instance of {@link #PageObject PageObject}.
def addPage(self, page):
assert page["/Type"] == "/Page"
page[NameObject("/Parent")] = self._pages
page = self._addObject(page)
self._addPage(page, list.append)
##
# Insert a page in this PDF file. The page is usually acquired from a
# {@link #PdfFileReader PdfFileReader} instance.
#
# @param page The page to add to the document. This argument should be
# an instance of {@link #PageObject PageObject}.
# @param index Position at which the page will be inserted.
def insertPage(self, page, index=0):
    """Insert *page* at position *index* (default 0) via ``_addPage``."""
    def insertAtIndex(kids, pageRef):
        # list.insert semantics decide how out-of-range indexes behave
        kids.insert(index, pageRef)
    self._addPage(page, insertAtIndex)
##
# Retrieves a page by number from this PDF file.
# @return Returns a {@link #PageObject PageObject} instance.
def getPage(self, pageNumber):
pages = self.getObject(self._pages)
pages["/Kids"].append(page)
pages[NameObject("/Count")] = NumberObject(pages["/Count"] + 1)
# XXX: crude hack
return pages["/Kids"][pageNumber].getObject()
##
# Return the number of pages.
# @return The number of pages.
def getNumPages(self):
    """Return the ``/Count`` entry of the page tree root as an ``int``."""
    pageTree = self.getObject(self._pages)
    count = pageTree[NameObject("/Count")]
    return int(count)
##
# Append a blank page to this PDF file and returns it. If no page size
# is specified, use the size of the last page; throw
# PageSizeNotDefinedError if it doesn't exist.
# @param width The width of the new page expressed in default user
# space units.
# @param height The height of the new page expressed in default user
# space units.
def addBlankPage(self, width=None, height=None):
    """Create a blank page, append it with ``addPage`` and return it.

    *width* and *height* are passed unchanged to
    ``PageObject.createBlankPage``.
    """
    blank = PageObject.createBlankPage(self, width, height)
    self.addPage(blank)
    return blank
##
# Insert a blank page to this PDF file and returns it. If no page size
# is specified, use the size of the page in the given index; throw
# PageSizeNotDefinedError if it doesn't exist.
# @param width The width of the new page expressed in default user
# space units.
# @param height The height of the new page expressed in default user
# space units.
# @param index Position to add the page.
def insertBlankPage(self, width=None, height=None, index=0):
    """Create a blank page, insert it at *index* and return it.

    When *width* or *height* is missing and a page already exists at
    *index*, both dimensions are taken from that page's media box.
    Otherwise the values are handed to ``PageObject.createBlankPage``
    unchanged.
    """
    # "and" binds tighter than "or": without the explicit grouping a missing
    # width alone satisfied the condition, and getPage(index) was called even
    # when no page exists at that index.
    sizeMissing = width is None or height is None
    if sizeMissing and (self.getNumPages() - 1) >= index:
        oldpage = self.getPage(index)
        width = oldpage.mediaBox.getWidth()
        height = oldpage.mediaBox.getHeight()
    page = PageObject.createBlankPage(self, width, height)
    self.insertPage(page, index)
    return page
##
# Encrypt this PDF file with the PDF Standard encryption handler.
@ -119,7 +197,7 @@ class PdfFileWriter(object):
# encryption. When false, 40bit encryption will be used. By default, this
# flag is on.
def encrypt(self, user_pwd, owner_pwd = None, use_128bit = True):
import md5, time, random
import time, random
if owner_pwd == None:
owner_pwd = user_pwd
if use_128bit:
@ -133,8 +211,8 @@ class PdfFileWriter(object):
# permit everything:
P = -1
O = ByteStringObject(_alg33(owner_pwd, user_pwd, rev, keylen))
ID_1 = md5.new(repr(time.time())).digest()
ID_2 = md5.new(repr(random.random())).digest()
ID_1 = md5(repr(time.time())).digest()
ID_2 = md5(repr(random.random())).digest()
self._ID = ArrayObject((ByteStringObject(ID_1), ByteStringObject(ID_2)))
if rev == 2:
U, key = _alg34(user_pwd, O, P, ID_1)
@ -160,9 +238,28 @@ class PdfFileWriter(object):
# @param stream An object to write the file to. The object must support
# the write method, and the tell method, similar to a file object.
def write(self, stream):
import struct, md5
import struct
externalReferenceMap = {}
# PDF objects sometimes have circular references to their /Page objects
# inside their object tree (for example, annotations). Those will be
# indirect references to objects that we've recreated in this PDF. To
# address this problem, PageObject's store their original object
# reference number, and we add it to the external reference map before
# we sweep for indirect references. This forces self-page-referencing
# trees to reference the correct new object location, rather than
# copying in a new copy of the page object.
for objIndex in xrange(len(self._objects)):
obj = self._objects[objIndex]
if isinstance(obj, PageObject) and obj.indirectRef != None:
data = obj.indirectRef
if not externalReferenceMap.has_key(data.pdf):
externalReferenceMap[data.pdf] = {}
if not externalReferenceMap[data.pdf].has_key(data.generation):
externalReferenceMap[data.pdf][data.generation] = {}
externalReferenceMap[data.pdf][data.generation][data.idnum] = IndirectObject(objIndex + 1, 0, self)
self.stack = []
self._sweepIndirectReferences(externalReferenceMap, self._root)
del self.stack
@ -181,7 +278,7 @@ class PdfFileWriter(object):
pack2 = struct.pack("<i", 0)[:2]
key = self._encrypt_key + pack1 + pack2
assert len(key) == (len(self._encrypt_key) + 5)
md5_hash = md5.new(key).digest()
md5_hash = md5(key).digest()
key = md5_hash[:min(16, len(self._encrypt_key) + 5)]
obj.writeToStream(stream, key)
stream.write("\nendobj\n")
@ -487,7 +584,7 @@ class PdfFileReader(object):
pages = property(lambda self: ConvertFunctionsToVirtualList(self.getNumPages, self.getPage),
None, None)
def _flatten(self, pages=None, inherit=None):
def _flatten(self, pages=None, inherit=None, indirectRef=None):
inheritablePageAttributes = (
NameObject("/Resources"), NameObject("/MediaBox"),
NameObject("/CropBox"), NameObject("/Rotate")
@ -504,14 +601,17 @@ class PdfFileReader(object):
if pages.has_key(attr):
inherit[attr] = pages[attr]
for page in pages["/Kids"]:
self._flatten(page.getObject(), inherit)
addt = {}
if isinstance(page, IndirectObject):
addt["indirectRef"] = page
self._flatten(page.getObject(), inherit, **addt)
elif t == "/Page":
for attr,value in inherit.items():
# if the page has its own value, it does not inherit the
# parent's value:
if not pages.has_key(attr):
pages[attr] = value
pageObj = PageObject(self)
pageObj = PageObject(self, indirectRef)
pageObj.update(pages)
self.flattenedPages.append(pageObj)
@ -554,12 +654,12 @@ class PdfFileReader(object):
if not hasattr(self, '_decryption_key'):
raise Exception, "file has not been decrypted"
# otherwise, decrypt here...
import struct, md5
import struct
pack1 = struct.pack("<i", indirectReference.idnum)[:3]
pack2 = struct.pack("<i", indirectReference.generation)[:2]
key = self._decryption_key + pack1 + pack2
assert len(key) == (len(self._decryption_key) + 5)
md5_hash = md5.new(key).digest()
md5_hash = md5(key).digest()
key = md5_hash[:min(16, len(self._decryption_key) + 5)]
retval = self._decryptObject(retval, key)
@ -890,11 +990,46 @@ def createRectangleAccessor(name, fallback):
##
# This class represents a single page within a PDF file. Typically this object
# will be created by accessing the {@link #PdfFileReader.getPage getPage}
# function of the {@link #PdfFileReader PdfFileReader} class.
# function of the {@link #PdfFileReader PdfFileReader} class, but it is
# also possible to create an empty page with the createBlankPage static
# method.
# @param pdf PDF file the page belongs to (optional, defaults to None).
class PageObject(DictionaryObject):
def __init__(self, pdf):
def __init__(self, pdf=None, indirectRef=None):
DictionaryObject.__init__(self)
self.pdf = pdf
# Stores the original indirect reference to this object in its source PDF
self.indirectRef = indirectRef
##
# Returns a new blank page.
# If width or height is None, try to get the page size from the
# last page of pdf. If pdf is None or contains no page, a
# PageSizeNotDefinedError is raised.
# @param pdf PDF file the page belongs to
# @param width The width of the new page expressed in default user
# space units.
# @param height The height of the new page expressed in default user
# space units.
def createBlankPage(pdf=None, width=None, height=None):
    """Build and return a new page without content.

    When *width* or *height* is None, both are taken from the media box of
    the last page of *pdf*; ``utils.PageSizeNotDefinedError`` is raised if
    *pdf* is None or has no pages.
    """
    page = PageObject(pdf)

    # Creates a new page (cf PDF Reference  7.7.3.3)
    page[NameObject('/Type')] = NameObject('/Page')
    page[NameObject('/Parent')] = NullObject()
    page[NameObject('/Resources')] = DictionaryObject()

    if width is None or height is None:
        if pdf is None or not pdf.getNumPages() > 0:
            raise utils.PageSizeNotDefinedError()
        lastpage = pdf.getPage(pdf.getNumPages() - 1)
        width = lastpage.mediaBox.getWidth()
        height = lastpage.mediaBox.getHeight()

    mediaBox = RectangleObject([0, 0, width, height])
    page[NameObject('/MediaBox')] = mediaBox
    return page
createBlankPage = staticmethod(createBlankPage)
##
# Rotates a page clockwise by increments of 90 degrees.
@ -931,7 +1066,7 @@ class PageObject(DictionaryObject):
renameRes[key] = newname
newRes[newname] = page2Res[key]
elif not newRes.has_key(key):
newRes[key] = page2Res[key]
newRes[key] = page2Res.raw_get(key)
return newRes, renameRes
_mergeResources = staticmethod(_mergeResources)
@ -957,6 +1092,26 @@ class PageObject(DictionaryObject):
return stream
_pushPopGS = staticmethod(_pushPopGS)
def _addTransformationMatrix(contents, pdf, ctm):
    """Return *contents* as a ContentStream that starts with a ``cm`` operation.

    *ctm* must unpack into exactly six values, which become the operands.
    """
    a, b, c, d, e, f = ctm
    stream = ContentStream(contents, pdf)
    operands = [FloatObject(value) for value in (a, b, c, d, e, f)]
    # put the matrix in front of every existing operation
    stream.operations.insert(0, [operands, " cm"])
    return stream
_addTransformationMatrix = staticmethod(_addTransformationMatrix)
##
# Returns the /Contents object, or None if it doesn't exist.
# /Contents is optional, as described in PDF Reference 7.7.3.3
def getContents(self):
    """Return the resolved ``/Contents`` entry, or None when there is none."""
    if not self.has_key("/Contents"):
        return None
    contents = self["/Contents"]
    return contents.getObject()
##
# Merges the content streams of two pages into one. Resource references
# (i.e. fonts) are maintained from both pages. The mediabox/cropbox/etc
@ -968,7 +1123,23 @@ class PageObject(DictionaryObject):
# @param page2 An instance of {@link #PageObject PageObject} to be merged
# into this one.
def mergePage(self, page2):
    """Merge *page2* into this page by delegating to ``_mergePage``."""
    # no transformation argument: _mergePage is called with page2 only
    self._mergePage(page2)
##
# Actually merges the content streams of two pages into one. Resource
# references (i.e. fonts) are maintained from both pages. The
# mediabox/cropbox/etc of this page are not altered. The parameter page's
# content stream will be added to the end of this page's content stream,
# meaning that it will be drawn after, or "on top" of this page.
#
# @param page2 An instance of {@link #PageObject PageObject} to be merged
# into this one.
# @param page2transformation A function which applies a transformation to
# the content stream of page2. Takes: page2
# contents stream. Must return: new contents
# stream. If omitted, the content stream will
# not be modified.
def _mergePage(self, page2, page2transformation=None):
# First we work on merging the resource dictionaries. This allows us
# to find out what symbols in the content streams we might need to
# rename.
@ -978,7 +1149,7 @@ class PageObject(DictionaryObject):
originalResources = self["/Resources"].getObject()
page2Resources = page2["/Resources"].getObject()
for res in "/ExtGState", "/Font", "/XObject", "/ColorSpace", "/Pattern", "/Shading":
for res in "/ExtGState", "/Font", "/XObject", "/ColorSpace", "/Pattern", "/Shading", "/Properties":
new, newrename = PageObject._mergeResources(originalResources, page2Resources, res)
if new:
newResources[NameObject(res)] = new
@ -993,17 +1164,191 @@ class PageObject(DictionaryObject):
newContentArray = ArrayObject()
originalContent = self["/Contents"].getObject()
newContentArray.append(PageObject._pushPopGS(originalContent, self.pdf))
originalContent = self.getContents()
if originalContent is not None:
newContentArray.append(PageObject._pushPopGS(
originalContent, self.pdf))
page2Content = page2['/Contents'].getObject()
page2Content = PageObject._contentStreamRename(page2Content, rename, self.pdf)
page2Content = PageObject._pushPopGS(page2Content, self.pdf)
newContentArray.append(page2Content)
page2Content = page2.getContents()
if page2Content is not None:
if page2transformation is not None:
page2Content = page2transformation(page2Content)
page2Content = PageObject._contentStreamRename(
page2Content, rename, self.pdf)
page2Content = PageObject._pushPopGS(page2Content, self.pdf)
newContentArray.append(page2Content)
self[NameObject('/Contents')] = ContentStream(newContentArray, self.pdf)
self[NameObject('/Resources')] = newResources
##
# This is similar to mergePage, but a transformation matrix is
# applied to the merged stream.
#
# @param page2 An instance of {@link #PageObject PageObject} to be merged.
# @param ctm A 6 elements tuple containing the operands of the
# transformation matrix
def mergeTransformedPage(self, page2, ctm):
    """Merge *page2* into this page with the matrix *ctm* applied to it.

    The work is delegated to ``_mergePage``; the callback handed to it
    passes the content stream through
    ``PageObject._addTransformationMatrix`` together with *ctm*.
    """
    def applyMatrix(page2Content):
        return PageObject._addTransformationMatrix(page2Content, page2.pdf, ctm)
    self._mergePage(page2, applyMatrix)
##
# This is similar to mergePage, but the stream to be merged is scaled
# by applying a transformation matrix.
#
# @param page2 An instance of {@link #PageObject PageObject} to be merged.
# @param factor The scaling factor
def mergeScaledPage(self, page2, factor):
    """Merge page2 into this page, scaled uniformly by factor.

    @param page2 The page whose content is merged into this one.
    @param factor The scaling factor, used on both axes.
    @return Whatever self.mergeTransformedPage returns.
    """
    # CTM to scale : [ sx 0 0 sy 0 0 ], with sx == sy == factor
    scalingCtm = [factor, 0, 0, factor, 0, 0]
    return self.mergeTransformedPage(page2, scalingCtm)
##
# This is similar to mergePage, but the stream to be merged is rotated
# by applying a transformation matrix.
#
# @param page2 An instance of {@link #PageObject PageObject} to be merged.
# @param rotation The angle of the rotation, in degrees
def mergeRotatedPage(self, page2, rotation):
    """Merge page2 into this page, rotated by the given angle.

    @param page2 The page whose content is merged into this one.
    @param rotation The rotation angle in degrees (converted to radians
                    here before the matrix is built).
    @return Whatever self.mergeTransformedPage returns.
    """
    angle = math.radians(rotation)
    cosine = math.cos(angle)
    sine = math.sin(angle)
    rotationCtm = [cosine, sine, -sine, cosine, 0, 0]
    return self.mergeTransformedPage(page2, rotationCtm)
##
# This is similar to mergePage, but the stream to be merged is translated
# by applying a transformation matrix.
#
# @param page2 An instance of {@link #PageObject PageObject} to be merged.
# @param tx The translation on X axis
# @param ty The translation on Y axis
def mergeTranslatedPage(self, page2, tx, ty):
    """Merge page2 into this page, shifted by (tx, ty).

    @param page2 The page whose content is merged into this one.
    @param tx The translation on the X axis.
    @param ty The translation on the Y axis.
    @return Whatever self.mergeTransformedPage returns.
    """
    # Identity matrix in the first four operands, offsets in the last two.
    translationCtm = [1, 0, 0, 1, tx, ty]
    return self.mergeTransformedPage(page2, translationCtm)
##
# This is similar to mergePage, but the stream to be merged is rotated
# and scaled by applying a transformation matrix.
#
# @param page2 An instance of {@link #PageObject PageObject} to be merged.
# @param rotation The angle of the rotation, in degrees
# @param scale The scaling factor
def mergeRotatedScaledPage(self, page2, rotation, scale):
    """Merge page2 into this page, rotated and then scaled.

    The rotation and scaling 3x3 matrices are multiplied (rotation on
    the left) and the first two columns of the product are passed on as
    the 6 CTM operands.

    @param page2 The page whose content is merged into this one.
    @param rotation The rotation angle in degrees.
    @param scale The scaling factor, used on both axes.
    @return Whatever self.mergeTransformedPage returns.
    """
    angle = math.radians(rotation)
    cosine = math.cos(angle)
    sine = math.sin(angle)
    rotationMatrix = [[cosine, sine, 0],
                      [-sine, cosine, 0],
                      [0, 0, 1]]
    scalingMatrix = [[scale, 0, 0],
                     [0, scale, 0],
                     [0, 0, 1]]
    product = utils.matrixMultiply(rotationMatrix, scalingMatrix)
    # Row-major walk over rows 0..2, columns 0..1 of the product.
    operands = [product[row][col] for row in range(3) for col in range(2)]
    return self.mergeTransformedPage(page2, operands)
##
# This is similar to mergePage, but the stream to be merged is translated
# and scaled by applying a transformation matrix.
#
# @param page2 An instance of {@link #PageObject PageObject} to be merged.
# @param scale The scaling factor
# @param tx The translation on X axis
# @param ty The translation on Y axis
def mergeScaledTranslatedPage(self, page2, scale, tx, ty):
    """Merge page2 into this page, scaled and then translated.

    The scaling matrix is multiplied by the translation matrix (scaling
    on the left) and the first two columns of the product are passed on
    as the 6 CTM operands.

    @param page2 The page whose content is merged into this one.
    @param scale The scaling factor, used on both axes.
    @param tx The translation on the X axis.
    @param ty The translation on the Y axis.
    @return Whatever self.mergeTransformedPage returns.
    """
    scalingMatrix = [[scale, 0, 0],
                     [0, scale, 0],
                     [0, 0, 1]]
    translationMatrix = [[1, 0, 0],
                         [0, 1, 0],
                         [tx, ty, 1]]
    product = utils.matrixMultiply(scalingMatrix, translationMatrix)
    # Row-major walk over rows 0..2, columns 0..1 of the product.
    operands = [product[row][col] for row in range(3) for col in range(2)]
    return self.mergeTransformedPage(page2, operands)
##
# This is similar to mergePage, but the stream to be merged is translated,
# rotated and scaled by applying a transformation matrix.
#
# @param page2 An instance of {@link #PageObject PageObject} to be merged.
# @param tx The translation on X axis
# @param ty The translation on Y axis
# @param rotation The angle of the rotation, in degrees
# @param scale The scaling factor
def mergeRotatedScaledTranslatedPage(self, page2, rotation, scale, tx, ty):
    """Merge page2 into this page, rotated, scaled and translated.

    The matrices are combined as (rotation x scaling) x translation and
    the first two columns of the result are passed on as the 6 CTM
    operands.

    @param page2 The page whose content is merged into this one.
    @param rotation The rotation angle in degrees.
    @param scale The scaling factor, used on both axes.
    @param tx The translation on the X axis.
    @param ty The translation on the Y axis.
    @return Whatever self.mergeTransformedPage returns.
    """
    angle = math.radians(rotation)
    cosine = math.cos(angle)
    sine = math.sin(angle)
    rotationMatrix = [[cosine, sine, 0],
                      [-sine, cosine, 0],
                      [0, 0, 1]]
    scalingMatrix = [[scale, 0, 0],
                     [0, scale, 0],
                     [0, 0, 1]]
    translationMatrix = [[1, 0, 0],
                         [0, 1, 0],
                         [tx, ty, 1]]
    product = utils.matrixMultiply(
        utils.matrixMultiply(rotationMatrix, scalingMatrix),
        translationMatrix)
    # Row-major walk over rows 0..2, columns 0..1 of the product.
    operands = [product[row][col] for row in range(3) for col in range(2)]
    return self.mergeTransformedPage(page2, operands)
##
# Applies a transformation matrix to the page.
#
# @param ctm A 6 elements tuple containing the operands of the
# transformation matrix
def addTransformation(self, ctm):
    """Apply a transformation matrix to this page's content.

    Does nothing when self.getContents() returns None. Otherwise the
    content is wrapped with the matrix, then with a graphics-state
    push/pop, and stored back under '/Contents'.

    @param ctm The 6 operands of the transformation matrix.
    """
    content = self.getContents()
    if content is None:
        return
    transformed = PageObject._addTransformationMatrix(content, self.pdf, ctm)
    self[NameObject('/Contents')] = PageObject._pushPopGS(transformed, self.pdf)
##
# Scales a page by the given factors by applying a transformation
# matrix to its content and updating the page size.
#
# @param sx The scaling factor on horizontal axis
# @param sy The scaling factor on vertical axis
def scale(self, sx, sy):
    """Scale the page content and its media box.

    First applies the scaling matrix [sx 0 0 sy 0 0] to the content,
    then replaces self.mediaBox with a RectangleObject whose corners are
    the old corners multiplied by the matching factor.

    @param sx The scaling factor on the horizontal axis.
    @param sy The scaling factor on the vertical axis.
    """
    self.addTransformation([sx, 0, 0, sy, 0, 0])
    box = self.mediaBox
    scaledCorners = [
        float(box.getLowerLeft_x()) * sx,
        float(box.getLowerLeft_y()) * sy,
        float(box.getUpperRight_x()) * sx,
        float(box.getUpperRight_y()) * sy,
    ]
    self.mediaBox = RectangleObject(scaledCorners)
##
# Scales a page by the given factor by applying a transformation
# matrix to its content and updating the page size.
#
# @param factor The scaling factor
def scaleBy(self, factor):
    """Scale the page by the same factor on both axes.

    @param factor The scaling factor handed to self.scale for x and y.
    """
    horizontal = vertical = factor
    self.scale(horizontal, vertical)
##
# Scales a page to the specified dimensions by applying a
# transformation matrix to its content and updating the page size.
#
# @param width The new width
# @param height The new height
def scaleTo(self, width, height):
    """Scale the page so that its media box gets the given dimensions.

    The horizontal and vertical factors are the requested size divided
    by the current media box extent on that axis; they are then handed
    to self.scale.

    @param width The new width.
    @param height The new height.
    """
    box = self.mediaBox
    # Convert every coordinate to float first (as scale() does) so the
    # division is a true division regardless of the coordinate type.
    currentWidth = float(box.getUpperRight_x()) - float(box.getLowerLeft_x())
    # The vertical extent must be measured against the lower-left *y*
    # coordinate; the previous code subtracted the x coordinate here.
    currentHeight = float(box.getUpperRight_y()) - float(box.getLowerLeft_y())
    self.scale(width / currentWidth, height / currentHeight)
##
# Compresses the size of this page by joining all content streams and
# applying a FlateDecode filter.
@ -1012,10 +1357,11 @@ class PageObject(DictionaryObject):
# However, it is possible that this function will perform no action if
# content stream compression becomes "automatic" for some reason.
def compressContentStreams(self):
# NOTE(review): this span sits inside a diff hunk, so two variants of the
# body are interleaved below. The first four statements look like the
# removed implementation (direct self["/Contents"] lookup) and the last
# five like its replacement (self.getContents() with a None guard) --
# confirm against the real file before relying on either.
content = self["/Contents"].getObject()
if not isinstance(content, ContentStream):
content = ContentStream(content, self.pdf)
self[NameObject("/Contents")] = content.flateEncode()
# Second variant: does nothing when getContents() returns None; otherwise
# wraps non-ContentStream content in a ContentStream and stores the
# flateEncode() result back under "/Contents".
content = self.getContents()
if content is not None:
if not isinstance(content, ContentStream):
content = ContentStream(content, self.pdf)
self[NameObject("/Contents")] = content.flateEncode()
##
# Locate all text drawing commands, in the order they are provided in the
@ -1369,8 +1715,8 @@ def _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encr
password = (password + _encryption_padding)[:32]
# 2. Initialize the MD5 hash function and pass the result of step 1 as
# input to this function.
import md5, struct
m = md5.new(password)
import struct
m = md5(password)
# 3. Pass the value of the encryption dictionary's /O entry to the MD5 hash
# function.
m.update(owner_entry)
@ -1394,7 +1740,7 @@ def _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encr
# /Length entry.
if rev >= 3:
for i in range(50):
md5_hash = md5.new(md5_hash[:keylen]).digest()
md5_hash = md5(md5_hash[:keylen]).digest()
# 9. Set the encryption key to the first n bytes of the output from the
# final MD5 hash, where n is always 5 for revision 2 but, for revision 3 or
# greater, depends on the value of the encryption dictionary's /Length
@ -1436,14 +1782,13 @@ def _alg33_1(password, rev, keylen):
password = (password + _encryption_padding)[:32]
# 2. Initialize the MD5 hash function and pass the result of step 1 as
# input to this function.
import md5
m = md5.new(password)
m = md5(password)
# 3. (Revision 3 or greater) Do the following 50 times: Take the output
# from the previous MD5 hash and pass it as input into a new MD5 hash.
md5_hash = m.digest()
if rev >= 3:
for i in range(50):
md5_hash = md5.new(md5_hash).digest()
md5_hash = md5(md5_hash).digest()
# 4. Create an RC4 encryption key using the first n bytes of the output
# from the final MD5 hash, where n is always 5 for revision 2 but, for
# revision 3 or greater, depends on the value of the encryption
@ -1473,8 +1818,7 @@ def _alg35(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encr
key = _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry)
# 2. Initialize the MD5 hash function and pass the 32-byte padding string
# shown in step 1 of Algorithm 3.2 as input to this function.
import md5
m = md5.new()
m = md5()
m.update(_encryption_padding)
# 3. Pass the first element of the file's file identifier array (the value
# of the ID entry in the document's trailer dictionary; see Table 3.13 on

View File

@ -1,111 +1,122 @@
# -*- coding: utf-8 -*-
# vim: sw=4:expandtab:foldmethod=marker
#
# Copyright (c) 2006, Mathieu Fenniak
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# * The name of the author may not be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
"""
Utility functions for PDF library.
"""
__author__ = "Mathieu Fenniak"
__author_email__ = "biziqe@mathieu.fenniak.net"
#ENABLE_PSYCO = False
#if ENABLE_PSYCO:
# try:
# import psyco
# except ImportError:
# ENABLE_PSYCO = False
#
#if not ENABLE_PSYCO:
# class psyco:
# def proxy(func):
# return func
# proxy = staticmethod(proxy)
def readUntilWhitespace(stream, maxchars=None):
txt = ""
while True:
tok = stream.read(1)
if tok.isspace() or not tok:
break
txt += tok
if len(txt) == maxchars:
break
return txt
def readNonWhitespace(stream):
tok = ' '
while tok == '\n' or tok == '\r' or tok == ' ' or tok == '\t':
tok = stream.read(1)
return tok
class ConvertFunctionsToVirtualList(object):
def __init__(self, lengthFunction, getFunction):
self.lengthFunction = lengthFunction
self.getFunction = getFunction
def __len__(self):
return self.lengthFunction()
def __getitem__(self, index):
if not isinstance(index, int):
raise TypeError, "sequence indices must be integers"
len_self = len(self)
if index < 0:
# support negative indexes
index = len_self + index
if index < 0 or index >= len_self:
raise IndexError, "sequence index out of range"
return self.getFunction(index)
def RC4_encrypt(key, plaintext):
S = [i for i in range(256)]
j = 0
for i in range(256):
j = (j + S[i] + ord(key[i % len(key)])) % 256
S[i], S[j] = S[j], S[i]
i, j = 0, 0
retval = ""
for x in range(len(plaintext)):
i = (i + 1) % 256
j = (j + S[i]) % 256
S[i], S[j] = S[j], S[i]
t = S[(S[i] + S[j]) % 256]
retval += chr(ord(plaintext[x]) ^ t)
return retval
class PdfReadError(Exception):
pass
if __name__ == "__main__":
# test RC4
out = RC4_encrypt("Key", "Plaintext")
print repr(out)
pt = RC4_encrypt("Key", out)
print repr(pt)
# vim: sw=4:expandtab:foldmethod=marker
#
# Copyright (c) 2006, Mathieu Fenniak
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# * The name of the author may not be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
"""
Utility functions for PDF library.
"""
__author__ = "Mathieu Fenniak"
__author_email__ = "biziqe@mathieu.fenniak.net"
#ENABLE_PSYCO = False
#if ENABLE_PSYCO:
# try:
# import psyco
# except ImportError:
# ENABLE_PSYCO = False
#
#if not ENABLE_PSYCO:
# class psyco:
# def proxy(func):
# return func
# proxy = staticmethod(proxy)
def readUntilWhitespace(stream, maxchars=None):
    """Read characters from stream up to the next whitespace or EOF.

    Reads one character at a time. The terminating whitespace character
    is consumed but not returned. Reading also stops as soon as the
    number of collected characters equals maxchars.

    @param stream Object with a read(n) method.
    @param maxchars Optional limit on the number of characters returned.
    @return The characters read, joined into one string.
    """
    collected = []
    while True:
        char = stream.read(1)
        if not char or char.isspace():
            break
        collected.append(char)
        if len(collected) == maxchars:
            break
    return "".join(collected)
def readNonWhitespace(stream):
    """Skip newline, carriage return, space and tab; return what follows.

    @param stream Object with a read(n) method.
    @return The first character read that is not one of the four skipped
            characters; this is whatever read(1) yields at end of input.
    """
    # A tuple (not a string) so that an empty read result ends the loop.
    skipped = ('\n', '\r', ' ', '\t')
    char = stream.read(1)
    while char in skipped:
        char = stream.read(1)
    return char
class ConvertFunctionsToVirtualList(object):
    """Read-only sequence view backed by two callables.

    lengthFunction() supplies the length and getFunction(index) supplies
    the item at a non-negative index.
    """

    def __init__(self, lengthFunction, getFunction):
        self.lengthFunction = lengthFunction
        self.getFunction = getFunction

    def __len__(self):
        return self.lengthFunction()

    def __getitem__(self, index):
        """Return the item at index; negative indexes count from the end.

        Raises TypeError for a non-int index and IndexError when the
        index falls outside the current length.
        """
        if not isinstance(index, int):
            # Call-form raise: same exception and message as before, but
            # not tied to the Python-2-only "raise X, msg" syntax.
            raise TypeError("sequence indices must be integers")
        len_self = len(self)
        if index < 0:
            # support negative indexes
            index = len_self + index
        if index < 0 or index >= len_self:
            raise IndexError("sequence index out of range")
        return self.getFunction(index)
def RC4_encrypt(key, plaintext):
    """Encrypt (or decrypt) plaintext with the RC4 stream cipher.

    Applying the function twice with the same key returns the original
    text, as the self-test at the bottom of this module shows.

    @param key Non-empty string; each character is used via ord().
    @param plaintext String to transform, one character at a time.
    @return String of the same length as plaintext.
    """
    # Key scheduling: permute S according to the key.
    S = list(range(256))
    key_length = len(key)
    j = 0
    for i in range(256):
        j = (j + S[i] + ord(key[i % key_length])) % 256
        S[i], S[j] = S[j], S[i]
    # Keystream generation, XORed onto the input. Pieces are collected in
    # a list and joined once instead of growing a string with "+=".
    i, j = 0, 0
    pieces = []
    for char in plaintext:
        i = (i + 1) % 256
        j = (j + S[i]) % 256
        S[i], S[j] = S[j], S[i]
        t = S[(S[i] + S[j]) % 256]
        pieces.append(chr(ord(char) ^ t))
    return "".join(pieces)
def matrixMultiply(a, b):
    """Return the matrix product of a and b as a list of row lists.

    Every pair of entries is converted with float() before being
    multiplied.

    @param a Left matrix, a sequence of rows.
    @param b Right matrix, a sequence of rows.
    @return New list of lists holding the product.
    """
    product = []
    for row in a:
        productRow = []
        # zip(*b) walks b column by column.
        for column in zip(*b):
            cell = 0
            for left, right in zip(row, column):
                cell = cell + float(left) * float(right)
            productRow.append(cell)
        product.append(productRow)
    return product
class PyPdfError(Exception):
    """Common base class of the exception types defined in this module."""


class PdfReadError(PyPdfError):
    """PyPdfError subclass; by its name, signals a failure while reading
    a PDF (raise sites are outside this module -- confirm there)."""


class PageSizeNotDefinedError(PyPdfError):
    """PyPdfError subclass; by its name, signals that a page size is not
    defined (raise sites are outside this module -- confirm there)."""
if __name__ == "__main__":
    # Self-test for RC4: encrypt a sample, then run the result through
    # the cipher again, which is expected to give the sample back.
    out = RC4_encrypt("Key", "Plaintext")
    # print(...) with a single argument prints the same text under the
    # Python 2 print statement and the Python 3 print function.
    print(repr(out))
    pt = RC4_encrypt("Key", out)
    print(repr(pt))

View File

@ -1,356 +1,355 @@
# -*- coding: utf-8 -*-
import re
import datetime
import decimal
from generic import PdfObject
from xml.dom import getDOMImplementation
from xml.dom.minidom import parseString
RDF_NAMESPACE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
DC_NAMESPACE = "http://purl.org/dc/elements/1.1/"
XMP_NAMESPACE = "http://ns.adobe.com/xap/1.0/"
PDF_NAMESPACE = "http://ns.adobe.com/pdf/1.3/"
XMPMM_NAMESPACE = "http://ns.adobe.com/xap/1.0/mm/"
# What is the PDFX namespace, you might ask? I might ask that too. It's
# a completely undocumented namespace used to place "custom metadata"
# properties, which are arbitrary metadata properties with no semantic or
# documented meaning. Elements in the namespace are key/value-style storage,
# where the element name is the key and the content is the value. The keys
# are transformed into valid XML identifiers by substituting an invalid
# identifier character with \u2182 followed by the unicode hex ID of the
# original character. A key like "my car" is therefore "my\u21820020car".
#
# \u2182, in case you're wondering, is the unicode character
# \u{ROMAN NUMERAL TEN THOUSAND}, a straightforward and obvious choice for
# escaping characters.
#
# Intentional users of the pdfx namespace should be shot on sight. A
# custom data schema and sensical XML elements could be used instead, as is
# suggested by Adobe's own documentation on XMP (under "Extensibility of
# Schemas").
#
# Information presented here on the /pdfx/ schema is a result of limited
# reverse engineering, and does not constitute a full specification.
PDFX_NAMESPACE = "http://ns.adobe.com/pdfx/1.3/"
iso8601 = re.compile("""
(?P<year>[0-9]{4})
(-
(?P<month>[0-9]{2})
(-
(?P<day>[0-9]+)
(T
(?P<hour>[0-9]{2}):
(?P<minute>[0-9]{2})
(:(?P<second>[0-9]{2}(.[0-9]+)?))?
(?P<tzd>Z|[-+][0-9]{2}:[0-9]{2})
)?
)?
)?
""", re.VERBOSE)
##
# An object that represents Adobe XMP metadata.
class XmpInformation(PdfObject):
def __init__(self, stream):
self.stream = stream
docRoot = parseString(self.stream.getData())
self.rdfRoot = docRoot.getElementsByTagNameNS(RDF_NAMESPACE, "RDF")[0]
self.cache = {}
def writeToStream(self, stream, encryption_key):
self.stream.writeToStream(stream, encryption_key)
def getElement(self, aboutUri, namespace, name):
for desc in self.rdfRoot.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
if desc.getAttributeNS(RDF_NAMESPACE, "about") == aboutUri:
attr = desc.getAttributeNodeNS(namespace, name)
if attr != None:
yield attr
for element in desc.getElementsByTagNameNS(namespace, name):
yield element
def getNodesInNamespace(self, aboutUri, namespace):
for desc in self.rdfRoot.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
if desc.getAttributeNS(RDF_NAMESPACE, "about") == aboutUri:
for i in range(desc.attributes.length):
attr = desc.attributes.item(i)
if attr.namespaceURI == namespace:
yield attr
for child in desc.childNodes:
if child.namespaceURI == namespace:
yield child
def _getText(self, element):
text = ""
for child in element.childNodes:
if child.nodeType == child.TEXT_NODE:
text += child.data
return text
def _converter_string(value):
return value
def _converter_date(value):
m = iso8601.match(value)
year = int(m.group("year"))
month = int(m.group("month") or "1")
day = int(m.group("day") or "1")
hour = int(m.group("hour") or "0")
minute = int(m.group("minute") or "0")
second = decimal.Decimal(m.group("second") or "0")
seconds = second.to_integral(decimal.ROUND_FLOOR)
milliseconds = (second - seconds) * 1000000
tzd = m.group("tzd") or "Z"
dt = datetime.datetime(year, month, day, hour, minute, seconds, milliseconds)
if tzd != "Z":
tzd_hours, tzd_minutes = [int(x) for x in tzd.split(":")]
tzd_hours *= -1
if tzd_hours < 0:
tzd_minutes *= -1
dt = dt + datetime.timedelta(hours=tzd_hours, minutes=tzd_minutes)
return dt
_test_converter_date = staticmethod(_converter_date)
def _getter_bag(namespace, name, converter):
def get(self):
cached = self.cache.get(namespace, {}).get(name)
if cached:
return cached
retval = []
for element in self.getElement("", namespace, name):
bags = element.getElementsByTagNameNS(RDF_NAMESPACE, "Bag")
if len(bags):
for bag in bags:
for item in bag.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
value = self._getText(item)
value = converter(value)
retval.append(value)
ns_cache = self.cache.setdefault(namespace, {})
ns_cache[name] = retval
return retval
return get
def _getter_seq(namespace, name, converter):
def get(self):
cached = self.cache.get(namespace, {}).get(name)
if cached:
return cached
retval = []
for element in self.getElement("", namespace, name):
seqs = element.getElementsByTagNameNS(RDF_NAMESPACE, "Seq")
if len(seqs):
for seq in seqs:
for item in seq.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
value = self._getText(item)
value = converter(value)
retval.append(value)
else:
value = converter(self._getText(element))
retval.append(value)
ns_cache = self.cache.setdefault(namespace, {})
ns_cache[name] = retval
return retval
return get
def _getter_langalt(namespace, name, converter):
def get(self):
cached = self.cache.get(namespace, {}).get(name)
if cached:
return cached
retval = {}
for element in self.getElement("", namespace, name):
alts = element.getElementsByTagNameNS(RDF_NAMESPACE, "Alt")
if len(alts):
for alt in alts:
for item in alt.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
value = self._getText(item)
value = converter(value)
retval[item.getAttribute("xml:lang")] = value
else:
retval["x-default"] = converter(self._getText(element))
ns_cache = self.cache.setdefault(namespace, {})
ns_cache[name] = retval
return retval
return get
def _getter_single(namespace, name, converter):
def get(self):
cached = self.cache.get(namespace, {}).get(name)
if cached:
return cached
value = None
for element in self.getElement("", namespace, name):
if element.nodeType == element.ATTRIBUTE_NODE:
value = element.nodeValue
else:
value = self._getText(element)
break
if value != None:
value = converter(value)
ns_cache = self.cache.setdefault(namespace, {})
ns_cache[name] = value
return value
return get
##
# Contributors to the resource (other than the authors). An unsorted
# array of names.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
dc_contributor = property(_getter_bag(DC_NAMESPACE, "contributor", _converter_string))
##
# Text describing the extent or scope of the resource.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
dc_coverage = property(_getter_single(DC_NAMESPACE, "coverage", _converter_string))
##
# A sorted array of names of the authors of the resource, listed in order
# of precedence.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
dc_creator = property(_getter_seq(DC_NAMESPACE, "creator", _converter_string))
##
# A sorted array of dates (datetime.datetime instances) of signifigance to
# the resource. The dates and times are in UTC.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
dc_date = property(_getter_seq(DC_NAMESPACE, "date", _converter_date))
##
# A language-keyed dictionary of textual descriptions of the content of the
# resource.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
dc_description = property(_getter_langalt(DC_NAMESPACE, "description", _converter_string))
##
# The mime-type of the resource.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
dc_format = property(_getter_single(DC_NAMESPACE, "format", _converter_string))
##
# Unique identifier of the resource.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
dc_identifier = property(_getter_single(DC_NAMESPACE, "identifier", _converter_string))
##
# An unordered array specifying the languages used in the resource.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
dc_language = property(_getter_bag(DC_NAMESPACE, "language", _converter_string))
##
# An unordered array of publisher names.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
dc_publisher = property(_getter_bag(DC_NAMESPACE, "publisher", _converter_string))
##
# An unordered array of text descriptions of relationships to other
# documents.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
dc_relation = property(_getter_bag(DC_NAMESPACE, "relation", _converter_string))
##
# A language-keyed dictionary of textual descriptions of the rights the
# user has to this resource.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
dc_rights = property(_getter_langalt(DC_NAMESPACE, "rights", _converter_string))
##
# Unique identifier of the work from which this resource was derived.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
dc_source = property(_getter_single(DC_NAMESPACE, "source", _converter_string))
##
# An unordered array of descriptive phrases or keywrods that specify the
# topic of the content of the resource.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
dc_subject = property(_getter_bag(DC_NAMESPACE, "subject", _converter_string))
##
# A language-keyed dictionary of the title of the resource.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
dc_title = property(_getter_langalt(DC_NAMESPACE, "title", _converter_string))
##
# An unordered array of textual descriptions of the document type.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
dc_type = property(_getter_bag(DC_NAMESPACE, "type", _converter_string))
##
# An unformatted text string representing document keywords.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
pdf_keywords = property(_getter_single(PDF_NAMESPACE, "Keywords", _converter_string))
##
# The PDF file version, for example 1.0, 1.3.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
pdf_pdfversion = property(_getter_single(PDF_NAMESPACE, "PDFVersion", _converter_string))
##
# The name of the tool that created the PDF document.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
pdf_producer = property(_getter_single(PDF_NAMESPACE, "Producer", _converter_string))
##
# The date and time the resource was originally created. The date and
# time are returned as a UTC datetime.datetime object.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
xmp_createDate = property(_getter_single(XMP_NAMESPACE, "CreateDate", _converter_date))
##
# The date and time the resource was last modified. The date and time
# are returned as a UTC datetime.datetime object.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
xmp_modifyDate = property(_getter_single(XMP_NAMESPACE, "ModifyDate", _converter_date))
##
# The date and time that any metadata for this resource was last
# changed. The date and time are returned as a UTC datetime.datetime
# object.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
xmp_metadataDate = property(_getter_single(XMP_NAMESPACE, "MetadataDate", _converter_date))
##
# The name of the first known tool used to create the resource.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
xmp_creatorTool = property(_getter_single(XMP_NAMESPACE, "CreatorTool", _converter_string))
##
# The common identifier for all versions and renditions of this resource.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
xmpmm_documentId = property(_getter_single(XMPMM_NAMESPACE, "DocumentID", _converter_string))
##
# An identifier for a specific incarnation of a document, updated each
# time a file is saved.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
xmpmm_instanceId = property(_getter_single(XMPMM_NAMESPACE, "InstanceID", _converter_string))
def custom_properties(self):
if not hasattr(self, "_custom_properties"):
self._custom_properties = {}
for node in self.getNodesInNamespace("", PDFX_NAMESPACE):
key = node.localName
while True:
# see documentation about PDFX_NAMESPACE earlier in file
idx = key.find(u"\u2182")
if idx == -1:
break
key = key[:idx] + chr(int(key[idx+1:idx+5], base=16)) + key[idx+5:]
if node.nodeType == node.ATTRIBUTE_NODE:
value = node.nodeValue
else:
value = self._getText(node)
self._custom_properties[key] = value
return self._custom_properties
##
# Retrieves custom metadata properties defined in the undocumented pdfx
# metadata schema.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
# @return Returns a dictionary of key/value items for custom metadata
# properties.
custom_properties = property(custom_properties)
import re
import datetime
import decimal
from generic import PdfObject
from xml.dom import getDOMImplementation
from xml.dom.minidom import parseString
RDF_NAMESPACE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
DC_NAMESPACE = "http://purl.org/dc/elements/1.1/"
XMP_NAMESPACE = "http://ns.adobe.com/xap/1.0/"
PDF_NAMESPACE = "http://ns.adobe.com/pdf/1.3/"
XMPMM_NAMESPACE = "http://ns.adobe.com/xap/1.0/mm/"
# What is the PDFX namespace, you might ask? I might ask that too. It's
# a completely undocumented namespace used to place "custom metadata"
# properties, which are arbitrary metadata properties with no semantic or
# documented meaning. Elements in the namespace are key/value-style storage,
# where the element name is the key and the content is the value. The keys
# are transformed into valid XML identifiers by substituting an invalid
# identifier character with \u2182 followed by the unicode hex ID of the
# original character. A key like "my car" is therefore "my\u21820020car".
#
# \u2182, in case you're wondering, is the unicode character
# \u{ROMAN NUMERAL TEN THOUSAND}, a straightforward and obvious choice for
# escaping characters.
#
# Intentional users of the pdfx namespace should be shot on sight. A
# custom data schema and sensical XML elements could be used instead, as is
# suggested by Adobe's own documentation on XMP (under "Extensibility of
# Schemas").
#
# Information presented here on the /pdfx/ schema is a result of limited
# reverse engineering, and does not constitute a full specification.
PDFX_NAMESPACE = "http://ns.adobe.com/pdfx/1.3/"
iso8601 = re.compile("""
(?P<year>[0-9]{4})
(-
(?P<month>[0-9]{2})
(-
(?P<day>[0-9]+)
(T
(?P<hour>[0-9]{2}):
(?P<minute>[0-9]{2})
(:(?P<second>[0-9]{2}(.[0-9]+)?))?
(?P<tzd>Z|[-+][0-9]{2}:[0-9]{2})
)?
)?
)?
""", re.VERBOSE)
##
# An object that represents Adobe XMP metadata.
class XmpInformation(PdfObject):
def __init__(self, stream):
    """Parse the XMP packet held by stream and locate its rdf:RDF root.

    @param stream Object providing getData(); its data is parsed as XML.
    """
    self.stream = stream
    document = parseString(stream.getData())
    rdfNodes = document.getElementsByTagNameNS(RDF_NAMESPACE, "RDF")
    # Only the first rdf:RDF element is kept.
    self.rdfRoot = rdfNodes[0]
    # Property values already looked up, keyed by namespace, then name.
    self.cache = {}
def writeToStream(self, stream, encryption_key):
    """Write this object by delegating to the wrapped stream object.

    @param stream Destination passed through unchanged.
    @param encryption_key Passed through unchanged.
    """
    wrapped = self.stream
    wrapped.writeToStream(stream, encryption_key)
def getElement(self, aboutUri, namespace, name):
    """Yield the nodes carrying property (namespace, name).

    Looks at every rdf:Description under the RDF root whose rdf:about
    equals aboutUri. For each one, yields the matching attribute node
    (when present) and then every matching descendant element.
    """
    descriptions = self.rdfRoot.getElementsByTagNameNS(RDF_NAMESPACE, "Description")
    for description in descriptions:
        if description.getAttributeNS(RDF_NAMESPACE, "about") != aboutUri:
            continue
        attribute = description.getAttributeNodeNS(namespace, name)
        if attribute is not None:
            yield attribute
        for match in description.getElementsByTagNameNS(namespace, name):
            yield match
def getNodesInNamespace(self, aboutUri, namespace):
    """Yield every attribute and child node belonging to namespace.

    Looks at every rdf:Description under the RDF root whose rdf:about
    equals aboutUri. For each one, yields its attributes in that
    namespace, then its direct children in that namespace.
    """
    descriptions = self.rdfRoot.getElementsByTagNameNS(RDF_NAMESPACE, "Description")
    for description in descriptions:
        if description.getAttributeNS(RDF_NAMESPACE, "about") != aboutUri:
            continue
        for position in range(description.attributes.length):
            attribute = description.attributes.item(position)
            if attribute.namespaceURI == namespace:
                yield attribute
        for node in description.childNodes:
            if node.namespaceURI == namespace:
                yield node
def _getText(self, element):
text = ""
for child in element.childNodes:
if child.nodeType == child.TEXT_NODE:
text += child.data
return text
def _converter_string(value):
return value
def _converter_date(value):
    """Convert a date string into a naive datetime expressed in UTC.

    The string is matched against the module-level iso8601 pattern.
    Missing month/day default to 1; missing hour/minute/second default
    to 0; a missing time zone designator is treated as "Z".

    @param value The date string to convert.
    @return A datetime.datetime with the zone offset already removed.
    @raise ValueError If value does not match the iso8601 pattern.
    """
    m = iso8601.match(value)
    if m is None:
        # Previously this surfaced as an AttributeError on None.
        raise ValueError("invalid date format: %r" % (value,))
    year = int(m.group("year"))
    month = int(m.group("month") or "1")
    day = int(m.group("day") or "1")
    hour = int(m.group("hour") or "0")
    minute = int(m.group("minute") or "0")
    second = decimal.Decimal(m.group("second") or "0")
    whole_seconds = second.to_integral(decimal.ROUND_FLOOR)
    # Fractional part of the seconds, expressed in microseconds.
    microseconds = (second - whole_seconds) * 1000000
    tzd = m.group("tzd") or "Z"
    # datetime expects integers, so convert the Decimal values explicitly.
    dt = datetime.datetime(year, month, day, hour, minute,
                           int(whole_seconds), int(microseconds))
    if tzd != "Z":
        # Take the sign from the designator itself rather than from the
        # parsed hour, so offsets such as "+00:30" are handled too.
        offset_hours, offset_minutes = [int(x) for x in tzd[1:].split(":")]
        offset = datetime.timedelta(hours=offset_hours, minutes=offset_minutes)
        if tzd[0] == "-":
            dt = dt + offset
        else:
            dt = dt - offset
    return dt
_test_converter_date = staticmethod(_converter_date)
def _getter_bag(namespace, name, converter):
def get(self):
cached = self.cache.get(namespace, {}).get(name)
if cached:
return cached
retval = []
for element in self.getElement("", namespace, name):
bags = element.getElementsByTagNameNS(RDF_NAMESPACE, "Bag")
if len(bags):
for bag in bags:
for item in bag.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
value = self._getText(item)
value = converter(value)
retval.append(value)
ns_cache = self.cache.setdefault(namespace, {})
ns_cache[name] = retval
return retval
return get
def _getter_seq(namespace, name, converter):
def get(self):
cached = self.cache.get(namespace, {}).get(name)
if cached:
return cached
retval = []
for element in self.getElement("", namespace, name):
seqs = element.getElementsByTagNameNS(RDF_NAMESPACE, "Seq")
if len(seqs):
for seq in seqs:
for item in seq.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
value = self._getText(item)
value = converter(value)
retval.append(value)
else:
value = converter(self._getText(element))
retval.append(value)
ns_cache = self.cache.setdefault(namespace, {})
ns_cache[name] = retval
return retval
return get
def _getter_langalt(namespace, name, converter):
    """Build a property getter for a language-alternative (rdf:Alt) value.

    The returned ``get(self)`` walks every element yielded by
    ``self.getElement("", namespace, name)``.  For each ``rdf:li`` inside an
    ``rdf:Alt`` the converted text is stored under the item's ``xml:lang``
    attribute value; an element without any ``rdf:Alt`` contributes its own
    converted text under the key ``"x-default"``.  The resulting dictionary
    is stored in ``self.cache[namespace][name]`` and reused on later calls.
    """
    def get(self):
        ns_cache = self.cache.get(namespace, {})
        # Membership test, not truthiness: an empty dict is a valid cached
        # result and must not trigger another scan of the document.
        if name in ns_cache:
            return ns_cache[name]
        retval = {}
        for element in self.getElement("", namespace, name):
            alts = element.getElementsByTagNameNS(RDF_NAMESPACE, "Alt")
            if len(alts):
                for alt in alts:
                    for item in alt.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
                        value = converter(self._getText(item))
                        retval[item.getAttribute("xml:lang")] = value
            else:
                # No rdf:Alt wrapper: file the element text as the default.
                retval["x-default"] = converter(self._getText(element))
        self.cache.setdefault(namespace, {})[name] = retval
        return retval
    return get
def _getter_single(namespace, name, converter):
    """Build a property getter for a single-valued XMP property.

    The returned ``get(self)`` looks only at the first node yielded by
    ``self.getElement("", namespace, name)``: an attribute node contributes
    its ``nodeValue``, any other node its text via ``self._getText``.  The
    value is passed through ``converter``; when no node is found the result
    is None.  The result (including None) is stored in
    ``self.cache[namespace][name]`` and reused on later calls.
    """
    def get(self):
        ns_cache = self.cache.get(namespace, {})
        # Membership test, not truthiness: None and "" are valid cached
        # results and must not trigger another scan of the document.
        if name in ns_cache:
            return ns_cache[name]
        value = None
        for element in self.getElement("", namespace, name):
            if element.nodeType == element.ATTRIBUTE_NODE:
                value = element.nodeValue
            else:
                value = self._getText(element)
            # Only the first matching node is considered.
            break
        if value is not None:
            value = converter(value)
        self.cache.setdefault(namespace, {})[name] = value
        return value
    return get
##
# Contributors to the resource (other than the authors). An unsorted
# array of names.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
# @return A list of strings; empty when no rdf:Bag items are found.
dc_contributor = property(_getter_bag(DC_NAMESPACE, "contributor", _converter_string))
##
# Text describing the extent or scope of the resource.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
# @return A string, or None when the property is absent.
dc_coverage = property(_getter_single(DC_NAMESPACE, "coverage", _converter_string))
##
# A sorted array of names of the authors of the resource, listed in order
# of precedence.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
# @return A list of strings; empty when the property is absent.
dc_creator = property(_getter_seq(DC_NAMESPACE, "creator", _converter_string))
##
# A sorted array of dates (datetime.datetime instances) of significance to
# the resource. The dates and times are in UTC.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
# @return A list of datetime.datetime instances; empty when the property
# is absent.
dc_date = property(_getter_seq(DC_NAMESPACE, "date", _converter_date))
##
# A language-keyed dictionary of textual descriptions of the content of the
# resource.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
# @return A dictionary mapping xml:lang values to strings; empty when the
# property is absent.
dc_description = property(_getter_langalt(DC_NAMESPACE, "description", _converter_string))
##
# The MIME type of the resource.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
# @return A string, or None when the property is absent.
dc_format = property(_getter_single(DC_NAMESPACE, "format", _converter_string))
##
# Unique identifier of the resource.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
# @return A string, or None when the property is absent.
dc_identifier = property(_getter_single(DC_NAMESPACE, "identifier", _converter_string))
##
# An unordered array specifying the languages used in the resource.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
# @return A list of strings; empty when no rdf:Bag items are found.
dc_language = property(_getter_bag(DC_NAMESPACE, "language", _converter_string))
##
# An unordered array of publisher names.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
# @return A list of strings; empty when no rdf:Bag items are found.
dc_publisher = property(_getter_bag(DC_NAMESPACE, "publisher", _converter_string))
##
# An unordered array of text descriptions of relationships to other
# documents.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
# @return A list of strings; empty when no rdf:Bag items are found.
dc_relation = property(_getter_bag(DC_NAMESPACE, "relation", _converter_string))
##
# A language-keyed dictionary of textual descriptions of the rights the
# user has to this resource.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
# @return A dictionary mapping xml:lang values to strings; empty when the
# property is absent.
dc_rights = property(_getter_langalt(DC_NAMESPACE, "rights", _converter_string))
##
# Unique identifier of the work from which this resource was derived.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
# @return A string, or None when the property is absent.
dc_source = property(_getter_single(DC_NAMESPACE, "source", _converter_string))
##
# An unordered array of descriptive phrases or keywords that specify the
# topic of the content of the resource.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
# @return A list of strings; empty when no rdf:Bag items are found.
dc_subject = property(_getter_bag(DC_NAMESPACE, "subject", _converter_string))
##
# A language-keyed dictionary of the title of the resource.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
# @return A dictionary mapping xml:lang values to strings; empty when the
# property is absent.
dc_title = property(_getter_langalt(DC_NAMESPACE, "title", _converter_string))
##
# An unordered array of textual descriptions of the document type.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
# @return A list of strings; empty when no rdf:Bag items are found.
dc_type = property(_getter_bag(DC_NAMESPACE, "type", _converter_string))
##
# An unformatted text string representing document keywords.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
# @return A string, or None when the property is absent.
pdf_keywords = property(_getter_single(PDF_NAMESPACE, "Keywords", _converter_string))
##
# The PDF file version, for example 1.0, 1.3.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
# @return A string, or None when the property is absent.
pdf_pdfversion = property(_getter_single(PDF_NAMESPACE, "PDFVersion", _converter_string))
##
# The name of the tool that created the PDF document.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
# @return A string, or None when the property is absent.
pdf_producer = property(_getter_single(PDF_NAMESPACE, "Producer", _converter_string))
##
# The date and time the resource was originally created. The date and
# time are returned as a UTC datetime.datetime object.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
# @return A datetime.datetime instance, or None when the property is absent.
xmp_createDate = property(_getter_single(XMP_NAMESPACE, "CreateDate", _converter_date))
##
# The date and time the resource was last modified. The date and time
# are returned as a UTC datetime.datetime object.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
# @return A datetime.datetime instance, or None when the property is absent.
xmp_modifyDate = property(_getter_single(XMP_NAMESPACE, "ModifyDate", _converter_date))
##
# The date and time that any metadata for this resource was last
# changed. The date and time are returned as a UTC datetime.datetime
# object.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
# @return A datetime.datetime instance, or None when the property is absent.
xmp_metadataDate = property(_getter_single(XMP_NAMESPACE, "MetadataDate", _converter_date))
##
# The name of the first known tool used to create the resource.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
# @return A string, or None when the property is absent.
xmp_creatorTool = property(_getter_single(XMP_NAMESPACE, "CreatorTool", _converter_string))
##
# The common identifier for all versions and renditions of this resource.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
# @return A string, or None when the property is absent.
xmpmm_documentId = property(_getter_single(XMPMM_NAMESPACE, "DocumentID", _converter_string))
##
# An identifier for a specific incarnation of a document, updated each
# time a file is saved.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
# @return A string, or None when the property is absent.
xmpmm_instanceId = property(_getter_single(XMPMM_NAMESPACE, "InstanceID", _converter_string))
def custom_properties(self):
    """Return the dictionary of custom (pdfx namespace) metadata properties.

    Every node returned by ``self.getNodesInNamespace("", PDFX_NAMESPACE)``
    becomes one entry.  The key is the node's local name with each escape
    sequence (U+2182 followed by four hexadecimal digits) replaced by the
    character it encodes; the value is the attribute value for attribute
    nodes and the node text otherwise.  The dictionary is built on first
    access and kept in ``self._custom_properties`` afterwards.
    """
    if not hasattr(self, "_custom_properties"):
        # chr() on Python 2 is limited to code points below 256 and yields
        # byte strings; prefer unichr there and fall back to chr elsewhere.
        try:
            to_char = unichr
        except NameError:
            to_char = chr
        self._custom_properties = {}
        for node in self.getNodesInNamespace("", PDFX_NAMESPACE):
            key = node.localName
            start = 0
            while True:
                # see documentation about PDFX_NAMESPACE earlier in file
                idx = key.find(u"\u2182", start)
                if idx == -1:
                    break
                key = key[:idx] + to_char(int(key[idx+1:idx+5], base=16)) + key[idx+5:]
                # Resume after the decoded character so that a decoded
                # U+2182 is not mistaken for the start of another escape.
                start = idx + 1
            if node.nodeType == node.ATTRIBUTE_NODE:
                value = node.nodeValue
            else:
                value = self._getText(node)
            self._custom_properties[key] = value
    return self._custom_properties
##
# Retrieves custom metadata properties defined in the undocumented pdfx
# metadata schema.
# <p>Stability: Added in v1.12, will exist for all future v1.x releases.
# @return Returns a dictionary of key/value items for custom metadata
# properties.
custom_properties = property(custom_properties)