Skip to content

Collection

A higher-level class that stores Document objects in a list in the .docs attribute.

Example
import fitz  # pymupdf
from pdfstruct.collection import Collection
collection = Collection.from_pdf(filename="my_pdf_file.pdf")

It can either be instantiated with a single Document object that will be split into several Documents, or a list of Documents that was already made, or with a "raw" input provided by PyMuPdf. The metadata obtained via PyMuPdf is also available through the metadata attribute.

Source code in pdfstruct/collection.py
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
class Collection:
    """A higher-level class that stores `Document` objects in a list in the `.docs` attribute.

    Example:
        ```python
        import fitz  # pymupdf
        from pdfstruct.collection import Collection
        collection = Collection.from_pdf(filename="my_pdf_file.pdf")
        ```

    It can be instantiated with a single `Document` object that will be split into several
    Documents, with a list (or tuple) of Documents that was already made, or with a "raw"
    input provided by PyMuPdf.
    The metadata obtained via PyMuPdf is also available through the `metadata` attribute.
    """

    def __init__(self, raw, page=None, n=None, log=None):
        """Build the collection from a `Document`, a sequence of Documents, or a raw input.

        Args:
            raw: a `Document` (split through its `.split()` method), a list/tuple whose
                first element is a `Document` (stored as-is), an empty list/tuple
                (gives an empty collection), or any other input, which is handed to
                `Document(raw, page=page, n=n)` and then split.
            page: forwarded to `Document` only when `raw` is a raw input.
            n: forwarded to `Document` only when `raw` is a raw input.
            log: currently unused by this constructor.
        """
        if isinstance(raw, (list, tuple)) and not raw:
            # an empty sequence has no first element to inspect: nothing to split or wrap
            docs = []
        elif isinstance(raw, Document):
            # in some cases, the Document object already exists: split it through .split()
            docs = raw.split()
        elif isinstance(raw, (list, tuple)) and isinstance(raw[0], Document):
            # in other cases, the Documents are already split; copy so the caller's
            # sequence is not mutated by add_documents()
            docs = list(raw)
        else:
            # most of the time a raw input is passed: instantiate a Document from it
            # and split it through the .split() method
            docs = Document(raw, page=page, n=n).split()
        self.docs = docs
        """The list of `Document` objects held by the collection."""

        metadata = []
        # collect the metadata of each Document, keeping a single copy of identical entries
        for doc in docs:
            meta = doc.metadata
            if meta is None:
                continue
            items = tuple(meta.items())
            if items not in metadata:
                metadata.append(items)
        self.metadata = metadata
        """The distinct PyMuPdf metadata of the documents, each stored as a tuple of `(key, value)` pairs."""

    @classmethod
    def from_json(cls, filename, **kwargs):
        """Creates a Collection object from a PyMuPdf json file. The json file must follow the structure
        explained in the PyMuPdf documentation:

        <https://pymupdf.readthedocs.io/en/latest/textpage.html#structure-of-dictionary-outputs>

        Args:
            filename: passed to `Document.from_json`.
            **kwargs: forwarded to `Document.from_json`.

        Returns:
            (Collection): a Collection object from a json file.
        """
        return cls(Document.from_json(filename, **kwargs))

    @classmethod
    def from_pdf(cls, filename, **kwargs):
        """Creates a Collection object from a pdf file.

        Args:
            filename: passed to `Document.from_pdf`.
            **kwargs: forwarded to `Document.from_pdf`.

        Returns:
            (Collection): a Collection object from a pdf file.
        """
        return cls(Document.from_pdf(filename, **kwargs))

    def __len__(self):
        """
        Returns:
            (int): The number of Document objects contained in the Collection."""
        return len(self.docs)

    def __str__(self):
        """Displays the basic information about a Collection.

        Returns:
            (str): A summary with the number of Document objects, the first 5 (if existing) Documents
            in the Collection and their respective data (number of pages and the first 10 lines
            of the first page, when the Document has at least one page).
        """
        # TODO:>Display Merge __str__ and display
        parts = [
            f"Number of documents in the Collection: {len(self)}\n",
            "First 5 (if existing) documents in the Collection:\n",
        ]
        for number, doc in enumerate(self.docs[:5], start=1):
            parts.append(f"Document {number}:\n")
            parts.append(f"  Number of pages: {len(doc)}\n")
            parts.append("  First 10 lines of first page:\n")
            if not doc.pages:
                # a Document without pages has no first page to preview
                continue
            parts.extend(f"    {line.display()}\n" for line in doc.pages[0].lines[:10])
        return "".join(parts)

    def to_html(self, path="./pages", pdf=None):
        """Gives the HTML rendering of each document. Creates a directory structure of one folder
        per `Document` (`<path>/doc_1`, `<path>/doc_2`, ...) and one html file per page.

        Args:
            path: base directory under which the per-document folders are created.
            pdf: forwarded unchanged to each `Document.to_html` call.
        """
        for number, doc in enumerate(self.docs, start=1):
            doc.to_html(path=f"{path}/doc_{number}", pdf=pdf)

    def display(self, **kwargs):
        """Displays all `Document` objects using their `display()` method.

        Args:
            **kwargs: forwarded unchanged to each `Document.display` call.
        """
        for doc in self.docs:
            doc.display(**kwargs)

    def add_documents(self, doc):
        """Adds a `Document` to the `Collection`: must receive a single `Document` object as input.

        The `metadata` attribute is not refreshed by this method.
        """
        self.docs.append(doc)

docs = raw.split() instance-attribute ⚓︎

The input that is split into several documents.

metadata = metadata instance-attribute ⚓︎

the list of all PyMuPdf metadata for each Document

__len__() ⚓︎

Returns:

Type Description
int

The number of Document objects contained in the Collection.

Source code in pdfstruct/collection.py
81
82
83
84
85
def __len__(self):
    """Report the size of the Collection.

    Returns:
        (int): The number of Document objects stored in the `docs` attribute.
    """
    documents = self.docs
    return len(documents)

__str__() ⚓︎

Displays the basic information about a Collection.

Returns:

Type Description
str

A summary with the number of Document objects, the first 5 (if existing) Documents

in the Collection and their respective data (number of pages and the 10 first lines).

Source code in pdfstruct/collection.py
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
def __str__(self):
    """Displays the basic information about a Collection.

    Returns:
        (str): A summary with the number of Document objects, the first 5 (if existing) Documents
        in the Collection and their respective data (number of pages and the first 10 lines
        of the first page, when the Document has at least one page).
    """
    # TODO:>Display Merge __str__ and display
    parts = [
        f"Number of documents in the Collection: {len(self)}\n",
        "First 5 (if existing) documents in the Collection:\n",
    ]
    for number, doc in enumerate(self.docs[:5], start=1):
        parts.append(f"Document {number}:\n")
        parts.append(f"  Number of pages: {len(doc)}\n")
        parts.append("  First 10 lines of first page:\n")
        if not doc.pages:
            # a Document without pages has no first page to preview
            continue
        parts.extend(f"    {line.display()}\n" for line in doc.pages[0].lines[:10])
    return "".join(parts)

add_documents(doc) ⚓︎

Adds a Document to the Collection : must receive a Document object as input.

Source code in pdfstruct/collection.py
116
117
118
def add_documents(self, doc):
    """Append one `Document` to the end of the `Collection`.

    Args:
        doc: the `Document` object to store; it is appended to `self.docs` as-is.
    """
    documents = self.docs
    documents.append(doc)

display(**kwargs) ⚓︎

Displays all Document objects using their display() method.

Source code in pdfstruct/collection.py
111
112
113
114
def display(self, **kwargs):
    """Call `display()` on every `Document` of the Collection, in order.

    Args:
        **kwargs: passed through unchanged to each `Document.display` call.
    """
    documents = self.docs
    for document in documents:
        document.display(**kwargs)

from_json(filename, **kwargs) classmethod ⚓︎

Creates a Collection object from a PyMuPdf json file. The json file must follow the structure explained in the PyMuPdf documentation :

https://pymupdf.readthedocs.io/en/latest/textpage.html#structure-of-dictionary-outputs

Returns: (Collection): a Collection object from a json file.

Source code in pdfstruct/collection.py
60
61
62
63
64
65
66
67
68
69
70
71
@classmethod
def from_json(cls, filename, **kwargs):
    """Build a Collection from a PyMuPdf json file.

    The json file must follow the structure explained in the PyMuPdf documentation:

    <https://pymupdf.readthedocs.io/en/latest/textpage.html#structure-of-dictionary-outputs>

    Args:
        filename: passed to `Document.from_json`.
        **kwargs: forwarded to `Document.from_json`.

    Returns:
        (Collection): a Collection object from a json file.
    """
    document = Document.from_json(filename, **kwargs)
    return cls(document)

from_pdf(filename, **kwargs) classmethod ⚓︎

Returns:

Type Description
Collection

a Collection object from a pdf file.

Source code in pdfstruct/collection.py
73
74
75
76
77
78
79
@classmethod
def from_pdf(cls, filename, **kwargs):
    """Build a Collection from a pdf file.

    Args:
        filename: passed to `Document.from_pdf`.
        **kwargs: forwarded to `Document.from_pdf`.

    Returns:
        (Collection): a Collection object from a pdf file.
    """
    document = Document.from_pdf(filename, **kwargs)
    return cls(document)

to_html(path='./pages', pdf=None) ⚓︎

Gives the HTML rendering of each document. Creates a directory structure of one folder per Document and one HTML file per page.

Source code in pdfstruct/collection.py
105
106
107
108
109
def to_html(self, path="./pages", pdf=None):
    """Render every `Document` of the Collection to HTML.

    Each document is written through its own `to_html` method into a dedicated
    folder named `<path>/doc_1`, `<path>/doc_2`, ... following the order of `self.docs`.

    Args:
        path: base directory under which the per-document folders are placed.
        pdf: passed through unchanged to each `Document.to_html` call.
    """
    for number, document in enumerate(self.docs, start=1):
        target = f"{path}/doc_{number}"
        document.to_html(path=target, pdf=pdf)