Collection

A higher-level class that stores Document objects in a list in the .docs attribute.

Example

import fitz  # pymupdf
from pdfstruct.collection import Collection
collection = Collection.from_pdf(filename="my_pdf_file.pdf")

It can be instantiated with a single Document object that will be split into several Documents, with a list of Documents that was already made, or with a "raw" input provided by PyMuPdf. The metadata obtained via PyMuPdf is also available through the metadata attribute.

Source code in pdfstruct/collection.py

class Collection:
    """A higher-level class that stores `Document` objects in a list in the `.docs` attribute.

    Example:
        ```python
        import fitz  # pymupdf
        from pdfstruct.collection import Collection
        collection = Collection.from_pdf(filename="my_pdf_file.pdf")
        ```

    It can be instantiated with a single `Document` object that will be split into several Documents,
    with a list (or tuple) of Documents that was already made, or with a "raw" input provided by PyMuPdf.
    The metadata obtained via PyMuPdf is also available through the `metadata` attribute.
    """

    def __init__(self, raw, page=None, n=None, log=None):
        """Builds the list of documents and gathers their distinct metadata.

        Args:
            raw: a `Document`, a list/tuple of `Document` objects, or any other input,
                which is handed to the `Document` constructor.
            page: forwarded to the `Document` constructor when `raw` is a raw input.
            n: forwarded to the `Document` constructor when `raw` is a raw input.
            log: accepted for backward compatibility; not used by this constructor.
        """
        if isinstance(raw, Document):
            # the Document object already exists: split it through the .split() method.
            self.docs = raw.split()
            """The input that is split into several documents."""
        elif isinstance(raw, (list, tuple)) and (not raw or isinstance(raw[0], Document)):
            # the Documents are already split; an empty sequence yields an empty Collection
            # instead of failing on raw[0].
            self.docs = list(raw)
        else:
            # most of the time a raw input is passed: instantiate a Document from it and
            # split it through the .split() method.
            self.docs = Document(raw, page=page, n=n).split()

        metadata = []
        # retrieving the distinct metadata over all Documents, in order of first appearance
        for doc in self.docs:
            meta = doc.metadata
            if meta is None:
                continue
            entry = tuple(meta.items())
            if entry not in metadata:
                metadata.append(entry)
        self.metadata = metadata
        """the list of all PyMuPdf metadata for each `Document`"""

    @classmethod
    def from_json(cls, filename, **kwargs):
        """Creates a Collection object from a PyMuPdf json file. The json file must follow the structure
        explained in the PyMuPdf documentation :

        <https://pymupdf.readthedocs.io/en/latest/textpage.html#structure-of-dictionary-outputs>

        Args:
            filename: passed to `Document.from_json`.
            **kwargs: passed through to `Document.from_json`.

        Returns:
            (Collection): a Collection object from a json file.
        """
        return cls(Document.from_json(filename, **kwargs))

    @classmethod
    def from_pdf(cls, filename, **kwargs):
        """Creates a Collection object from a pdf file.

        Args:
            filename: passed to `Document.from_pdf`.
            **kwargs: passed through to `Document.from_pdf`.

        Returns:
            (Collection): a Collection object from a pdf file.
        """
        return cls(Document.from_pdf(filename, **kwargs))

    def __len__(self):
        """
        Returns:
            (int): The number of Document objects contained in the Collection."""
        return len(self.docs)

    def __str__(self):
        """Displays the basic information about a Collection.

        Returns:
            (str): A summary with the number of Document objects, the first 5 (if existing) Documents
            in the Collection and their respective data (number of pages and the 10 first lines).
        """
        # TODO: merge __str__ and display
        parts = [
            f"Number of documents in the Collection: {len(self)}\n",
            "First 5 (if existing) documents in the Collection:\n",
        ]
        for i, doc in enumerate(self.docs[:5], start=1):
            parts.append(f"Document {i}:\n")
            parts.append(f"  Number of pages: {len(doc)}\n")
            parts.append("  First 10 lines of first page:\n")
            try:
                first_page = doc.pages[0]
            except IndexError:
                # a Document without pages has no first page to preview
                continue
            parts.extend(f"    {line.display()}\n" for line in first_page.lines[:10])
        return "".join(parts)

    def to_html(self, path="./pages", pdf=None):
        """Gives the HTML rendering of each document. Creates a directory structure of one folder
        per `Document` and one html file per page.

        Args:
            path: base directory; document number k is rendered under `{path}/doc_{k}` (1-based).
            pdf: passed unchanged to each `Document.to_html` call.
        """
        for i, doc in enumerate(self.docs, start=1):
            doc.to_html(path=f"{path}/doc_{i}", pdf=pdf)

    def display(self, **kwargs):
        """Displays all `Document` objects using their `display()` method.

        Args:
            **kwargs: passed through to each `Document.display` call.
        """
        for doc in self.docs:
            doc.display(**kwargs)

    def add_documents(self, doc):
        """Adds a `Document` to the `Collection` : must receive a `Document` object as input.

        Note that `metadata` is not refreshed by this method.
        """
        self.docs.append(doc)

`docs = raw.split()` `instance-attribute` ⚓︎

The input that is split into several documents.

`metadata = metadata` `instance-attribute` ⚓︎

the list of all PyMuPdf metadata for each Document

`len()` ⚓︎

Returns:

Type	Description
`int`	The number of Document objects contained in the Collection.

Source code in pdfstruct/collection.py

def __len__(self):
    """Counts the documents held by the Collection.

    Returns:
        (int): The number of Document objects contained in the Collection.
    """
    documents = self.docs
    return len(documents)

`str()` ⚓︎

Displays the basic information about a Collection.

Returns:

Type	Description
`str`	A summary with the number of Document objects, the first 5 (if existing) Documents
	in the Collection and their respective data (number of pages and the 10 first lines).

Source code in pdfstruct/collection.py

def __str__(self):
    """Displays the basic information about a Collection.

    Returns:
        (str): A summary with the number of Document objects, the first 5 (if existing) Documents
        in the Collection and their respective data (number of pages and the 10 first lines).
    """
    # TODO: merge __str__ and display
    parts = [
        f"Number of documents in the Collection: {len(self)}\n",
        "First 5 (if existing) documents in the Collection:\n",
    ]
    for i, doc in enumerate(self.docs[:5], start=1):
        parts.append(f"Document {i}:\n")
        parts.append(f"  Number of pages: {len(doc)}\n")
        parts.append("  First 10 lines of first page:\n")
        try:
            first_page = doc.pages[0]
        except IndexError:
            # a Document without pages has no first page to preview
            continue
        parts.extend(f"    {line.display()}\n" for line in first_page.lines[:10])
    return "".join(parts)

`add_documents(doc)` ⚓︎

Adds a Document to the Collection : must receive a Document object as input.

Source code in pdfstruct/collection.py

def add_documents(self, doc):
    """Appends one `Document` to the end of the `Collection`'s `docs` list.

    The argument is expected to be a `Document` object; it is stored as given.
    """
    documents = self.docs
    documents.append(doc)

`display(**kwargs)` ⚓︎

Displays all Document objects using their display() method.

Source code in pdfstruct/collection.py

def display(self, **kwargs):
    """Calls `display()` on every `Document` of the Collection, in order.

    Any keyword arguments are handed unchanged to each `display()` call.
    """
    documents = self.docs
    for document in documents:
        document.display(**kwargs)

`from_json(filename, **kwargs)` `classmethod` ⚓︎

Creates a Collection object from a PyMuPdf json file. The json file must follow the structure explained in the PyMuPdf documentation :

https://pymupdf.readthedocs.io/en/latest/textpage.html#structure-of-dictionary-outputs

Returns: (Collection): a Collection object from a json file.

Source code in pdfstruct/collection.py

@classmethod
def from_json(cls, filename, **kwargs):
    """Builds a Collection from a PyMuPdf json file.

    The json file must follow the structure explained in the PyMuPdf documentation:

    <https://pymupdf.readthedocs.io/en/latest/textpage.html#structure-of-dictionary-outputs>

    The file name and any keyword arguments are handed to `Document.from_json`;
    the resulting `Document` is then given to the class constructor.

    Returns:
        (Collection): a Collection object from a json file.
    """
    document = Document.from_json(filename, **kwargs)
    return cls(document)

`from_pdf(filename, **kwargs)` `classmethod` ⚓︎

Returns:

Type	Description
`Collection`	a Collection object from a pdf file.

Source code in pdfstruct/collection.py

@classmethod
def from_pdf(cls, filename, **kwargs):
    """Builds a Collection from a pdf file.

    The file name and any keyword arguments are handed to `Document.from_pdf`;
    the resulting `Document` is then given to the class constructor.

    Returns:
        (Collection): a Collection object from a pdf file.
    """
    document = Document.from_pdf(filename, **kwargs)
    return cls(document)

`to_html(path='./pages', pdf=None)` ⚓︎

Gives the HTML rendering of each document. Creates a directory structure of one folder per `Document` and one html file per page.

Source code in pdfstruct/collection.py

def to_html(self, path="./pages", pdf=None):
    """Renders every document of the Collection to HTML.

    Each `Document` is asked to render itself under its own folder,
    `{path}/doc_{k}` with k counted from 1; `pdf` is passed along unchanged.
    """
    for number, document in enumerate(self.docs, start=1):
        target = f"{path}/doc_{number}"
        document.to_html(path=target, pdf=pdf)

Collection

docs = raw.split() instance-attribute ⚓︎

metadata = metadata instance-attribute ⚓︎

__len__() ⚓︎

__str__() ⚓︎

add_documents(doc) ⚓︎

display(**kwargs) ⚓︎

from_json(filename, **kwargs) classmethod ⚓︎

from_pdf(filename, **kwargs) classmethod ⚓︎

to_html(path='./pages', pdf=None) ⚓︎

`docs = raw.split()` `instance-attribute` ⚓︎

`metadata = metadata` `instance-attribute` ⚓︎

`len()` ⚓︎

`str()` ⚓︎

`add_documents(doc)` ⚓︎

`display(**kwargs)` ⚓︎

`from_json(filename, **kwargs)` `classmethod` ⚓︎

`from_pdf(filename, **kwargs)` `classmethod` ⚓︎

`to_html(path='./pages', pdf=None)` ⚓︎