A higher-level class that stores Document
objects in a list in the .docs
attribute.
Example
import fitz # pymupdf
from pdfstruct.collection import Collection
collection = Collection.from_pdf(filename="my_pdf_file.pdf")
It can either be instantiated with a single Document
object that will be split into several Documents,
or a list of Documents that was already made, or with a "raw" input provided by PyMuPdf.
The metadata obtained via PyMuPdf is also available through the metadata
attribute.
Source code in pdfstruct/collection.py
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
class Collection:
    """A higher-level class that stores `Document` objects in a list in the `.docs` attribute.

    Example:
        ```python
        import fitz # pymupdf
        from pdfstruct.collection import Collection
        collection = Collection.from_pdf(filename="my_pdf_file.pdf")
        ```

    It can be instantiated with a single `Document` object that will be split into several
    Documents, with a list (or tuple) of Documents that was already made, or with a "raw"
    input provided by PyMuPdf.

    The metadata obtained via PyMuPdf is also available through the `metadata` attribute.
    """

    def __init__(self, raw, page=None, n=None, log=None):
        """Builds the collection from a `Document`, a sequence of Documents or a raw input.

        Args:
            raw: Either a `Document` (split through its `.split()` method), a list or tuple
                of `Document` objects (stored as they are), or any other input, which is
                passed to `Document(raw, page=page, n=n)` and then split.
            page: Forwarded to the `Document` constructor when `raw` is a raw input.
            n: Forwarded to the `Document` constructor when `raw` is a raw input.
            log: Not used by this constructor; kept so that existing callers keep working.
        """
        if isinstance(raw, Document):
            # In some cases the Document object already exists: split it with .split().
            # isinstance (rather than an exact type check) also accepts Document subclasses.
            docs = raw.split()
        elif isinstance(raw, (list, tuple)) and (not raw or isinstance(raw[0], Document)):
            # In other cases the Documents are already split. Only the first element is
            # inspected; an empty sequence gives an empty collection instead of failing
            # on raw[0].
            docs = list(raw)
        else:
            # Most of the time a raw input is passed: instantiate a Document from it and
            # split it through the .split() method.
            docs = Document(raw, page=page, n=n).split()
        self.docs = docs
        """The list of `Document` objects obtained from the input."""

        # Collect the metadata of each Document, keeping the first occurrence of each
        # distinct value. Every entry is stored as a tuple of (key, value) pairs.
        unique_metadata = []
        for doc in self.docs:
            meta = doc.metadata
            if meta is None:
                continue
            items = tuple(meta.items())
            if items not in unique_metadata:
                unique_metadata.append(items)
        self.metadata = unique_metadata
        """The list of all distinct PyMuPdf metadata found on the `Document` objects."""

    @classmethod
    def from_json(cls, filename, **kwargs):
        """Creates a Collection object from a PyMuPdf json file. The json file must follow the
        structure explained in the PyMuPdf documentation :
        <https://pymupdf.readthedocs.io/en/latest/textpage.html#structure-of-dictionary-outputs>

        Args:
            filename: Passed to `Document.from_json`.
            **kwargs: Passed unchanged to `Document.from_json`.

        Returns:
            (Collection): a Collection object from a json file.
        """
        return cls(Document.from_json(filename, **kwargs))

    @classmethod
    def from_pdf(cls, filename, **kwargs):
        """Creates a Collection object from a pdf file.

        Args:
            filename: Passed to `Document.from_pdf`.
            **kwargs: Passed unchanged to `Document.from_pdf`.

        Returns:
            (Collection): a Collection object from a pdf file.
        """
        return cls(Document.from_pdf(filename, **kwargs))

    def __len__(self):
        """
        Returns:
            (int): The number of Document objects contained in the Collection.
        """
        return len(self.docs)

    def __str__(self):
        """Displays the basic information about a Collection.

        Returns:
            (str): A summary with the number of Document objects, the first 5 (if existing)
                Documents in the Collection and their respective data (number of pages and
                the 10 first lines).
        """
        # TODO: merge __str__ and display
        # The pieces are gathered in a list and joined once, and no local shadows the
        # builtin `str`.
        parts = [
            f"Number of documents in the Collection: {len(self)}\n",
            "First 5 (if existing) documents in the Collection:\n",
        ]
        for number, doc in enumerate(self.docs[:5], start=1):
            parts.append(f"Document {number}:\n")
            parts.append(f" Number of pages: {len(doc)}\n")
            parts.append(" First 10 lines of first page:\n")
            parts.extend(f" {line.display()}\n" for line in doc.pages[0].lines[:10])
        return "".join(parts)

    def to_html(self, path="./pages", pdf=None):
        """Gives the HTML rendering of each document. Creates a directory structure of one
        folder per `Document` and one html file per page.

        Args:
            path: Base path; document number i (starting at 1) is rendered with
                `path/doc_i` as its own `path` argument.
            pdf: Passed unchanged to each `Document.to_html` call.
        """
        for i, doc in enumerate(self.docs):
            doc.to_html(path=f"{path}/doc_{i+1}", pdf=pdf)

    def display(self, **kwargs):
        """Displays all `Document` objects using their `display()` method.

        Args:
            **kwargs: Passed unchanged to each `Document.display` call.
        """
        for doc in self.docs:
            doc.display(**kwargs)

    def add_documents(self, doc):
        """Adds a `Document` to the `Collection` : must receive a `Document` object as input.

        The `metadata` attribute is not updated by this method.
        """
        self.docs.append(doc)
|
docs = raw.split()
instance-attribute
The input that is split into several documents.
the list of all PyMuPdf metadata for each Document
__len__()
Returns:
Type |
Description |
int
|
The number of Document objects contained in the Collection.
|
Source code in pdfstruct/collection.py
| def __len__(self):
"""
Returns:
(int): The number of Document objects contained in the Collection."""
return len(self.docs)
|
__str__()
Displays the basic information about a Collection.
Returns:
Type |
Description |
str
|
A summary with the number of Document objects, the first 5 (if existing) Documents
|
|
in the Collection and their respective data (number of pages and the 10 first lines).
|
Source code in pdfstruct/collection.py
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103 | def __str__(self):
"""Displays the basic information about a Collection.
Returns:
(str): A summary with the number of Document objects, the first 5 (if existing) Documents
in the Collection and their respective data (number of pages and the 10 first lines).
"""
# TODO:>Display Merge __str__ and display
str = f"Number of documents in the Collection: {len(self)}\n"
str += "First 5 (if existing) documents in the Collection:\n"
for i, doc in enumerate(self.docs[:5]):
str += f"Document {i+1}:\n"
str += f" Number of pages: {len(doc)}\n"
str += " First 10 lines of first page:\n"
for line in doc.pages[0].lines[:10]:
str += f" {line.display()}\n"
return str
|
add_documents(doc)
Adds a Document
to the Collection
: must receive a Document
object as input.
Source code in pdfstruct/collection.py
| def add_documents(self, doc):
"""Adds a ```Document``` to the `Collection` : must receive a ```Document``` object as input."""
self.docs.append(doc)
|
display(**kwargs)
Displays all Document
objects using their display()
method.
Source code in pdfstruct/collection.py
| def display(self, **kwargs):
"""Displays all ```Document``` objects using their ```display()``` method."""
for doc in self.docs:
doc.display(**kwargs)
|
from_json(filename, **kwargs)
classmethod
Creates a Collection object from a PyMuPdf json file. The json file must follow the structure
explained in the PyMuPdf documentation :
https://pymupdf.readthedocs.io/en/latest/textpage.html#structure-of-dictionary-outputs
Returns:
(Collection): a Collection object from a json file.
Source code in pdfstruct/collection.py
60
61
62
63
64
65
66
67
68
69
70
@classmethod
def from_json(cls, filename, **kwargs):
    """Alternate constructor: builds a Collection from a PyMuPdf json file.

    The json file must follow the structure explained in the PyMuPdf documentation :
    <https://pymupdf.readthedocs.io/en/latest/textpage.html#structure-of-dictionary-outputs>

    Args:
        filename: Passed to `Document.from_json`.
        **kwargs: Passed unchanged to `Document.from_json`.

    Returns:
        (Collection): a Collection object from a json file.
    """
    document = Document.from_json(filename, **kwargs)
    return cls(document)
|
from_pdf(filename, **kwargs)
classmethod
Returns:
Type |
Description |
Collection
|
a Collection object from a pdf file.
|
Source code in pdfstruct/collection.py
@classmethod
def from_pdf(cls, filename, **kwargs):
    """Alternate constructor: builds a Collection from a pdf file.

    Args:
        filename: Passed to `Document.from_pdf`.
        **kwargs: Passed unchanged to `Document.from_pdf`.

    Returns:
        (Collection): a Collection object from a pdf file.
    """
    document = Document.from_pdf(filename, **kwargs)
    return cls(document)
|
to_html(path='./pages', pdf=None)
Gives the HTML rendering of each document. Creates a directory structure of one folder
per Document
and one html file per page.
Source code in pdfstruct/collection.py
| def to_html(self, path="./pages", pdf=None):
"""Gives the HTML rendering of each document. Creates a directory structure of one folder
per ```Document`` and one html file per page."""
for i, doc in enumerate(self.docs):
doc.to_html(path=f"{path}/doc_{i+1}", pdf=pdf)
|