Skip to content

Markers

For a description of the markers, see the explanation of the Pdfstruct data structure.

Kind

The base class used to define a Line.

Source code in pdfstruct/marker.py
 8
 9
10
11
12
13
14
15
16
17
18
19
class Kind:
    """The Base Class to define a `Line`.

    A kind supplies the terminal color used when a line is displayed.
    Subclasses override `color` to change how their lines are rendered.
    """

    def __init__(self):
        pass

    @property
    def color(self):
        """Return the SGR color code that `display` puts in the escape sequence.

        The value must be numeric: a color *name* such as "black" does not form
        a valid escape sequence and shows up as stray characters in the
        terminal. 30 is the SGR code for a black foreground.
        """
        return 30

    def display(self, content):
        """Return `content` wrapped in this kind's color and a trailing reset.

        Args:
            content: the text to colorize.

        Returns:
            (str) : `content` preceded by the color escape sequence and
            followed by the reset sequence.
        """
        return f"\033[{self.color}m{content}\033[0m"

Section

Bases: Kind

The class used to label the Line as a title of a section.

Source code in pdfstruct/marker.py
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
class Section(Kind):
    """The class used to label the `Line` as a title of a section.

    An instance is built from a match object `m` which must provide a
    ``title`` group. When the pattern also defines a ``numbering`` group the
    title is treated as *regular* and the ``intro``, ``sep``, ``post`` and
    ``num`` groups are read as well; otherwise the numbering-related
    attributes are left empty.
    """

    # Names of the groups that describe how a numbering item is written.
    # Kept as a tuple (not a set) so the lookup order, and therefore the kind
    # recorded first for a given item, is identical on every run.
    _NUM_KINDS = ("digit", "letter", "LETTER", "roman", "ROMAN")

    def __init__(self, m):
        """Extract title, numbering and numbering model from the match `m`.

        Args:
            m: match object exposing `group`, `span`, `captures` and `spans`
                (presumably a match from the third-party `regex` module --
                confirm against the caller).
        """
        self.title = m.group("title")
        try:
            self.numbering = m.group("numbering")
        except IndexError:
            # No "numbering" group: this is an unnumbered (irregular) title.
            self.numbering = ""
            regular = False
        else:
            regular = True

        if regular:
            self.intro = m.group("intro")
            self.sep = m.group("sep")
            self.post = m.captures("post")
            self.nums = m.captures("num")
            self.numbering_span = m.span("numbering")
        else:
            self.intro = ""
            self.sep = ""
            self.post = []
            self.nums = []
            # Sentinel meaning "there is no numbering in the line".
            self.numbering_span = (-1, -1)
        self.title_span = m.span("title")

        # Maps the start offset of every numbering item to
        # [start, end, text, kind, ...], in the order the items were captured.
        start2info = {}
        if regular:
            for i, (s, e) in enumerate(m.spans("num")):
                start2info[s] = [s, e, self.nums[i]]
            for kind_name in self._NUM_KINDS:
                try:
                    kind_spans = m.spans(kind_name)
                except IndexError:
                    # Same exception the "numbering" lookup above relies on:
                    # presumably raised when the group is not defined.
                    continue
                for s, _ in kind_spans:
                    if s in start2info:
                        start2info[s].append(kind_name)

        # Pad `self.post` in place so that every numbering item has a
        # (possibly empty) trailing separator.
        post = self.post
        missing = len(self.nums) - len(post)
        if missing > 0:
            post.extend([""] * missing)

        # NOTE(review): x[3] assumes every item matched at least one of the
        # kind groups; an item without a kind raises IndexError here.
        self.components = NumSeq([(x[3], x[2]) for x in start2info.values()], model=(self.intro, post))

        # Rebuild the numbering with every item replaced by "<kind>", keeping
        # the literal text found between and around the items.
        model = ""
        if self.intro:
            model += self.intro + " "
        s0, _ = self.numbering_span
        numbering = self.numbering
        prev = s0
        for x in start2info.values():
            model += numbering[prev - s0 : x[0] - s0] + f"<{x[3]}>"
            prev = x[1]
        model += numbering[prev - s0 :]
        self.model = model

    @property
    def color(self):
        """Return the SGR color code for section titles (91, bright red)."""
        return 91

    @property
    def index(self):
        """Return a normalized version of the Section: normalized numbering and normalized text.

        This is to check if these elements also appear in the normalized table
        of contents of a document (`doc._tdm`).

        Returns:
            (tuple) : `self.components.normalized`, and the title lower-cased,
            passed through `unidecode` and stripped of every non-letter
            character.
        """
        return (
            self.components.normalized,
            re.sub(r"[^\p{L}]", "", unidecode(self.title.lower())),
        )

    def display(self, content):
        """Return `content` colorized, with the numbering part underlined.

        Args:
            content: the text of the line; `numbering_span` is used as offsets
                into it.

        Returns:
            (str) : the colorized text. Unnumbered titles are colorized as a
            whole, because their sentinel span must not be used for slicing
            (negative offsets would split off the last character).
        """
        start, end = self.numbering_span
        if start < 0:
            return Kind.display(self, content)
        parts = [content[0:start], "\033[4m" + content[start:end] + "\033[0m", content[end:]]
        return "".join(Kind.display(self, part) for part in parts)

    @property
    def level(self):
        """Return the depth of the title: the number of numbering items."""
        return len(self.nums)

    @property
    def tag(self):
        """Return the tag name derived from the level, e.g. "h2" for level 2."""
        return f"h{self.level}"

index property ⚓︎

Returns a normalized version of the Section: the normalized numbering and the normalized text. This is used to check whether these elements also appear in the normalized table of contents of a document (doc._tdm).

Returns:

Type Description

(tuple(str)) : the normalized components and body of text.

SecIndex

Bases: Section

Source code in pdfstruct/marker.py
105
106
107
108
109
110
111
112
113
class SecIndex(Section):
    """A `Section` that also stores the page number captured by the match."""

    def __init__(self, m):
        """Initialize the section from `m`, then read its ``page`` group as an int."""
        super().__init__(m)
        page_text = m.group("page")
        self.page = int(page_text)

    @property
    def color(self):
        """Return the SGR color code for this kind (92, bright green)."""
        return 92

Other

Bases: Section

Source code in pdfstruct/marker.py
149
150
151
152
153
154
155
156
class Other(Section):
    """A `Section` variant rendered with its own color and tagged "other"."""

    @property
    def tag(self):
        """Return the fixed tag name "other" instead of a level-based one."""
        return "other"

    @property
    def color(self):
        """Return the SGR color code for this kind (36, cyan)."""
        return 36

OtherIndex

Bases: SecIndex

Source code in pdfstruct/marker.py
159
160
161
162
class OtherIndex(SecIndex):
    """A `SecIndex` variant that only differs by its display color."""

    @property
    def color(self):
        """Return the SGR color code for this kind (96, bright cyan)."""
        return 96

Header

Bases: Kind

Source code in pdfstruct/marker.py
129
130
131
132
133
134
135
136
class Header(Kind):
    """The kind used for lines tagged "header"."""

    @property
    def tag(self):
        """Return the fixed tag name "header"."""
        return "header"

    @property
    def color(self):
        """Return the SGR color code for this kind (94, bright blue)."""
        return 94

Footer

Bases: Kind

Source code in pdfstruct/marker.py
139
140
141
142
143
144
145
146
class Footer(Kind):
    """The kind used for lines tagged "footer"."""

    @property
    def tag(self):
        """Return the fixed tag name "footer"."""
        return "footer"

    @property
    def color(self):
        """Return the SGR color code for this kind (95, bright magenta)."""
        return 95

Cell

Bases: Kind

Source code in pdfstruct/marker.py
116
117
118
119
120
121
122
123
124
125
126
class Cell(Kind):
    """The kind used for lines tagged "td"; it keeps the `cols` value it is given."""

    def __init__(self, cols=None):
        """Store `cols` unchanged on the instance (None when not provided)."""
        self.cols = cols

    @property
    def tag(self):
        """Return the fixed tag name "td"."""
        return "td"

    @property
    def color(self):
        """Return the SGR color code for this kind (93, bright yellow)."""
        return 93