Line

This class is used to handle lines on a Page. It stores basic physical information (coordinates, text, links, styles, spacings). It also allows some manipulation on lines (see try_merge() and join_next()).

On instantiation, the regex patterns are matched against the textual content of the line (see is_potential_section() and is_potential_other_index()).

Source code in pdfstruct/line.py

class Line:
    """
    This class is used to handle lines on a `Page`.
    It stores basic physical information (coordinates, text, links, styles, spacings).
    It also allows some manipulation on lines (see `try_merge()` and `join_next()`).

    On instantiation, the regex patterns are matched with the textual content of the
    line (see `is_potential_section()` and `is_potential_other_index()`).
    """

    def __init__(self, content, raw, page=None, doc=None):
        # content = re.sub(r'[●••·◘○◙‣⁃⁌⁍■□∙]','*',content)
        self.content = content
        """The textual content of the line."""
        self.raw = raw
        """The PyMuPDF raw dictionary value for the line."""
        self.raws = [self.raw]
        self.where = None
        self.kind = None
        self.prev: Optional[Line] = None
        self.next: Optional[Line] = None
        self.page = page
        self.id = uuid.uuid4()
        """The unique identifier for the line."""

        self.links = []
        """The links provided by PyMuPDF, if any."""

        self.update(doc=doc)

    def update(self, doc=None):
        """Checks if the line is a potential Section and a potential OtherIndex."""
        self.is_potential_section(doc=doc)
        self.is_potential_other_index()

        # self.is_potential_paragraph_title(doc=doc)

    @property
    def most_common_styles(self):
        """
        Computes the style that is used on most portions of the text
        (grouping the spans by style value and summing their text lengths).

        Returns:
            (dict): a dictionary containing the most common styles (font, color, size and flags.)
                A style maps to None when the line has no span.

        """

        def get_common(style, linedict):
            # number of characters covered by each distinct value of `style`
            lengths = {}
            for span in linedict["spans"]:
                lengths[span[style]] = lengths.get(span[style], 0) + len(span["text"])
            # pick the value covering the most characters (not the largest value itself)
            return max(lengths, key=lengths.get, default=None)

        styles = ["font", "size", "flags", "color"]
        return {style: get_common(style, self.raw) for style in styles}

    def same_style(self, other):
        """Compares the styles of two lines

        Returns:
            (bool): if True, the two lines have the same most common styles"""

        style = self.most_common_styles
        other_style = other.most_common_styles
        return style == other_style

    def __str__(self):
        """Displays the text according to a color scheme and the localization of the line (column).
        The colors depend on the type of marker. (ex: Section is red)

        Returns:
            (str): the text content of the line, prefixed with `[where]`
        """
        # TODO:>Display Merge __str__ and display
        content = self.content
        if self.kind:
            content = self.kind.display(content)
        return f"[{self.where}] {content}"

    def display(self):
        """Displays the text according to a color scheme. The colors depend on the type of marker. (ex: Section is red)

        Returns:
            (str): the text content of the line
        """
        # TODO:>Display: Make consistant all the display methods (sometimes return, sometimes print)
        content = self.content
        if self.kind:
            content = self.kind.display(content)
        return content

    def display_html(self):
        """Displays the text in HTML format.

        Returns:
            (str): the HTML tagged text of the line.
        """
        content = self.content
        if self.kind:
            # content = colored( content,self.kind.color)
            tag = self.kind.tag
            color = HTML_COLORS[self.kind.color]
            styles = [("style", color)]
        else:
            tag = "p"
            styles = [("style", "color:black;")]
        tagged = make_html(content, tag, attval=styles)
        return tagged

    @property
    def bbox(self):
        """
        The PYMUPDF bbox.
        """
        return self.raw["bbox"]

    @property
    def origin(self):
        """The PYMUPDF origin (taken from the first span of the line)."""
        return self.raw["spans"][0]["origin"]

    @property
    def base_y(self):
        """The PYMUPDF y coordinate of the origin."""
        return self.origin[1]

    @property
    def x0(self):
        """The PYMUPDF x0"""
        return self.bbox[0]

    @property
    def y0(self):
        """The PYMUPDF y0"""
        return self.bbox[1]

    @property
    def x1(self):
        """The PYMUPDF x1."""
        return self.bbox[2]

    @property
    def y1(self):
        """The PYMUPDF y1."""
        return self.bbox[3]

    @property
    def width(self):
        """The difference between x1 and x0."""
        return self.x1 - self.x0

    @property
    def height(self):
        """The difference between y1 and y0."""
        return self.y1 - self.y0

    @property
    def orientation(self):
        """The PYMUPDF direction/orientation of the line."""
        return self.raw["dir"]

    # these properties return info about the space between a line and its surrounding lines
    # they're used for list and paragraph detection
    # Eric: they should only apply if the two lines being used to compute a vertical spacing are on the same page
    # and otherwise return a default value or raise an exception
    @property
    def prev_spacing(self):
        """Computes the distance between a line and the previous line.

        Returns:
            (int): the distance or spacing between a line and the previous line,
                or 0 when there is no previous line on the same page
        """
        if self.prev and self.page == self.prev.page:
            return abs(round(self.y0) - round(self.prev.y1))
        return 0

    @property
    def next_spacing(self):
        """Computes the distance between a line and the next line.

        Returns:
            (int): the distance or spacing between a line and the next line,
                or 0 when there is no next line on the same page
        """
        if self.next and self.page == self.next.page:
            return abs(round(self.next.y0) - round(self.y1))
        return 0

    def potential_merge(self, other, doc=None, col_sep=None):
        """Checks whether two lines could be merged into one.

        Args:
            other: the candidate line to append to this one.
            doc: forwarded to `is_potential_section_aux()` when testing the joined content.
            col_sep: an x coordinate (or a list of them) acting as column separators;
                two lines lying on opposite sides of a separator are never merged.

        Returns:
            (bool): if True, the two lines can be merged safely.
        """
        # logger.debug(f"test merge {self} with {other} : {self.base_y=}  {other.base_y=} {self.x1=} {other.x0=}")
        list_bug = PATTERNS["list_bug"]
        if col_sep is not None:
            if not isinstance(col_sep, list):
                col_sep = [col_sep]
            if any(((self.x1 < sep and other.x0 > sep) or (self.x0 > sep and other.x1 < sep)) for sep in col_sep):
                return False
        if len(self.content) + len(other.content) > 200:
            logger.warning(
                f"Aborting potential merge because of very long content {len(self.content) + len(other.content)}"
            )
            return False
        if type(self.kind) in {SecIndex, OtherIndex}:
            return False
        # same baseline, `other` starting right after `self`
        if (
            abs(self.base_y - other.base_y) < 4
            and self.x1 < other.x0 + 2  # tolerate a low overlapping
            and (
                other.x0 < self.x1 + 10
                or (other.x0 < self.x1 + 30 and list_bug.match(self.content))
                or self.is_potential_section_aux(content=self.content + " " + other.content, doc=doc)
            )
            and self.height < 30
        ):
            return True
        # logger.debug('HERE0')
        # multi-line section: `other` ends with a dotted leader followed by a page number
        if (
            type(self.kind) is Section
            and (type(other.kind) is not SecIndex or not self.acceptable_numbering(other.kind.numbering))
            and re.search(r"[._\p{Pd}]{5,}\s*\d+$", other.content)
        ):
            # if type(self.kind) is not SecIndex  and type(other.kind) is not SecIndex and
            # re.search(r'[._\p{Pd}]{5,}\s*\d+$',other.content) and
            # self.is_potential_section_aux(content=self.content + ' ' + other.content,doc=doc):
            # logger.debug(f"MERGE other {other} kind={other.kind}")
            return True
        # multi-line entry of another index
        if (
            not self.kind
            and type(other.kind) not in {Section, SecIndex, OtherIndex}
            and re.search(r"[._\p{Pd}]{5,}\s*\d+$", other.content)
            and self.is_potential_other_aux(self.content + " " + other.content)
        ):
            logger.debug(f"potential merge {self.kind=} {other.kind=}")
            return True
        # A heuristic merging list symbols identified as single lines (split from the
        # following text, i.e. `list_bug.match(self.content)`) used to live here but was disabled.
        return False

    def _joining_spaces(self, other):
        """Builds the run of spaces inserted between this line's content and `other`'s.

        The count is the integer part of (total characters / total width), with a minimum of one.
        A single space is used when the combined width is not positive, which avoids a division by zero.
        """
        total_width = self.width + other.width
        if total_width <= 0:
            return " "
        return " " * max(1, int((len(self.content) + len(other.content)) / total_width))

    def try_merge(self, other, doc=None, col_sep=None):
        """Merges two lines into one and updates the new `Line` object.

        Unlike `join_next()`, the merge only happens when `potential_merge()` accepts it,
        and neither the links nor the prev/next pointers are touched.

        Returns:
            (bool): if True, the merge has been done.
        """
        # TODO: Ask Eric the difference with join_next()
        if not self.potential_merge(other, doc=doc, col_sep=col_sep):
            return False
        logger.debug(
            f"*** Page {self.page} merging {self.bbox}:<{self.content}> with {other.bbox}:<{other.content}>"
        )
        self.content += self._joining_spaces(other) + other.content
        self.raws.append(other.raw)
        # self.bbox[2] = other.x1
        # self.bbox[1] = min(self.y0, other.y0)
        # self.bbox[3] = max(self.y1, other.y1)
        self.raw["bbox"] = (self.x0, min(self.y0, other.y0), other.x1, max(self.y1, other.y1))
        self.update()
        return True

    def acceptable_numbering(self, numbering):
        """Compares a numbering with a set of regular expressions that are deemed to be acceptable
        numerations (for more details, see the page about Patterns).

        Returns:
            (bool): if True, the numbering matches one of the acceptable numeration patterns.
        """
        # TODO: ask Eric if these patterns could be stored somewhere else - like in an object or directly in PATTERNS.

        # logger.debug(f"Page {self.page} test numbering {numbering}")
        accepted_patterns = (
            # complex numbering without internal spaces
            r"^(\w+[.\p{Pd}/])+\w+[.\p{Pd}/]?$",
            # complex numbering possibly with internal spaces but with a final post element (such as . or -)
            r"^(\w+[.\p{Pd}/]\s+)+\w+[.\p{Pd}/]$",
            # complex numbering with no internal .
            # to avoid some confusions with float numbers
            r"^(\w+\s*[\p{Pd}]\s*)+\w+[.\p{Pd}/]?$",
            # simple numeric numbering with a final post element
            # include a confusion case with numbers followed by . (eg. 123.)
            r"^(\d{1,3}|[ivxl]{1,4}|[IVXL]{1,4})[.\p{Pd}/]$",
            # numbering terminated by ) (such as 'b)')
            r"^(\w+\s*[.\p{Pd}]\s*)*(?:[a-zA-Z]|\d+)\.?\s*\)$",
        )
        return any(re.match(pattern, numbering) for pattern in accepted_patterns)

    def is_potential_section_aux(self, content=None, doc=None):
        """Filters the regexp with conditions before returning it to is_potential_section(), otherwise returns None.

        Args:
            content: the text to test; defaults to the content of the line.
            doc: when given, the match is only kept if `doc.is_section_in_index()` accepts it.

        Returns:
            (regex.Match): a regex match object, or None.
        """

        def acceptable_numbering_with_intro(numbering):
            # if there are several letters at the beginning of the numeration but
            # they are not roman letters, it is not accepted.
            if re.match(r"^[\p{L}]{2,}$", numbering) and not re.match(r"^[ivxlIVXL]{1,4}", numbering):
                return False
            return True

        if not content:
            content = self.content
        # matching the line with the regexp (+ handling TimeOut exceptions )
        logger.debug(f"try potential section {self} content {content}")
        try:
            m = SECTION_RE.match(content, timeout=1)

        except Exception:
            logger.exception(f"Timeout on SECTION_RE regexp on content {content}")
            m = None
        logger.debug(f"Page {self.page} test section on {content} => {m}")
        if m:
            logger.debug(
                f"<{m.group('sep')=}> <{m.group('sep_spaces')=}> <{m.group('intro')=}> <{m.group('post')=}> \
                    <{m.group('numbering')=}> <{m.group('page')=}> <{m.group('sep_spaces')=}> <{m.group('dot')=}>"
            )
        # checking if the match has enough valid groups to either be a Section or a Secindex.
        # It should have at least acceptable intro, numbering and separators
        if m and (
            m.group("sep")
            or (m.group("intro") and acceptable_numbering_with_intro(m.group("numbering")))
            or m.group("post")
            or (m.group("page") and m.group("dot"))
            or self.acceptable_numbering(m.group("numbering"))
        ):
            logger.debug(
                f"acceptable section numbering={m.group('numbering')} is_acceptable={self.acceptable_numbering(m.group('numbering'))}"  # noqa E501
            )
            # then, we can further check if the Section was already seen in the table of content in the SecIndex format.
            # if this is the first time we iterate over the document, this condition is ignored.
            if not doc or doc.is_section_in_index(Section(m)):
                logger.debug(f"Page {self.page} potential section {content}")
                return m
        return None

    def is_potential_section(self, doc=None):
        """Checks if the line could be a section title or a table-of-contents entry.

        Nothing is returned: when the line qualifies, `self.kind` is set to a
        `SecIndex` or a `Section` built from the match.

        NOTE(review): `doc` is accepted but not forwarded to `is_potential_section_aux()`;
        confirm whether this is intended.
        """
        m = self.is_potential_section_aux()
        logger.debug(f"potential section HERE0 {m=}")
        # if m:
        #    logger.debug(f"got potential section {m.group('sep')=} {m.group('intro')=}
        #    {m.group('numbering')=} => {self.acceptable_numbering(m.group('numbering'))}")

        # We first check there are acceptable groups
        if m and (
            m.group("sep")
            or m.group("intro")
            or self.acceptable_numbering(m.group("numbering"))
            or (m.group("page") and m.group("dot"))
        ):
            logger.debug("potential section HERE1")

            # If so, we first check for a SecIndex : if there are dots and a page number,
            # it's probably an entry in a table of contents.
            if m.group("page") and m.group("dot") and len(m.group("dot")) > 2 and len(m.group("sep_spaces")):
                # tdm
                logger.debug(
                    f"Page {self.page} TDM <{self.content}> intro={m.group('intro')} "
                    f"numbering={m.group('numbering')} sep={m.group('sep')} \
                        sep_space={m.group('sep_spaces')} title={m.group('title')}"
                )
                self.kind = SecIndex(m)

            # If not, we check for a Section. There should be at least some spaces separating
            # the numbering from the body of text.
            elif not m.group("dot") and m.group("sep_spaces"):
                logger.debug(
                    f"Page {self.page} SECTION <{self.content}> intro={m.group('intro')} "
                    f"numbering={m.group('numbering')} sep={m.group('sep')} title={m.group('title')}"
                )
                self.kind = Section(m)
            else:
                logger.debug("potential section REJECTED")
        # logger.debug(f"{self}")

    def is_potential_other_aux(self, content=None):
        """Filters the regexp with conditions before returning it to is_potential_other_index(), otherwise returns None.

        Args:
            content: the text to test; defaults to the content of the line.

        Returns:
            (regex.Match): a regex Match object, or None.
        """

        if not content:
            content = self.content
        # logger.debug(f"Page {self.page} test section on {content}")
        # matching the line with the regexp (+ handling TimeOut exceptions )
        try:
            m = OTHER_RE.match(content, timeout=1)
        except Exception:
            logger.exception("regexp timeout")
            m = None
        # checking if the match has enough valid groups to be a caption.
        # It should not have roman numbers in the intro and have a page number and dots
        if (
            m
            and not re.match(r"^[IVXLC]+\.?$", m.group("intro"))
            and m.group("page")
            and m.group("dot")
            # exclude some frequent words in titles that could be confused with numbering
            and m.group("numbering").lower() not in {"du", "de", "le", "la", "et", "on", "il", "se", "ne", "un", "n.d."}
        ):
            logger.debug(f"Page {self.page} potential other index {content}")
            return m
        else:
            return None

    def check_with_other_index(self, index=None):
        """Checks whether a Line is a caption by comparing its intro and numbering to those of an index entry.
            If so, the Line becomes a caption (`Other`).

        Returns:
            (regex.Match): a regex Match object, or None.

        """
        content = self.content
        # logger.debug(f"Page {self.page} test section on {content}")
        # same timeout protection as the other regexp matchers of this class
        try:
            m = OTHER_RE.match(content, timeout=1)
        except Exception:
            logger.exception("regexp timeout")
            m = None
        if m and m.group("intro").lower() == index.kind.intro.lower() and m.group("numbering") == index.kind.numbering:
            self.kind = Other(m)
            logger.debug(f"Page {self.page} {self} caption for {index}")
            return m
        else:
            return None

    def check_with_section_index(self, index=None):
        """Compares a Line with another SecIndex Line found in a TOC by normalizing their
        contents and checking if the components are the same.
        If so, the Line becomes a Section.

        Returns:
            (regex.Match): a regex Match object, or None.
        """
        content = self.content
        # logger.debug(f"Page {self.page} test section on {content}")
        try:
            m = (
                SECTION_RE.match(content, timeout=1)
                if index.kind.numbering
                else SPECIAL_SECTION_RE.match(content, timeout=1)
            )
        except Exception:
            logger.exception("regular timeout")
            m = None
        # checking if the Line and the SecIndex have the same numberings and the same acceptable titles
        # NOTE(review): `m.groups("title")` is the tuple of ALL groups (with "title" as the default
        # for unmatched ones), so the length test below does not look at the title itself.
        # `len(m.group("title")) > 1` was possibly intended - confirm before changing, since
        # title-less sections currently pass this test.
        if (
            m
            and (not index.kind.numbering or m.group("numbering") == index.kind.numbering)
            and len(m.groups("title")) > 1
            and normalize_content(m.group("title")) in normalize_content(index.kind.title)
        ):
            self.kind = Section(m)
            logger.debug(f"Page {self.page} {self} section for {index}")
            return m
        else:
            return None

    def light_check_with_section_index(self, index=None):
        """Behaves like `check_with_section_index()` but with less constraints: the normalized
        content of the line only has to be equal to the normalized title of the index entry.

            If so, the Line becomes a Section (with the numbering and intro of the index entry).

        Returns:
            (regex.Match): a regex Match object, or None.

        """
        content = self.content
        # logger.debug(f"Page {self.page} test section on {content}")
        if normalize_content(content) == normalize_content(index.kind.title):
            try:
                m = SPECIAL_SECTION_RE.match(index.kind.title, timeout=1)
            except Exception:
                logger.exception("regex timout")
                m = None
            if m:
                self.kind = Section(m)
                self.kind.numbering = index.kind.numbering
                self.kind.intro = index.kind.intro
                logger.debug(f"Page {self.page} {self} light section for {index}")
            return m
        else:
            return None

    def is_potential_other_index(self, constraints=None):
        """Checks whether a line is an OtherIndex.
        If so, the Line becomes an OtherIndex.

        `constraints` is currently unused.
        """
        # we skip if the Line has already been attributed a marker.
        if self.kind:
            return
        # first filter
        m = self.is_potential_other_aux()
        # second filter : there should at least be a page number.
        if m and m.group("page"):
            self.kind = OtherIndex(m)

    def delete(self):
        """Deletes the line and updates the `next` and `prev` attributes of surrounding lines."""
        if self.next:
            self.next.prev = self.prev
        if self.prev:
            self.prev.next = self.next
        self.prev = None
        self.next = None

    def join_next(self):
        """Joins a line with the next line. Updates the new `Line` object attributes
        (content, raws, bbox, links and prev/next pointers)."""
        next_line: Line = self.next
        logger.debug(f"Join next {self} with {next_line}")
        self.content += self._joining_spaces(next_line) + next_line.content
        # keep raw of joined line
        self.raws.append(next_line.raw)
        # adjust bounding box
        self.raw["bbox"] = (self.x0, min(self.y0, next_line.y0), next_line.x1, max(self.y1, next_line.y1))
        self.links += next_line.links
        # adjust prev and next pointers
        self.next = next_line.next
        if next_line.next is not None:
            next_line.next.prev = self
        next_line.next = None
        next_line.prev = None
        # logger.debug(f"update {self}")
        self.update()
        # logger.debug(f"done updating {self}")

    @property
    def next_close_line(self):
        """Returns the next line if on same page and same column

        Returns:
            (Line) : the next line, or None.
        """
        next_line = self.next
        if next_line and next_line.page == self.page and next_line.where == self.where:
            return next_line
        else:
            return None

    def vdelta(self, other=None):
        """Computes the vertical delta between a line and another line
        (by default the closest next line, see `next_close_line`).

        Returns:
            (int): the vertical delta, or 10000 when there is no line to compare with
        """
        if other is None:
            other = self.next_close_line
        if other:
            return other.base_y - self.base_y
        else:
            return 10000

    @property
    def normalize(self):
        """Normalizes the text of a line : removes everything that is not a letter and converts to lowercase

        Returns:
            (str): the normalized text of the line
        """
        return normalize_content(self.content)

`base_y` `property` ⚓︎

The PYMUPDF y coordinate of the origin.

`bbox` `property` ⚓︎

The PYMUPDF bbox.

`content = content` `instance-attribute` ⚓︎

The textual content of the line.

`height` `property` ⚓︎

The difference between y1 and y0.

`id = uuid.uuid4()` `instance-attribute` ⚓︎

The unique identifier for the line.

`kind = None` `instance-attribute` ⚓︎

`links = []` `instance-attribute` ⚓︎

The links provided by PyMuPDF, if any.

`most_common_styles` `property` ⚓︎

Computes the style that is used on most portions of the text (joining the spans and summing their lengths).

Returns:

Type	Description
`dict`	a dictionary containing the most common styles (font, color, size and flags.)

`next: Optional[Line] = None` `instance-attribute` ⚓︎

`next_close_line` `property` ⚓︎

return next line if on same page and same column

Returns:

Type	Description
`Line`	the next line, or `None` if it is not on the same page and in the same column.

`next_spacing` `property` ⚓︎

Computes the distance between a line and the next line.

Returns:

Type	Description
`int`	the distance or spacing between a line and the next line

`normalize` `property` ⚓︎

Normalizes the text of a line : removes everything that is not a letter and converts to lowercase

Returns:

Type	Description
`str`	the normalized text of the line

`orientation` `property` ⚓︎

The PYMUPDF direction/orientation of the line.

`origin` `property` ⚓︎

The PYMUPDF origin.

`page = page` `instance-attribute` ⚓︎

`prev: Optional[Line] = None` `instance-attribute` ⚓︎

`prev_spacing` `property` ⚓︎

Computes the distance between a line and the previous line.

Returns:

Type	Description
`int`	the distance or spacing between a line and the previous line

`raw = raw` `instance-attribute` ⚓︎

The PyMuPdf raw dictionary value for the line.

`raws = [self.raw]` `instance-attribute` ⚓︎

`where = None` `instance-attribute` ⚓︎

`width` `property` ⚓︎

The difference between x1 and x0.

`x0` `property` ⚓︎

The PYMUPDF x0

`x1` `property` ⚓︎

The PYMUPDF x1.

`y0` `property` ⚓︎

The PYMUPDF y0

`y1` `property` ⚓︎

The PYMUPDF y1.

`__init__(content, raw, page=None, doc=None)` ⚓︎

Source code in pdfstruct/line.py

def __init__(self, content, raw, page=None, doc=None):
    # content = re.sub(r'[●••·◘○◙‣⁃⁌⁍■□∙]','*',content)
    # -- identity and location of the line
    self.id = uuid.uuid4()
    """The unique identifier for the line."""
    self.page = page
    self.where = None

    # -- text and PyMuPdf data
    self.content = content
    """The textual content of the line."""
    self.raw = raw
    """The PyMuPdf raw dictionary value for the line."""
    self.raws = [self.raw]
    self.links = []
    """The links provided by PyMuPdf,if any."""

    # -- marker and neighbouring lines, all unset at creation
    self.kind = None
    self.prev: Optional[Line] = None
    self.next: Optional[Line] = None

    # classify the line right away from its textual content
    self.update(doc=doc)

`__str__()` ⚓︎

Displays the text according to a color scheme and the localization of the line (column). The colors depend on the type of marker. (ex: Section is red)

Returns:

Type	Description
`str`	the text content of the line

Source code in pdfstruct/line.py

def __str__(self):
    """Renders the line as `[where] text`.

    When the line carries a marker (`kind`), the text is first passed through the
    marker's `display()` (color scheme depending on the marker, ex: Section is red).

    Returns:
        (str): the text content of the line, prefixed with its localization (column)
    """
    # TODO:>Display Merge __str__ and display
    text = self.kind.display(self.content) if self.kind else self.content
    return f"[{self.where}] {text}"

`acceptable_numbering(numbering)` ⚓︎

Compares the line with a set of regular expressions that are deemed to be acceptable numerations (for more details, see the page about Patterns).

Returns:

Type	Description
`bool`	if True, the line has an acceptable numeration at the beginning of the text.

Source code in pdfstruct/line.py

def acceptable_numbering(self, numbering):
    """Compares a numbering with a set of regular expressions that are deemed to be acceptable
    numerations (for more details, see the page about Patterns).

    Returns:
        (bool): if True, the numbering matches one of the acceptable numeration patterns.
    """
    # TODO: ask Eric if these patterns could be stored somewhere else - like in an object or directly in PATTERNS.

    # logger.debug(f"Page {self.page} test numbering {numbering}")
    accepted_patterns = (
        # complex numbering without internal spaces
        r"^(\w+[.\p{Pd}/])+\w+[.\p{Pd}/]?$",
        # complex numbering possibly with internal spaces but with a final post element (such as . or -)
        r"^(\w+[.\p{Pd}/]\s+)+\w+[.\p{Pd}/]$",
        # complex numbering with no internal .
        # to avoid some confusions with float numbers
        # (this pattern used to be tested twice in a row; the duplicate was dropped)
        r"^(\w+\s*[\p{Pd}]\s*)+\w+[.\p{Pd}/]?$",
        # simple numeric numbering with a final post element
        # include a confusion case with numbers followed by . (eg. 123.)
        r"^(\d{1,3}|[ivxl]{1,4}|[IVXL]{1,4})[.\p{Pd}/]$",
        # numbering terminated by ) (such as 'b)')
        r"^(\w+\s*[.\p{Pd}]\s*)*(?:[a-zA-Z]|\d+)\.?\s*\)$",
    )
    return any(re.match(pattern, numbering) for pattern in accepted_patterns)

`check_with_other_index(index=None)` ⚓︎

Checks whether a Line is a caption by comparing it to a normalized TOC entry. If so, the Line becomes a caption (Other).

Returns:

Type	Description
`Match`	a regex Match object.

Source code in pdfstruct/line.py

def check_with_other_index(self, index=None):
    """Checks whether a Line is a caption by comparing its intro and numbering to those of an index entry.
        If so, the Line becomes a caption (`Other`).

    Args:
        index: the index line to compare with; its `kind.intro` and `kind.numbering` are read.

    Returns:
        (regex.Match): a regex Match object, or None when the line does not match
            (or when the regexp could not be evaluated).

    """
    content = self.content
    # logger.debug(f"Page {self.page} test section on {content}")
    # same timeout protection as the other regexp matchers (see is_potential_other_aux)
    try:
        m = OTHER_RE.match(content, timeout=1)
    except Exception:
        logger.exception("regexp timeout")
        m = None
    # intro is compared case-insensitively, numbering must be strictly identical
    if m and m.group("intro").lower() == index.kind.intro.lower() and m.group("numbering") == index.kind.numbering:
        self.kind = Other(m)
        logger.debug(f"Page {self.page} {self} caption for {index}")
        return m
    else:
        return None

`check_with_section_index(index=None)` ⚓︎

Compares a Line with another SecIndex Line found in a TOC by normalizing their contents and checking if the components are the same. If so, the Line becomes a Section.

Returns:

Type	Description
`Match`	a regex Match object.

Source code in pdfstruct/line.py

def check_with_section_index(self, index=None):
    """Tries to recognize the line as the Section announced by a SecIndex line of a TOC.

    The line is matched with SECTION_RE when the index entry has a numbering and with
    SPECIAL_SECTION_RE otherwise; the numbering (if any) must be identical and the
    normalized title of the line must be contained in the normalized title of the entry.
    If so, the Line becomes a Section.

    Returns:
        (regex.Match): a regex Match object, or None.
    """
    text = self.content
    # logger.debug(f"Page {self.page} test section on {content}")
    try:
        if index.kind.numbering:
            m = SECTION_RE.match(text, timeout=1)
        else:
            m = SPECIAL_SECTION_RE.match(text, timeout=1)
    except Exception:
        logger.exception("regular timeout")
        m = None

    if not m:
        return None
    # the numbering only matters when the index entry has one
    if index.kind.numbering and m.group("numbering") != index.kind.numbering:
        return None
    # NOTE(review): `m.groups("title")` is the tuple of all groups, not the title text - confirm intent
    if not len(m.groups("title")) > 1:
        return None
    if normalize_content(m.group("title")) not in normalize_content(index.kind.title):
        return None

    self.kind = Section(m)
    logger.debug(f"Page {self.page} {self} section for {index}")
    return m

`delete()` ⚓︎

Deletes the line and updates the next and prev attributes of surrounding lines.

Source code in pdfstruct/line.py

def delete(self):
    """Unlinks the line: its neighbours are connected to each other and its own
    `prev` and `next` attributes are reset to None."""
    before, after = self.prev, self.next
    if after:
        after.prev = before
    if before:
        before.next = after
    self.prev = self.next = None

`display()` ⚓︎

Displays the text according to a color scheme. The colors depend on the type of marker. (ex: Section is red)

Returns:

Type	Description
`str`	the text content of the line

Source code in pdfstruct/line.py

def display(self):
    """Gives the text of the line, passed through the `display()` of its marker when it has one
    (color scheme depending on the type of marker, ex: Section is red).

    Returns:
        (str): the text content of the line
    """
    # TODO:>Display: Make consistant all the display methods (sometimes return, sometimes print)
    if self.kind:
        return self.kind.display(self.content)
    return self.content

`display_html()` ⚓︎

Displays the text in HTML format.

Returns:

Type	Description
`str`	the HTML tagged text of the line.

Source code in pdfstruct/line.py

def display_html(self):
    """Render the line as an HTML element.

    A line with a marker uses the marker's tag and the `HTML_COLORS` entry for the
    marker's color; a line without a marker is rendered as a black `p` element.

    Returns:
        (str): the HTML tagged text of the line.
    """
    marker = self.kind
    if marker:
        tag, styles = marker.tag, [("style", HTML_COLORS[marker.color])]
    else:
        tag, styles = "p", [("style", "color:black;")]
    return make_html(self.content, tag, attval=styles)

`is_potential_other_aux(content=None)` ⚓︎

Filters the regexp with conditions before returning it to is_potential_other_index(), otherwise returns None.

Returns:

Type	Description
`Match`	a regex Match object.

Source code in pdfstruct/line.py

def is_potential_other_aux(self, content=None):
    """Match the text against `OTHER_RE` and keep the match only if it passes the filters.

    Args:
        content: text to test; when empty or None the line's own content is used.

    Returns:
        (regex.Match): the match object, or None when the text does not match or is filtered out.
    """
    content = content or self.content
    # any exception raised while matching (typically a timeout) is logged and treated as "no match"
    try:
        found = OTHER_RE.match(content, timeout=1)
    except Exception:
        logger.exception("regexp timeout")
        found = None
    if not found:
        return None
    # the intro must not be a roman number
    if re.match(r"^[IVXLC]+\.?$", found.group("intro")):
        return None
    # a page number and dots are both required
    if not (found.group("page") and found.group("dot")):
        return None
    # exclude some frequent words in titles that could be confused with numbering
    stop_words = {"du", "de", "le", "la", "et", "on", "il", "se", "ne", "un", "n.d."}
    if found.group("numbering").lower() in stop_words:
        return None
    logger.debug(f"Page {self.page} potential other index {content}")
    return found

`is_potential_other_index(constraints=None)` ⚓︎

checks whether a line is an OtherIndex. If so, the Line becomes an OtherIndex.

Source code in pdfstruct/line.py

def is_potential_other_index(self, constraints=None):
    """Mark the line as an `OtherIndex` when its content looks like one.

    Nothing happens when the line already carries a marker. Otherwise the match
    returned by `is_potential_other_aux()` is used, and it must contain a page number.

    Args:
        constraints: not used by this method.
    """
    if not self.kind:
        # first filter
        found = self.is_potential_other_aux()
        # second filter : there should at least be a page number.
        if found and found.group("page"):
            self.kind = OtherIndex(found)

`is_potential_section(doc=None)` ⚓︎

Checks whether the line could be a section title, possibly with some constraints. If so, the line's kind is set to SecIndex (table-of-contents entry) or Section.

Returns:

Type	Description
`None`	nothing is returned; the result is stored in the line's `kind` attribute

Source code in pdfstruct/line.py

def is_potential_section(self, doc=None):
    """Classify the line as a table-of-contents entry (`SecIndex`) or a section title (`Section`).

    The candidate match comes from `is_potential_section_aux()`. When it carries
    acceptable groups, `self.kind` is set to `SecIndex` (page number, a "dot" group
    longer than two characters and a non-empty "sep_spaces" group) or to `Section`
    (no dots but some separating spaces). Otherwise `self.kind` is left untouched.

    Args:
        doc: accepted but not used by this method.

    Returns:
        None: the outcome is stored in `self.kind`.
    """
    # NOTE(review): `doc` is not forwarded to is_potential_section_aux(), although that
    # helper accepts it - confirm whether this is intentional.
    m = self.is_potential_section_aux()
    logger.debug(f"potential section HERE0 {m=}")
    if not m:
        return

    # We first check there are acceptable groups
    has_acceptable_groups = (
        m.group("sep")
        or m.group("intro")
        or self.acceptable_numbering(m.group("numbering"))
        or (m.group("page") and m.group("dot"))
    )
    if not has_acceptable_groups:
        return
    logger.debug("potential section HERE1")

    dots = m.group("dot")
    spacing = m.group("sep_spaces")
    if m.group("page") and dots and len(dots) > 2 and len(spacing):
        # dots followed by a page number: probably an entry in a table of contents.
        # The whitespace after the backslash below is part of the logged message.
        logger.debug(
            f"Page {self.page} TDM <{self.content}> intro={m.group('intro')} "
            f"numbering={m.group('numbering')} sep={m.group('sep')} \
                    sep_space={m.group('sep_spaces')} title={m.group('title')}"
        )
        self.kind = SecIndex(m)
    elif not dots and spacing:
        # no dots: a Section, provided some spaces separate the numbering from the text
        logger.debug(
            f"Page {self.page} SECTION <{self.content}> intro={m.group('intro')} "
            f"numbering={m.group('numbering')} sep={m.group('sep')} title={m.group('title')}"
        )
        self.kind = Section(m)
    else:
        logger.debug("potential section REJECTED")

`is_potential_section_aux(content=None, doc=None)` ⚓︎

Filters the regexp with conditions before returning it to is_potential_section(), otherwise returns None.

Returns:

Type	Description
`Match`	a regex match object, or None when the line is filtered out.

Source code in pdfstruct/line.py

def is_potential_section_aux(self, content=None, doc=None):
    """Match the text against `SECTION_RE` and keep the match only if it looks like a section.

    Args:
        content: text to test; when empty or None the line's own content is used.
        doc: optional document; when given, the match is only kept if
            `doc.is_section_in_index()` accepts the corresponding `Section`.

    Returns:
        (regex.Match): the match object, or None when the text is rejected.
    """

    def acceptable_numbering_with_intro(numbering):
        # a numbering made of two or more letters is only accepted when it
        # starts like a roman number
        only_letters = re.match(r"^[\p{L}]{2,}$", numbering)
        return not (only_letters and not re.match(r"^[ivxlIVXL]{1,4}", numbering))

    content = content or self.content
    logger.debug(f"try potential section {self} content {content}")
    # any exception raised while matching (typically a timeout) is logged and treated as "no match"
    try:
        m = SECTION_RE.match(content, timeout=1)
    except Exception:
        logger.exception(f"Timeout on SECTION_RE regexp on content {content}")
        m = None
    logger.debug(f"Page {self.page} test section on {content} => {m}")
    if not m:
        return None

    # the whitespace after the backslash below is part of the logged message
    logger.debug(
        f"<{m.group('sep')=}> <{m.group('sep_spaces')=}> <{m.group('intro')=}> <{m.group('post')=}> \
                <{m.group('numbering')=}> <{m.group('page')=}> <{m.group('sep_spaces')=}> <{m.group('dot')=}>"
    )
    # the match needs enough valid groups to be either a Section or a SecIndex:
    # at least an acceptable intro, numbering or separators
    enough_groups = (
        m.group("sep")
        or (m.group("intro") and acceptable_numbering_with_intro(m.group("numbering")))
        or m.group("post")
        or (m.group("page") and m.group("dot"))
        or self.acceptable_numbering(m.group("numbering"))
    )
    if not enough_groups:
        return None
    logger.debug(
        f"acceptable section numbering={m.group('numbering')} is_acceptable={self.acceptable_numbering(m.group('numbering'))}"  # noqa E501
    )
    # with a document at hand, the Section must already be known from the table of
    # contents; without one (first pass over the document) this check is skipped.
    if doc and not doc.is_section_in_index(Section(m)):
        return None
    logger.debug(f"Page {self.page} potential section {content}")
    return m

`join_next()` ⚓︎

Joins a line with the next line. Updates the new Line object attributes.

Source code in pdfstruct/line.py

def join_next(self):
    """Joins a line with the next line. Updates the new `Line` object attributes.

    The next line's content, raw dictionary and links are absorbed into this line,
    the bounding box is enlarged, the next line is unlinked from the chain and the
    markers are recomputed through `update()`.

    Does nothing when there is no next line.
    """
    next_line: Line = self.next
    if next_line is None:
        # last line of the chain: there is nothing to join with
        return
    logger.debug(f"Join next {self} with {next_line}")
    # number of separating spaces derived from the characters-per-width ratio;
    # a zero total width would divide by zero, so fall back to a single space
    total_chars = len(self.content) + len(next_line.content)
    total_width = self.width + next_line.width
    density = int(total_chars / total_width) if total_width else 0
    self.content += " " * max(1, density) + next_line.content
    # keep raw of joined line
    self.raws.append(next_line.raw)
    # adjust bounding box: left edge of this line, right edge of the next one
    self.raw["bbox"] = (self.x0, min(self.y0, next_line.y0), next_line.x1, max(self.y1, next_line.y1))
    self.links += next_line.links
    # adjust prev and next pointers
    self.next = next_line.next
    if next_line.next is not None:
        next_line.next.prev = self
    next_line.next = None
    next_line.prev = None
    self.update()

`light_check_with_section_index(index=None)` ⚓︎

Behaves like check_with_section_index() but with fewer constraints.

If the line matches the index, the Line becomes a Section.

Returns:

Type	Description
`Match`	a regex Match object.

Source code in pdfstruct/line.py

def light_check_with_section_index(self, index=None):
    """Lighter variant of `check_with_section_index()`.

    The line is accepted when its normalized content equals the normalized title of
    the index. In that case the index title is matched against `SPECIAL_SECTION_RE`
    and, on success, the Line becomes a Section that takes the numbering and intro
    of the index.

    Returns:
        (regex.Match): a regex Match object, or None when the line is not accepted.

    """
    if normalize_content(self.content) != normalize_content(index.kind.title):
        return None
    # any exception raised while matching (typically a timeout) is logged and treated as "no match"
    try:
        found = SPECIAL_SECTION_RE.match(index.kind.title, timeout=1)
    except Exception:
        logger.exception("regex timout")
        found = None
    if found:
        self.kind = Section(found)
        # the numbering and intro come from the index entry, not from the match
        self.kind.numbering = index.kind.numbering
        self.kind.intro = index.kind.intro
        logger.debug(f"Page {self.page} {self} light section for {index}")
    return found

`potential_merge(other, doc=None, col_sep=None)` ⚓︎

Checks whether two lines could be merged into one.

Returns:

Type	Description
`bool`	if True, the two lines can be merged safely.

Source code in pdfstruct/line.py

def potential_merge(self, other, doc=None, col_sep=None):
    """Checks whether two lines could be merged into one.

    Args:
        other (Line): the candidate line to append to this one.
        doc: optional document, forwarded to `is_potential_section_aux()`.
        col_sep: optional x-coordinate, or list of x-coordinates, of column separators;
            two lines lying on opposite sides of a separator are never merged.

    Returns:
        (bool): if True, the two lines can be merged safely.
    """
    # never merge across a column separator
    if col_sep is not None:
        separators = col_sep if isinstance(col_sep, list) else [col_sep]
        if any(
            (self.x1 < sep and other.x0 > sep) or (self.x0 > sep and other.x1 < sep)
            for sep in separators
        ):
            return False
    merged_length = len(self.content) + len(other.content)
    if merged_length > 200:
        logger.warning(f"Aborting potential merge because of very long content {merged_length}")
        return False
    # index entries are never extended
    if type(self.kind) in {SecIndex, OtherIndex}:
        return False
    # fragments sharing (almost) the same baseline
    if (
        abs(self.base_y - other.base_y) < 4
        and self.x1 < other.x0 + 2  # tolerates a low overlapping
        and (
            other.x0 < self.x1 + 10
            # wider gap allowed after a match of the "list_bug" pattern
            or (other.x0 < self.x1 + 30 and PATTERNS["list_bug"].match(self.content))
            or self.is_potential_section_aux(content=self.content + " " + other.content, doc=doc)
        )
        and self.height < 30
    ):
        return True
    # multi-line section: the other line ends with a dot leader and a page number
    if (
        type(self.kind) is Section
        and (type(other.kind) is not SecIndex or not self.acceptable_numbering(other.kind.numbering))
        and re.search(r"[._\p{Pd}]{5,}\s*\d+$", other.content)
    ):
        return True
    # multi-line other index: both lines unmarked, and the joined text passes the other-index filter
    if (
        not self.kind
        and type(other.kind) not in {Section, SecIndex, OtherIndex}
        and re.search(r"[._\p{Pd}]{5,}\s*\d+$", other.content)
        and self.is_potential_other_aux(self.content + " " + other.content)
    ):
        logger.debug(f"potential merge {self.kind=} {other.kind=}")
        return True
    # The former rule merging on a bare "list_bug" match was disabled (`if False and ...`)
    # and has been dropped. Return an explicit bool instead of falling through to None.
    return False

`same_style(other)` ⚓︎

Compares the styles of two lines

Returns:

Type	Description
`bool`	if True, the two lines have the same styles

Source code in pdfstruct/line.py

def same_style(self, other):
    """Tell whether this line and `other` share the same most common styles.

    Returns:
        (bool): if True, the two lines have the same styles"""
    return self.most_common_styles == other.most_common_styles

`try_merge(other, doc=None, col_sep=None)` ⚓︎

Merges two lines into one and updates the new Line object.

Returns:

Type	Description
`bool`	if True, the merge has been done.

Source code in pdfstruct/line.py

def try_merge(self, other, doc=None, col_sep=None):
    """Append `other` to this line when `potential_merge()` allows it.

    On success the content, the raw dictionaries and the bounding box of this line
    are extended with those of `other`, and the markers are recomputed through `update()`.

    Returns:
        (bool): if True, the merge has been done.
    """
    # TODO: Ask Eric the difference with join_next()
    if not self.potential_merge(other, doc=doc, col_sep=col_sep):
        return False
    logger.debug(
        f"*** Page {self.page} merging {self.bbox}:<{self.content}> with {other.bbox}:<{other.content}>"
    )
    # number of separating spaces, derived from the characters-per-width ratio (at least one)
    # NOTE(review): raises ZeroDivisionError when both widths add up to 0 - confirm it cannot happen
    char_count = len(self.content) + len(other.content)
    gap = max(1, int(char_count / (self.width + other.width)))
    self.content += " " * gap + other.content
    self.raws.append(other.raw)
    # enlarge the bounding box: left edge of this line, right edge of the other one
    # NOTE(review): links and prev/next pointers are left untouched here - presumably
    # handled by the caller; verify.
    top = min(self.y0, other.y0)
    bottom = max(self.y1, other.y1)
    self.raw["bbox"] = (self.x0, top, other.x1, bottom)
    self.update()
    return True

`update(doc=None)` ⚓︎

Checks if the line is a potential Section and a potential OtherIndex.

Source code in pdfstruct/line.py

def update(self, doc=None):
    """Checks if the line is a potential Section and a potential OtherIndex.

    Calls `is_potential_section()` and then `is_potential_other_index()` on the
    line's current content.

    Args:
        doc: optional document, passed through to `is_potential_section()`.
    """
    # section detection goes first; the other-index check is attempted afterwards
    self.is_potential_section(doc=doc)
    self.is_potential_other_index()

`vdelta(other=None)` ⚓︎

Computes the vertical delta between a line and the closest next line.

Returns:

Type	Description
`int`	the vertical delta

Source code in pdfstruct/line.py

def vdelta(self, other=None):
    """Vertical distance between the baseline of this line and that of another line.

    Args:
        other: the line to compare with; when None, `self.next_close_line` is used.

    Returns:
        (int): the vertical delta (`other.base_y - self.base_y`), or 10000 when
            there is no line to compare with.
    """
    target = self.next_close_line if other is None else other
    if not target:
        # no line to compare with: report a very large delta
        return 10000
    return target.base_y - self.base_y

Line

base_y property ⚓︎

bbox property ⚓︎

content = content instance-attribute ⚓︎

height property ⚓︎

id = uuid.uuid4() instance-attribute ⚓︎

kind = None instance-attribute ⚓︎

links = [] instance-attribute ⚓︎

most_common_styles property ⚓︎

next: Optional[Line] = None instance-attribute ⚓︎

next_close_line property ⚓︎

next_spacing property ⚓︎

normalize property ⚓︎

orientation property ⚓︎

origin property ⚓︎

page = page instance-attribute ⚓︎

prev: Optional[Line] = None instance-attribute ⚓︎

prev_spacing property ⚓︎

raw = raw instance-attribute ⚓︎

raws = [self.raw] instance-attribute ⚓︎

where = None instance-attribute ⚓︎

width property ⚓︎

x0 property ⚓︎

x1 property ⚓︎

y0 property ⚓︎

y1 property ⚓︎

__init__(content, raw, page=None, doc=None) ⚓︎

__str__() ⚓︎

acceptable_numbering(numbering) ⚓︎

check_with_other_index(index=None) ⚓︎

check_with_section_index(index=None) ⚓︎

delete() ⚓︎

display() ⚓︎

display_html() ⚓︎

is_potential_other_aux(content=None) ⚓︎

is_potential_other_index(constraints=None) ⚓︎

is_potential_section(doc=None) ⚓︎

is_potential_section_aux(content=None, doc=None) ⚓︎

join_next() ⚓︎

light_check_with_section_index(index=None) ⚓︎

potential_merge(other, doc=None, col_sep=None) ⚓︎

same_style(other) ⚓︎

try_merge(other, doc=None, col_sep=None) ⚓︎

update(doc=None) ⚓︎

vdelta(other=None) ⚓︎

`base_y` `property` ⚓︎

`bbox` `property` ⚓︎

`content = content` `instance-attribute` ⚓︎

`height` `property` ⚓︎

`id = uuid.uuid4()` `instance-attribute` ⚓︎

`kind = None` `instance-attribute` ⚓︎

`links = []` `instance-attribute` ⚓︎

`most_common_styles` `property` ⚓︎

`next: Optional[Line] = None` `instance-attribute` ⚓︎

`next_close_line` `property` ⚓︎

`next_spacing` `property` ⚓︎

`normalize` `property` ⚓︎

`orientation` `property` ⚓︎

`origin` `property` ⚓︎

`page = page` `instance-attribute` ⚓︎

`prev: Optional[Line] = None` `instance-attribute` ⚓︎

`prev_spacing` `property` ⚓︎

`raw = raw` `instance-attribute` ⚓︎

`raws = [self.raw]` `instance-attribute` ⚓︎

`where = None` `instance-attribute` ⚓︎

`width` `property` ⚓︎

`x0` `property` ⚓︎

`x1` `property` ⚓︎

`y0` `property` ⚓︎

`y1` `property` ⚓︎

`__init__(content, raw, page=None, doc=None)` ⚓︎

`__str__()` ⚓︎

`acceptable_numbering(numbering)` ⚓︎

`check_with_other_index(index=None)` ⚓︎

`check_with_section_index(index=None)` ⚓︎

`delete()` ⚓︎

`display()` ⚓︎

`display_html()` ⚓︎

`is_potential_other_aux(content=None)` ⚓︎

`is_potential_other_index(constraints=None)` ⚓︎

`is_potential_section(doc=None)` ⚓︎

`is_potential_section_aux(content=None, doc=None)` ⚓︎

`join_next()` ⚓︎

`light_check_with_section_index(index=None)` ⚓︎

`potential_merge(other, doc=None, col_sep=None)` ⚓︎

`same_style(other)` ⚓︎

`try_merge(other, doc=None, col_sep=None)` ⚓︎

`update(doc=None)` ⚓︎

`vdelta(other=None)` ⚓︎