Patterns⚓︎

Patterns are used to match the text of a Line with a type of Marker. They are stored in a dictionary of regular expressions built with the Python regex library.

Tip

For more info on regular expressions, you can check this useful website.

To test some of the expressions, you can use this online tool.

The dictionary stores regular expression components, which are combined to form more complex patterns. For example, the title component is used to detect a Section, a SecIndex or an OtherIndex.

Section and SecIndex Patterns⚓︎

Components⚓︎

A Section or SecIndex title is composed of optional introductory words (e.g. "Chapter", "Part n°", ...) and a body.

# regular expression for a usual Section or a SecIndex
    ^
    (?P<intro>
      (?:(?i:section|partie|chapitre|annexe|appendix|appendice|volume|tome|pièce|piece)
      (\s+[Nn][°º])?
      )?
    )
    \s*
    {{body}}

The body of a title usually contains two parts: a numbering part with various separators (e.g. "II.2-a", "A.2.c", ...) and a textual part, the actual title.

In the case of a SecIndex (a title in a table of contents), the body also ends with leader dots and the page number (e.g. "Part 2 - Some Title ...... 4").

# regular expression for the body of a Section or SecIndex.

 {{numbering}}
    (?P<sep_spaces>
    \s?
    {{sep}}?
    \s+
    )
    {{title}}
    (:?\s*{{dot}}\s*{{page}})?
    \s*
    $

Usage in pdfstruct⚓︎

The patterns are combined with conditions in the methods of the Line object, specifically in is_potential_section_aux() and is_potential_section().

Source code in line.Line.is_potential_section_aux

def is_potential_section_aux(self, content=None, doc=None):
    """Match a line against SECTION_RE and keep the match only if it looks like a title.

    Args:
        content: text to test; when empty or None, ``self.content`` is used.
        doc: optional document. When given, the candidate is kept only if
            ``doc.is_section_in_index(Section(m))`` accepts it.

    Returns:
        regex.Match: the match object when every filter passes, otherwise
        None (including when the regexp call raises, e.g. on timeout).
    """

    def acceptable_numbering_with_intro(numbering):
        # A numbering made only of two or more letters is refused unless it
        # begins with roman-numeral letters.
        only_letters = re.match(r"^[\p{L}]{2,}$", numbering)
        return not (only_letters and not re.match(r"^[ivxlIVXL]{1,4}", numbering))

    content = content or self.content

    # Run the regexp; any exception (typically a timeout) counts as "no match".
    logger.debug(f"try potential section {self} content {content}")
    try:
        m = SECTION_RE.match(content, timeout=1)
    except Exception:
        logger.exception(f"Timeout on SECTION_RE regexp on content {content}")
        m = None
    logger.debug(f"Page {self.page} test section on {content} => {m}")
    if not m:
        return None

    logger.debug(
        f"<{m.group('sep')=}> <{m.group('sep_spaces')=}> <{m.group('intro')=}> <{m.group('post')=}> <{m.group('numbering')=}> <{m.group('page')=}> <{m.group('sep_spaces')=}> <{m.group('dot')=}>"
    )

    # The match needs at least one strong hint to be a Section or a SecIndex:
    # a separator, an intro with a plausible numbering, a "post" symbol,
    # dots followed by a page number, or an acceptable numbering.
    numbering = m.group("numbering")
    has_title_evidence = (
        m.group("sep")
        or (m.group("intro") and acceptable_numbering_with_intro(numbering))
        or m.group("post")
        or (m.group("page") and m.group("dot"))
        or self.acceptable_numbering(numbering)
    )
    if not has_title_evidence:
        return None

    logger.debug(
        f"acceptable section numbering={m.group('numbering')} is_acceptable={self.acceptable_numbering(m.group('numbering'))}"
    )

    # When a document is supplied, the candidate must also be accepted by its
    # index lookup; without a document this check is skipped.
    if doc and not doc.is_section_in_index(Section(m)):
        return None

    logger.debug(f"Page {self.page} potential section {content}")
    return m

Source code in line.Line.is_potential_section

def is_potential_section(self, doc=None):
    """Classify the line as a SecIndex or a Section when it looks like a title.

    The candidate match comes from ``is_potential_section_aux()``. When the
    match is accepted, ``self.kind`` is set to ``SecIndex(m)`` (dots followed
    by a page number) or to ``Section(m)``; otherwise the line is left as is.

    Args:
        doc: not used by this method.
            NOTE(review): it is not forwarded to is_potential_section_aux();
            confirm whether that is intended.

    Returns:
        None: the outcome is stored in ``self.kind``.
    """
    m = self.is_potential_section_aux()
    logger.debug(f"potential section HERE0 {m=}")
    if not m:
        return

    # At least one of these groups must carry evidence of a title.
    has_acceptable_group = (
        m.group("sep")
        or m.group("intro")
        or self.acceptable_numbering(m.group("numbering"))
        or (m.group("page") and m.group("dot"))
    )
    if not has_acceptable_group:
        return

    logger.debug("potential section HERE1")

    page = m.group("page")
    dots = m.group("dot")
    sep_spaces = m.group("sep_spaces")

    # More than two dots plus a page number: probably a table-of-contents entry.
    if page and dots and len(dots) > 2 and len(sep_spaces):
        logger.debug(
            f"Page {self.page} TDM <{self.content}> intro={m.group('intro')} "
            f"numbering={m.group('numbering')} sep={m.group('sep')} sep_space={m.group('sep_spaces')} title={m.group('title')}"
        )
        self.kind = SecIndex(m)
    # No dots, and the numbering is separated from the title: a Section.
    elif not dots and sep_spaces:
        logger.debug(
            f"Page {self.page} SECTION <{self.content}> intro={m.group('intro')} "
            f"numbering={m.group('numbering')} sep={m.group('sep')} title={m.group('title')}"
        )
        self.kind = Section(m)
    else:
        logger.debug("potential section REJECTED")

Special Section Patterns⚓︎

Some Sections and SecIndexes have no numbering, only a title (e.g. "Introduction").

# special SecIndex (TOC entry)

    {{title}}
    (:?\s*{{dot}}\s*{{page}})
    \s*
    $

# special Section 

    {{title}}
    \s*
    $

Usage in Pdfstruct⚓︎

They are used to find content that may correspond to what was already found in the Table of Contents.

Source code in line.Line.check_with_section_index

def check_with_section_index(self, index=None):
    """Turn the line into a Section when it corresponds to a given TOC entry.

    The line is matched with SECTION_RE when the index entry has a numbering,
    and with SPECIAL_SECTION_RE otherwise. The line is accepted when the
    numberings are equal (if the entry has one), the matched title is longer
    than one character, and the normalized title of the line is contained in
    the normalized title of the index entry.

    Args:
        index: the SecIndex line to compare with; its ``kind.numbering`` and
            ``kind.title`` attributes are read.

    Returns:
        regex.Match: the match object when the line is accepted (in which
        case ``self.kind`` becomes ``Section(m)``), otherwise None. Any
        exception raised while matching (e.g. a timeout) is logged and
        treated as "no match".
    """
    content = self.content
    try:
        m = (
            SECTION_RE.match(content, timeout=1)
            if index.kind.numbering
            else SPECIAL_SECTION_RE.match(content, timeout=1)
        )
    except Exception:
        logger.exception("regular timeout")
        m = None
    # The line and the SecIndex must share the same numbering and a compatible title.
    if (
        m
        and (not index.kind.numbering or m.group("numbering") == index.kind.numbering)
        # group("title") is the matched title text. The previous
        # groups("title") returned the tuple of *all* groups (with "title" as
        # the default for unmatched ones), so its length only reflected the
        # pattern, never the line being tested.
        and len(m.group("title")) > 1
        and normalize_content(m.group("title")) in normalize_content(index.kind.title)
    ):
        self.kind = Section(m)
        logger.debug(f"Page {self.page} {self} section for {index}")
        return m
    return None

Source code in line.Line.light_check_with_section_index

    def light_check_with_section_index(self, index=None):
        """Turn the line into a Section when its text equals a TOC entry title.

        The normalized content of the line must equal the normalized title of
        the index entry. SPECIAL_SECTION_RE is then matched against the index
        title; on success ``self.kind`` becomes ``Section(m)`` and its
        ``numbering`` and ``intro`` are copied from the index entry.

        Returns:
            regex.Match: the match object on success, otherwise None.
        """
        if normalize_content(self.content) != normalize_content(index.kind.title):
            return None

        try:
            m = SPECIAL_SECTION_RE.match(index.kind.title, timeout=1)
        except Exception:
            logger.exception("regex timout")
            m = None

        if m:
            section = Section(m)
            self.kind = section
            # The Section is built from the title alone, so the numbering and
            # intro are taken from the index entry.
            section.numbering = index.kind.numbering
            section.intro = index.kind.intro
            logger.debug(f"Page {self.page} {self} light section for {index}")
        return m

Other and OtherIndex Patterns⚓︎

Some elements (mostly captions) resemble section titles. They may have a less strict introduction (no words such as "Chapter") but a similar body structure (e.g. "Table 1.2.3 : Some Text").

# regular expression for Other and OtherIndex objects
^
    {{intro}}
    \s*
    {{body}}

Usage in pdfstruct⚓︎

We check whether the Line is an OtherIndex with the is_potential_other_aux() and is_potential_other_index() methods. We then check whether the element appears later in the document with the check_with_other_index() method.

Source code in line.Line.is_potential_other_index

    def is_potential_other_index(self, constraints=None):
        """Mark the line as an OtherIndex when it passes the "other" filters.

        Nothing is done when the line already has a marker. Otherwise the
        match returned by ``is_potential_other_aux()`` is used, and
        ``self.kind`` is set to ``OtherIndex(m)`` when it has a page number.

        Args:
            constraints: not used by this method.

        Returns:
            None: the outcome is stored in ``self.kind``.
        """
        # Skip lines that have already been given a marker.
        if self.kind:
            return
        # First filter: the regexp and its acceptance conditions.
        m = self.is_potential_other_aux()
        # Second filter: there must at least be a page number.
        if m and m.group("page"):
            self.kind = OtherIndex(m)

Source code in line.Line.is_potential_other_aux

def is_potential_other_aux(self, content=None):
    """Match a line against OTHER_RE and keep the match only if it looks like a caption entry.

    Args:
        content: text to test; when empty or None, ``self.content`` is used.

    Returns:
        regex.Match: the match object when the intro is not an uppercase
        roman numeral, the page and dot groups are non-empty, and the
        numbering is not one of a few frequent short words; otherwise None
        (including when the regexp call raises, e.g. on timeout).
    """
    content = content or self.content

    # Run the regexp; any exception (typically a timeout) counts as "no match".
    try:
        m = OTHER_RE.match(content, timeout=1)
    except Exception:
        logger.exception("regexp timeout")
        m = None
    if not m:
        return None

    # An intro made only of roman-numeral letters is not a caption intro.
    if re.match(r"^[IVXLC]+\.?$", m.group("intro")):
        return None

    # A page number and dots are both required.
    if not (m.group("page") and m.group("dot")):
        return None

    # Frequent words in titles that could be confused with a numbering.
    words_mistaken_for_numbering = {"du", "de", "le", "la", "et", "on", "il", "se", "ne", "un", "n.d."}
    if m.group("numbering").lower() in words_mistaken_for_numbering:
        return None

    logger.debug(f"Page {self.page} potential other index {content}")
    return m

Source code in line.Line.check_with_other_index

     def check_with_other_index(self, index=None):
         """Turn the line into an Other (caption) when it matches a given TOC entry.

         The line is matched with OTHER_RE and accepted when its intro equals
         the intro of the index entry (case-insensitively) and both numberings
         are equal.

         Args:
             index: the OtherIndex line to compare with; its ``kind.intro`` and
                 ``kind.numbering`` attributes are read.

         Returns:
             regex.Match: the match object when the line is accepted (in which
             case ``self.kind`` becomes ``Other(m)``), otherwise None.
         """
         content = self.content
         # Same 1-second guard as the other matchers: a regexp that times out
         # is logged and treated as "no match" instead of blocking the run.
         try:
             m = OTHER_RE.match(content, timeout=1)
         except TimeoutError:
             logger.exception("regexp timeout")
             m = None
         if m and m.group("intro").lower() == index.kind.intro.lower() and m.group("numbering") == index.kind.numbering:
             self.kind = Other(m)
             logger.debug(f"Page {self.page} {self} caption for {index}")
             return m
         return None

List Patterns⚓︎

The dictionary also contains a set of symbols that are commonly used as bullet points in lists:

## From "classic_symbols" to "list_bug": patterns for detection of bullet lists.
    "classic_symbols": r"[◘○◙‣⁃⁌⁍■□∙●••·>→\p{Pd}]",
    "puas": "[\ue000-\uf8ff]",
    "arrows": "[\u2190-\u21ff]|[\u27F0-\u27FF]|[\u2900-\u297F]",
    "other_shapes": "[\u25A0-\u25FF]|[\u2600-\u26FF]|[\u2B00-\u2BFF]",
    "dingbats": "[\u2700-\u27BF]",
    "list_symbols": r"{{classic_symbols}}|{{arrows}}|{{other_shapes}}|{{dingbats}}|{{puas}}",
    "list_sep": r"[.:°\p{Pd}\)/]{1,3}",
    "list_start": r"{{list_symbols}}|{{num}}\s*{{list_sep}}",
    "list_bug": r"^({{list_symbols}}\s*)$"

Full Patterns Dictionary⚓︎

For a complete overview of the patterns, click on the element below.

See Full dictionary of Patterns

{
### From "digit" to "LETTER": the different kinds of numeration patterns ###
    # one or two digits
    "digit": r"[\d]{1,2}",
    # roman numerals taken from RomanNumbering.RomanNumbers (lower-cased for "roman"),
    # not preceded or followed by a letter
    "roman": r"(?<![\p{L}])(?:" + "|".join(RomanNumbering.RomanNumbers).lower() + r")(?![\p{L}])",
    "ROMAN": r"(?<![\p{L}])(?:" + "|".join(RomanNumbering.RomanNumbers) + r")(?![\p{L}])",
    # one or two ASCII letters, not preceded or followed by a letter
    "letter": r"(?<![\p{L}])[a-z]{1,2}(?![\p{L}])",
    "LETTER": r"(?<![\p{L}])[A-Z]{1,2}(?![\p{L}])",
    "num": r"{{digit}}|{{roman}}|{{ROMAN}}|{{letter}}|{{LETTER}}",# alternation of all the numeration patterns above
    "post": r"[.\p{Pd}/]{1,3}",# 1 to 3 symbols (dot, dash, slash) following a numeration item
    "final_post": r"[.\p{Pd}/]{1,3}|\.?\s*[)]",# symbols closing the numeration: same as "post", or an optional dot, spaces and ")"
    # separator between the numbering and the title: 1 to 3 of dot, colon, dash, slash
    "sep": r"[.:\p{Pd}/]{1,3}",
    # numbering is the main group for section/index detection. It is an optional
    # prefix of "num post" items followed by a last num and an optional final_post.
    # NOTE(review): {{post:final_post}} presumably inserts the final_post pattern
    # under the group name "post" - confirm against the template expansion code.
    "numbering": r"""((?:
             {{num}}
             (?P<spaceA>\s*)
             {{post}}
             (?P<space>\s*)
             )
             (?:
             {{num}}
             (?P=spaceA)?
             {{post}}
             (?P=space)?
             )*
             )?
             {{num}}
             {{post:final_post}}?
    """,
    ## TOC entry patterns
    "page": r"\d+",# the page number in a TOC entry
    "ocrdot": r"[aeionmuxnmwzs]{15,}",# OCR sometimes renders dots as letters: a run of 15 or more of these letters
    "dot": r"[._\p{Pd}]*|{{ocrdot}}",# any symbols in a TOC entry that separate the title from the page number (may be empty)

    ## Text Patterns
    "intro": r"\p{L}{3,}\.?",  # a group of at least 3 letters and an optional final dot
    "title": r"[\p{L}\d]+.*?\p{L}+.*?",# anything that starts with a letter or a digit and contains at least one more letter after it

    ## body is the main group that contains at least a numbering and a title. It has optional final components to detect a SecIndex.
    # NOTE(review): "(:?" below is a capturing group that starts with an optional
    # literal colon, not the non-capturing "(?:". Confirm which one is intended
    # before changing it, as it may affect what the lazy title group captures.
    "body": r"""
    {{numbering}}
    (?P<sep_spaces>
    \s?
    {{sep}}?
    \s+
    )
    {{title}}
    (:?\s*{{dot}}\s*{{page}})?
    \s*
    $
    """,
    # a usual Section or SecIndex: optional introductory word (optionally followed by "N°"), then a body
    "section": r"""
    ^
    (?P<intro>
      (?:(?i:section|partie|chapitre|annexe|appendix|appendice|volume|tome|pièce|piece)
      (\s+[Nn][°º])?
      )?
    )
    \s*
    {{body}}
    """,

    ## "special_section" patterns: some Sections and SecIndexes have no numbering, only a title (e.g. "Introduction").
    # special SecIndex (TOC entry): a title followed by dots and a page number.
    # NOTE(review): same "(:?" group as in "body" - confirm it is intended.
    "special_section_entry": r"""
    {{title}}
    (:?\s*{{dot}}\s*{{page}})
    \s*
    $
    """,
    # special Section: a title alone
    "special_section": r"""
    {{title}}
    \s*
    $
    """,
    ## "other" pattern: a loose pattern to detect captions and other kinds of titles that are not Sections.
    "other": r"""
    ^
    {{intro}}
    \s*
    {{body}}
    """,
    ## From "classic_symbols" to "list_bug": patterns for the detection of bullet lists.
    "classic_symbols": r"[◘○◙‣⁃⁌⁍■□∙●••·>→\p{Pd}]",
    "puas": "[\ue000-\uf8ff]",
    "arrows": "[\u2190-\u21ff]|[\u27F0-\u27FF]|[\u2900-\u297F]",
    "other_shapes": "[\u25A0-\u25FF]|[\u2600-\u26FF]|[\u2B00-\u2BFF]",
    "dingbats": "[\u2700-\u27BF]",
    "list_symbols": r"{{classic_symbols}}|{{arrows}}|{{other_shapes}}|{{dingbats}}|{{puas}}",
    "list_sep": r"[.:°\p{Pd}\)/]{1,3}",
    "list_start": r"{{list_symbols}}|{{num}}\s*{{list_sep}}",
    "list_bug": r"^({{list_symbols}}\s*)$",
}