Page⚓︎

This class is used to handle the pages of a Document.

The processing of a Page follows these steps:

setting the orientation
checking if there are double columns on the page
extracting images, links and lines
if there are double columns, assigning a column to each line
handling the case where there are a lot of SecIndex and OtherIndex on the page: detecting gaps and filling them.
detecting tables
merging lines if necessary.

Source code in pdfstruct/page.py

class Page:
    """This class is used to handle the pages of a `Document`.

    The processing of a `Page` follows these steps:

    * setting the orientation
    * checking if there are double columns on the page
    * extracting images, links and lines
    * if there are double columns, assigning a column to each line
    * handling the case where there are a lot of SecIndex and OtherIndex on the page: detecting gaps and filling them.
    * detecting tables
    * merging lines if necessary.


    """

    def __init__(self, pageid, raw, doc, links=None, prev=None, next=None, images=None, ocr=None, doc_len=None):
        """Build the page from its raw dictionary and run the whole processing pipeline.

        The constructor sets the orientation, looks for columns, collects blobs
        (non-text blocks and images), turns the text lines into `Line` objects
        (attaching links), assigns column information, handles index-dense pages,
        detects tables and finally joins the lines that can be merged.

        Side effect: every non-text block carrying an image is saved as
        `./images/page{pageid}_bid{bid}.png`.

        Args:
            pageid: identifier of the page; used to index `doc.words` and in log messages.
            raw: page dictionary with `width`, `height` and `blocks` entries (presumably the
                PyMuPDF text dictionary - to be confirmed). It is modified in place: landscape
                pages are rotated and extra keys are added on each line.
            doc: owning document; `doc.words` is read and `len(doc)` is used when
                `doc_len` is not given.
            links: optional iterable of links; `link[0]` is logged as the target page and
                `link[1]` must expose `x0`, `y0`, `x1` and `y1`.
            prev: previous page (or None); used for orientation logging and line chaining.
            next: next page (stored as is).
            images: optional iterable of sequences `(x0, y0, x1, y1, image, ...)` converted to blobs.
            ocr: optional OCR engine; only used in log messages and in a currently disabled branch.
            doc_len: number of pages of the document; defaults to `len(doc)`.
        """
        self.raw = raw
        self.id = pageid
        self.doc = doc
        self.doc_len = len(self.doc) if doc_len is None else doc_len
        self.page_number = None
        self.prev = prev
        self.next = next
        self.links = links

        logger.info(f"Processing page {pageid} {ocr=} {raw['width']=} {raw['height']=}")

        # may rotate the page and swap raw["width"] / raw["height"],
        # so the dimensions are read only afterwards
        self.set_orientation()

        self.width = raw["width"]
        self.height = raw["height"]

        self.words = doc.words[self.id] if doc.words is not None else None

        logger.debug(f"Page {pageid} trying double column")
        col_sep = self.check_double_column()
        if col_sep:
            logger.info(f"Page {pageid} double column with sep={col_sep}")

        # we have an issue for landscape pages

        self.lines: List[Any] = []
        self.blobs: List[Any] = []

        # extracting links: index them by every integer y they cover
        coord2links = defaultdict(list)
        if links:
            for link in links:
                coord = link[1]
                for y in range(round(coord.y0), round(coord.y1)):
                    coord2links[y].append(link)

        image2coords = defaultdict(list)

        nblocks = len(raw["blocks"])

        # extracting images

        for block in raw["blocks"]:
            if block["type"] != 0:
                # remove non text blocks
                # but keep them in a separate list
                blob = Blob(block)
                self.blobs.append(blob)
                if blob.image:
                    bid = block["number"]
                    outname = f"./images/page{pageid}_bid{bid}.png"
                    pix = fitz.Pixmap(blob.image)
                    pix.save(outname)

        if images:
            for image in images:
                blob = Blob(
                    {
                        "bbox": image[0:4],
                        "ext": 0,
                        "width": image[2] - image[0],
                        "height": image[3] - image[1],
                        "image": image[4],
                    }
                )
                self.blobs.append(blob)

        for blob in self.blobs:
            logger.debug(f"Page {pageid} blob bbox={blob.bbox} height={blob.height} width={blob.width}")
            # sample the vertical extent of the blob every 10 units
            for y in range(round(blob.y0) + 1, round(blob.y1), 10):
                image2coords[y].append(blob)
            # OCR on small images is deliberately disabled (`False and ...`)
            if False and ocr and 5 < blob.height < 30 and blob.image and len(self.blobs) < 20:
                # we try ocr to extract text from small images
                logger.info(
                    f"Page {pageid} try ocr on small image blob height={blob.height} {ocr=} {type(blob.image)} len={len(blob.image)}"  # noqa: E501
                )
                try:
                    res = ocr.readtext(blob.image)
                    logger.info(f"Page {pageid} ocr {res=}")
                    for bbox, text, score in res:
                        logger.debug(f"Page {pageid} blob ocr {text=} {score=} {bbox=}")
                except Exception:
                    logger.exception("pb ocr")

        def sorted_blocks(blocks, col_sep=None):
            """Return the blocks top to bottom (then left to right), column by column if needed."""

            def local_sort(blocks):
                return sorted(sorted(blocks, key=lambda b: b["bbox"][0]), key=lambda b: b["bbox"][1])

            if col_sep:
                if not isinstance(col_sep, list):
                    col_sep = [col_sep]
                cols = []
                min_bound = 0
                for sep in col_sep:
                    col = [b for b in blocks if (b["bbox"][2] < sep or b["bbox"][0] < sep) and b["bbox"][0] > min_bound]
                    cols.append(col)
                    min_bound = sep
                col = [b for b in blocks if b["bbox"][0] > min_bound]
                cols.append(col)
                return chain.from_iterable(local_sort(col) for col in cols)
            else:
                return local_sort(blocks)

        def normalize(s: str):
            # apply unicode normalization for some characters such as non-breaking spaces
            # and to avoid some errors related to unicode characters
            # maybe overkill !
            return normalize_unicode("NFKD", s).encode("utf-8", errors="ignore").decode("utf-8")

        # extracting lines

        for block in sorted_blocks(raw["blocks"], col_sep=col_sep):
            bid = block["number"]
            block_bbox = block["bbox"]
            if block["type"] != 0:
                # already processed
                continue
            # keeping the original line and block ids
            for i, raw_line in enumerate(block["lines"]):
                raw_line["original_line_id"] = i
                raw_line["original_block_id"] = bid
                # adding the words tuples for each line and checking words and spans are identical
                if self.words is not None:
                    raw_line["words"] = [w for w in self.words if w[5] == bid and w[6] == i]
                    span_text = "".join(span["text"] for span in raw_line["spans"]).strip()
                    word_text = " ".join([w[4] for w in raw_line["words"]])
                    spans, words = re.split(r"\s+", span_text), re.split(r"\s+", word_text)
                    if spans != words:
                        logger.info(
                            f"WARNING (PYMUPDF ISSUE): Page {self.id} Block {bid} Line {i} - SPANS AND WORDS DONT MATCH"
                        )

            try:
                # lines with at least one span, ordered by rounded baseline then by left edge
                for raw_line in sorted(
                    sorted(
                        filter(lambda lline: len(lline["spans"]) > 0, block["lines"]),
                        key=lambda lline: lline["bbox"][0],
                    ),
                    key=lambda lline: round(lline["spans"][0]["origin"][1]),
                ):
                    content = "".join(normalize(span["text"]) for span in raw_line["spans"]).strip()
                    if not content:
                        continue
                    line = Line(content, raw=raw_line, page=pageid, doc=None)
                    logger.debug(
                        f"Page {self.id} block={bid} {block_bbox} line bbox={line.bbox} base={line.base_y} : |{line}|"
                    )

                    if nblocks > 2000 and round(line.base_y) in image2coords:
                        # there are cases where many textual annotations are attached to images
                        # we skip them to avoid some complexity issues in other processing steps
                        blobs = image2coords[round(line.base_y)]
                        found = False
                        for blob in blobs:
                            if line.x0 <= blob.x0 <= line.x1 or blob.x0 <= line.x0 <= blob.x1:
                                logger.warning(
                                    f"Page {pageid} discard line {line.bbox} embedded in blob {blob.bbox} content={line}"  # noqa: E501
                                )
                                found = True
                                break
                        if found:
                            continue

                    if tuple(line.orientation) != (1.0, 0.0):
                        logger.warning(
                            f"Page {pageid} discard line {line.bbox} because of orientation {line.orientation} content={line.content}"  # noqa: E501
                        )
                        continue

                    # attach each overlapping link exactly once
                    if round(line.base_y) in coord2links:
                        logger.debug(f"Page {self.id} potential link for {line}: {coord2links[round(line.base_y)]}")
                        for link in coord2links[round(line.base_y)]:
                            if line.x0 <= link[1].x0 <= line.x1 or link[1].x0 <= line.x0 <= link[1].x1:
                                logger.info(f"Page {pageid} attach link to page {link[0]} to line {line}")
                                line.links.append(link)

                    if self.lines and self.lines[-1].try_merge(line, doc=None, col_sep=col_sep):
                        pass
                    else:
                        self.add_line(line)
            except Exception:
                logger.exception(f"Pb page {pageid} block={block['lines']}")

                # NOTE(review): these diagnostics only run when a block failed; they look
                # like they were meant to run once after the loop - confirm before moving them
                if not self.lines:
                    # could potentially be the sign of a problem
                    logger.warning(f"Empty page {pageid}")
                elif len(self.lines) > 1000:
                    contents = Counter([len(line.content) for line in self.lines])
                    logger.warning(
                        f"Long page {pageid} #lines={len(self.lines)} content distrib={contents.most_common()}"
                    )
                    for i, line in enumerate(self.lines[:100]):
                        logger.warning(f"line {i} raw={line.raw}")

        # assign column info for each line in where field
        # and sort lines in each column
        self.assign_column_info(col_sep)

        # check if the page is dense in indexes and try to fill gaps if necessary
        self.handle_dense_index_page()
        self.handle_dense_index_page(index=OtherIndex)

        # track potential tables
        self.detect_tables()
        # some lines not in potential tables could now be joined
        vspace = self.vspace

        # joining lines and updating the indexes, until a pass joins nothing
        while True:
            changed = False
            for line in self.lines:
                next_line = line.next_close_line
                if next_line and (line.vdelta(next_line) <= vspace) and line.potential_merge(next_line):
                    line.join_next()
                    changed = True
            if not changed:
                break
            self.handle_dense_index_page()
            self.handle_dense_index_page(index=OtherIndex)
            self.clean()

    def check_loop(self):
        """Tell whether following `.next` from a line of the page leads back to an earlier line.

        Returns:
            (bool): True as soon as a line points to an already visited line, False otherwise.
        """
        visited = set()
        for current in self.lines:
            successor = current.next
            if successor and successor.id in visited:
                logger.warning(f"loop in lines page {self.id} at line {current} next={successor}")
                return True
            visited.add(current.id)
        logger.debug(f"no loop in lines page {self.id}")
        return False

    def set_orientation(self):
        """Set `self.orientation` from the most frequent direction of the text lines.

        When the dominant direction is `(0, -1)` the raw page is rotated and the
        page is flagged as "landscape"; otherwise it is flagged as "portrait".
        """
        direction_counts = Counter(
            tuple(line["dir"])
            for block in self.raw["blocks"]
            if block["type"] == 0
            for line in block["lines"]
        )
        if direction_counts:
            orientation = direction_counts.most_common(1)[0][0]
        else:
            # page with no textual content
            orientation = (0, 1)
        logger.debug(f"Page {self.id} main direction is {orientation}")

        if orientation == (0, -1):
            self.rotate()
            self.orientation = "landscape"
        else:
            self.orientation = "portrait"

        previous = self.prev
        if previous and previous.orientation and previous.orientation != self.orientation:
            logger.info(f"Page {self.id} orientation switched from {previous.orientation} to {self.orientation}")
        elif not previous:
            # first page: always report the orientation
            logger.info(f"Page {self.id} orientation is {self.orientation}")
        else:
            logger.debug(f"Page {self.id} orientation is {self.orientation}")

    @property
    def most_common_styles(self):
        styles = ["font", "size", "flags", "color"]
        common_styles = {}
        for style in styles:
            lst = [line.most_common_styles[style] for line in self.lines]
            common_style = Counter(lst).most_common()[0][0]
            common_styles[style] = common_style
        return common_styles

    @property
    def most_common_line_spacing(self):
        spacings = []
        for line in self.lines:
            if line.next:
                spacings.append(abs(round(line.next.y0) - round(line.y1)))
        return Counter(spacings).most_common()[0][0] if spacings else 0

    def rotate(self):
        raw = self.raw
        width = raw["width"]
        height = raw["height"]
        raw["width"] = height
        raw["height"] = width

        def bbox_rotate(elt):
            bbox = elt["bbox"]
            elt["bbox"] = (height - bbox[3], bbox[0], height - bbox[1], bbox[2])

        for block in raw["blocks"]:
            bbox_rotate(block)
            if block["type"] != 0:
                continue
            for line in block["lines"]:
                line["dir"] = (1.0, 0.0)
                bbox_rotate(line)
                for span in line["spans"]:
                    bbox_rotate(span)
                    orig = span["origin"]
                    span["origin"] = (height - orig[1], orig[0])

    def add_line(self, line):
        """The equivalent of an `.append()` list method : the `Line` is added at the end of the `.lines` attribute."""
        line.page = self.id
        if self.lines:
            line.prev = self.lines[-1]
        else:
            ppage = self.prev
            while ppage:
                if ppage.lines:
                    line.prev = ppage.lines[-1]
                    break
                ppage = ppage.prev
        self.lines.append(line)
        if line.prev:
            line.prev.next = line

    @property
    def df(self):
        """returns info about lines as a pandas DataFrame"""
        return pd.DataFrame.from_records(
            [
                [
                    line,
                    line.x0,
                    line.x1,
                    line.y0,
                    line.y1,
                    line.base_y,
                    line.width,
                    line.height,
                    line.where,
                    line.content,
                ]
                for line in self.lines
            ],
            columns=[
                "line",
                "x0",
                "x1",
                "y0",
                "y1",
                "base_y",
                "width",
                "height",
                "where",
                "content",
            ],
        )

    def check_double_column(self):
        """Check whether the page is laid out in two or three columns.

        Only horizontal lines (direction `(1.0, 0.0)`) with non-blank content are
        considered. A candidate vertical separator is accepted when it crosses
        fewer than 10% of the lines remaining after header/footer removal.

        Returns:
            (float | list(float) | None): the x coordinate of the separator for a
            two-column page, the two separators `[sepA, sepB]` for a three-column
            page, or None when no column layout is detected (or the page is empty).
        """
        lines = []
        for block in self.raw["blocks"]:
            if block["type"] != 0:
                continue
            for line in block["lines"]:
                if tuple(line["dir"]) != (1.0, 0.0):
                    continue
                content = "".join([span["text"] for span in line["spans"]])
                if not content or not re.search(r"\S+", content):
                    # keep only the lines with some real content (not just whitespaces)
                    continue
                bbox = line["bbox"]
                lines.append(bbox)
        df = pd.DataFrame(lines, columns=["x0", "y0", "x1", "y1"])
        df = df.assign(width=(df.x1 - df.x0))
        # horizontal extent of the text and typical line width, computed on ALL
        # lines, i.e. before the header/footer filtering below
        min_x0 = df.x0.min()
        max_x1 = df.x1.max()
        page_width = max_x1 - min_x0
        max_width = df.width.quantile(q=0.9)

        height = self.height
        width = self.width
        logger.debug(f"param double column: {len(df)=} {height=} {width=}")

        # remove potential header and footer lines
        # (lines starting in the top 5% or ending in the bottom 5% of the page)
        df = df[(df.y0 > 0.05 * height) & (df.y1 < 0.95 * height)]

        logger.debug(
            f"param double column: {len(df)=} {min_x0=} {max_x1=} {max_width=} {0.5*page_width=} {page_width=} {0.8*self.width=}"  # noqa: E501
        )

        if len(df) < 5:
            # very few lines: dump the raw blocks to help debugging
            logger.debug(self.raw["blocks"])

        if not len(df):
            # (almost) empty page
            return None

        # we search a vertical line around the middle of the page
        # with only a small intersection with the horizontal lines
        # (intersection may be non empty because of headers, footers, and some tables)
        potential_sep = 0.5 * self.width
        potential_seps = [
            potential_sep,
            potential_sep - 0.02 * self.width,
            potential_sep + 0.02 * self.width,
        ]
        # when lines are narrow and the text spans most of the page,
        # also try the middle of the text area (not only the middle of the page)
        if max_width < 0.5 * page_width and page_width > 0.8 * self.width:
            potential_seps.append(min_x0 + 0.5 * page_width)
        for potential_sep in potential_seps:
            # lines crossed by the candidate separator
            inter = df[(df.x0 < potential_sep) & (potential_sep < df.x1)]
            ninter = len(inter)
            logger.debug(f"try sep2 {potential_sep=} {ninter=}, {0.1*len(df)=}")
            if ninter < 0.1 * len(df):
                return potential_sep

        # try for 3 columns
        potential_seps = [(0.33 * self.width, 0.66 * self.width)]
        if max_width < 0.5 * page_width and page_width > 0.8 * self.width:
            potential_seps.append((min_x0 + 0.33 * page_width, min_x0 + 0.66 * page_width))
        for sepA, sepB in potential_seps:
            interA = df[(df.x0 < sepA) & (sepA < df.x1)]
            interB = df[(df.x0 < sepB) & (sepB < df.x1)]
            # total number of lines crossed by either separator
            ninter = len(interA) + len(interB)
            logger.debug(f"try sep3 {sepA=} {sepB=} {ninter=}, {0.1*len(df)=}")
            if ninter < 0.1 * len(df):
                logger.info(f"Page {self.id} has 3 columns with {sepA=} {sepB=} {ninter=}")
                return [sepA, sepB]

        return None

    def assign_column_info(self, col_sep):
        """Assign a column label to each `Line` and re-order `self.lines` column by column.

        Lines lying entirely inside a column get `where` set to `"col1"`, `"col2"`, ...
        Lines crossing a separator keep their `where` untouched, and column lines
        vertically aligned with a run of such wide lines (e.g. a table covering
        several columns) get `where = None`. Wide lines located above the bottom of
        the columns are ordered with the first column, the others with the last one.
        The `prev` / `next` links of the lines are rebuilt to follow the new order.

        Args:
            col_sep: None (no column), a single x separator or a list of x separators.

        Returns:
            the last separator when `col_sep` is set, None otherwise.
        """
        df = self.df
        df = df.assign(approx_base_y=df.base_y.round())
        prev_line = self.lines[0].prev if self.lines else None

        def reassign_lines(lines):
            # rebuild the prev/next chain in the new order, starting from the
            # line preceding the page
            prev = prev_line
            logger.debug(f"reassigning lines prev_line={prev_line}")
            for line in lines:
                line.prev = prev
                if prev:
                    prev.next = line
                prev = line
            if lines:
                # to ensure that the last page line has None has next line
                lines[-1].next = None
            self.lines = lines

        if not col_sep:
            lines = list(df.sort_values(by=["approx_base_y", "x0"]).line)
            reassign_lines(lines)
            return None

        if not isinstance(col_sep, list):
            col_sep = [col_sep]
        cols = []
        min_bound = 0
        label = "col1"
        # index labels of the lines assigned to a column; a plain set is used
        # because a pandas Index cannot be tested for truthiness
        collected = set()
        for i, sep in enumerate(col_sep):
            col = df[(df.x0 > min_bound) & (df.x1 < sep)]
            cols.append(col)
            for ll in col.line:
                ll.where = label
            min_bound = sep
            label = f"col{i+2}"
            collected.update(col.index)
        col = df[df.x0 > min_bound]
        cols.append(col)
        for ll in col.line:
            ll.where = label
        collected.update(col.index)
        other = df[~df.index.isin(list(collected))]
        # other elements are usually header and footer parts
        # but may also include wide tables covering both columns
        # in that case we need to retrieve parts in col1 and col2 that are vertically aligned with these elements
        top = min(col.y0.min() for col in cols)
        bottom = max(col.y1.max() for col in cols)
        middle_other = other[(other.y0 > top) & (other.y1 < bottom)]
        middle_y0 = [y0 for y0, _ in middle_other.groupby("y0")]
        logger.debug(f"Page {self.id} middle other {middle_y0}")
        # group the y0 values into vertical ranges (gaps of at most 40 units)
        range_y0 = []
        for y0 in sorted(middle_y0):
            if not range_y0 or y0 > range_y0[-1][-1] + 40:
                range_y0.append((y0, y0))
            else:
                range_y0[-1] = (range_y0[-1][0], y0)
        # a range made of a single y0 value is ignored
        range_y0 = [r for r in range_y0 if r[1] > r[0]]
        logger.debug(f"Page {self.id} middle other range {range_y0}")
        displace = []
        for col in cols:
            for i, row in col.iterrows():
                if any(r[0] <= row.y0 <= r[1] for r in range_y0):
                    displace.append(i)
        if displace:
            logger.debug(f"Page {self.id} middle other displace {displace}")
            for i in displace:
                displaced_line = df.at[i, "line"]
                logger.debug(f"\t{displaced_line}")
                displaced_line.where = None
            cols = [col[~col.index.isin(displace)] for col in cols]
            other = pd.concat([other, df[df.index.isin(displace)]])

        # every "other" line must end up in exactly one column: lines at or below the
        # bottom of the columns go last, all remaining ones (including the case where
        # `bottom` is NaN because no line fits in a column) go first
        below = other.y0 >= bottom
        cols[-1] = pd.concat([cols[-1], other[below]])
        cols[0] = pd.concat([cols[0], other[~below]])

        lines = []
        for col in cols:
            if len(col):
                lines.extend(list(col.sort_values(by=["approx_base_y", "x0"]).line))
        reassign_lines(lines)
        return sep

    def detect_tables(self):
        """Detects tables on a page.

        **Algorithm**:

        * Finding the holes between lines, keeping the ones that make a continuous line across the page
        * Retrieving the columns from the coordinates of the holes.
        * Filtering columns/holes and keeping the potential tables of the page
        * Going over each line one last time and the ones that are parts of a table become `Cells`.

        The horizontal extent of the text is split into `max_bin` bins; a "hole" is a
        bin not covered by a line and a "column" is a maximal run of covered bins.
        Lines selected as table parts get their `kind` replaced by a `Cell`.
        """
        df = self.df
        min_x0 = df.x0.min()
        max_x1 = df.x1.max()
        min_bin = 0
        # at least 80 bins, more if a line holds more than 80 characters
        max_bin = max(80, max(len(line.content) for line in self.lines)) if self.lines else 80
        delta = (max_x1 - min_x0) / max_bin  # bin width
        logger.debug(f"Page {self.id} max bin {max_bin} {delta=}")

        def holes2cols(holes):
            # convert a set of empty bins into the list of (start, end) runs of occupied bins
            cols = set(range(min_bin, max_bin + 1)).difference(holes)
            starts = []
            ends: List[Any] = []
            for x in sorted(cols):
                if ends and x == ends[-1] + 1:
                    ends[-1] = x
                else:
                    starts.append(x)
                    ends.append(x)
            return list(zip(starts, ends))

        def col_intersect(cols1, cols2):
            # True when one column of cols2 overlaps at least two distinct columns of cols1
            mask = {}
            for _id, (u1, v1) in enumerate(cols1):
                for i in range(u1, v1 + 1):
                    mask[i] = _id
            for u2, v2 in cols2:
                masks = [mask[i] for i in range(u2, v2 + 1) if i in mask]
                if len(set(masks)) > 1:
                    return True
            return False

        # tables are searched independently in each page column (`where` value)
        for g, df2 in df.groupby(by="where", dropna=False):
            logger.debug(f"Page {self.id} processing {g}")
            # running window of (row index, holes) for the candidate table lines
            all_holes: List[Any] = []
            # row index -> holes, for the lines kept as potential table parts
            potential_tables = {}
            for i, row in df2.iterrows():
                line = df2.loc[i].line
                if line.kind and type(line.kind) is not Cell:
                    # a line already typed as something else than a Cell ends the current candidate
                    all_holes = []
                    continue
                bin_x0 = int((row.x0 - min_x0) / delta)
                bin_x1 = int((row.x1 - min_x0) / delta)
                # every bin outside [bin_x0, bin_x1] is a hole for this line
                holes = set(range(min_bin, bin_x0)).union(range(bin_x1 + 1, max_bin + 1))
                cols = holes2cols(holes)
                old_size = len(all_holes)
                # drop the candidates whose columns would be bridged by the current line
                all_holes = [xholes for xholes in all_holes if not col_intersect(holes2cols(xholes[1]), cols)]
                # keep only the holes shared with the current line
                all_holes = [(j, _holes.intersection(holes)) for j, _holes in all_holes]
                # a table needs at least two columns
                all_holes = [xholes for xholes in all_holes if len(holes2cols(xholes[1])) > 1]
                all_cols = [holes2cols(_holes) for _, _holes in all_holes]
                if old_size - len(all_holes) > 6:
                    # a drastic size reduction in the number of potential cells in a table
                    # may indicate that we have reached the end
                    all_holes = []
                all_holes.append((i, holes))
                # disabled debugging trace
                if False and len(all_holes) > 1:
                    logger.debug(f"line {i} {df2.loc[i].line} : cols={holes2cols(holes)} all_cols={all_cols[-5:]}")
                    pass
                if len(all_holes) > 3:
                    # logger.debug(
                    #    f"add table lines #={len(all_holes)} {i} {df2.loc[i].line} cols={cols} all_cols={all_cols}"
                    # )
                    for j, _holes in all_holes:
                        potential_tables[j] = _holes
            prev = None
            for i in sorted(potential_tables):
                cols = holes2cols(potential_tables[i])
                if not cols:
                    # not in a table (should be handled above)
                    continue
                if not prev or i > prev + 1:
                    # starting a new table: need additional checks
                    j = i + 1
                    line_i = df.loc[i].line
                    # line_i_content = df.loc[i].content
                    try:
                        line_j = df.loc[j].line
                        # the next line must be vertically close (less than 10 units below)
                        if line_j.y0 > line_i.y1 + 10:
                            continue
                    except Exception:
                        # line_i is maybe the last one in the page !
                        continue
                logger.debug(f"page {self.id} line {i} in table cols={cols}")
                cell = df.loc[i].line
                # the columns stored in the Cell are recomputed from the line itself
                bin_x0 = int((cell.x0 - min_x0) / delta)
                bin_x1 = int((cell.x1 - min_x0) / delta)
                holes = set(range(min_bin, bin_x0)).union(range(bin_x1 + 1, max_bin + 1))
                cols = holes2cols(holes)
                if not len(cols):
                    logger.warning(f"empty col list for cell {cell} holes={holes}")
                cell.kind = Cell(cols=cols)
                prev = i

    @property
    def vspace(self):
        v: Counter = Counter()
        for line in self.lines:
            next_line = line.next_close_line
            if next_line:
                v[max(0, round(line.vdelta()))] += 1
        if v:
            # return np.median(np.array(v))
            return v.most_common(1)[0][0]
        else:
            return 0

    def is_tdm_page(self, index=SecIndex):
        """Checks whether a Page is actually a Table of Contents.

        A non-empty page qualifies when it holds more than 10 lines of kind `index`,
        or when these lines make up more than 70% of the lines of the page.

        Args:
            index: the kind of index lines to count (`SecIndex` by default).

        Returns:
            (bool): if True, the Page contains a Table of Contents.

        """
        nlines = len(self.lines)
        if not nlines:
            # an empty page is never a table of contents (return a real bool, not 0)
            return False
        nindexes = sum(1 for _ in self.collect(kind=index))
        return nindexes > 10 or nindexes > 0.7 * nlines

    def handle_dense_index_page(self, index=SecIndex):
        """Handles a page dense with index lines, looking for gaps to be corrected (should be rare).

        The page is considered dense when it is not empty and holds more than 10 lines
        marked with `index`, or when such lines represent more than 70% of its lines.

        For a dense page, the non-index lines found between two index lines (the "gaps")
        are inspected. When `index` is `SecIndex`:

        * a gap line matching `SECTION_RE` with a `page` group is turned into a `SecIndex`;
        * a gap line marked as `Section` is tentatively concatenated with the following gap
          lines; as soon as the concatenation matches `SECTION_RE` with a `page` group,
          the lines are merged through `join_next()`;
        * finally, every line of the page that is neither a `SecIndex` nor an `OtherIndex`
          and matches `SPECIAL_SECTION_ENTRY_RE` is turned into a `SecIndex`.

        Args:
            index: the marker class of the index lines (`SecIndex` by default).
        """
        nlines = len(self.lines)
        vspace = self.vspace
        logger.debug(f"vspace {vspace}")
        indexes = list(self.collect(kind=index))
        logger.debug(
            f"handling potential dense index type={index} on page {self.id} #indexes={len(indexes)} #lines={nlines}"
        )
        # density test: same criterion as the one used by is_tdm_page
        if nlines and (len(indexes) > 10 or len(indexes) > 0.7 * nlines):
            logger.info(f"Page {self.id} handling dense page for index={index}")
            # the last index line is skipped: only gaps between two index lines are explored
            for line in indexes[:-1]:
                logger.debug(f"dense index {line}")
                # collect the run of non-index lines following this index line on the same page
                gap = []
                next_line = line.next
                while next_line and type(next_line.kind) is not index and next_line.page == line.page:
                    logger.debug(f"expand index gap with <page{next_line.page}> {next_line}")
                    gap.append(next_line)
                    next_line = next_line.next
                if gap:
                    content = " ".join(lline.content for lline in gap)
                    logger.info(f"Page {self.id} gap in indexes for #lines={len(gap)} content={content}")
                    # TO BE COMPLETED
                    for i, xline in enumerate(gap):
                        # a gap line may have been switched to an index by a previous step
                        if type(xline.kind) is index:
                            continue
                        # only section indexes are repaired; nothing is done for other index kinds
                        if index == SecIndex:
                            # badly formatted index line
                            try:
                                m = SECTION_RE.match(xline.content, timeout=1)
                            except Exception:
                                # the regex call is bounded by a timeout; a failure is treated as "no match"
                                logger.exception(f"regex timeout on content {xline.content}")
                                m = None
                            if m and m.group("page"):
                                xline.kind = SecIndex(m)
                                logger.info(f"Page {self.id} switched line {xline} to index")
                                continue
                            # multi-line index entry (3-lines)
                            if type(xline.kind) is Section:
                                # NOTE(review): this next_line assignment does not seem to be read afterwards
                                next_line = xline.next
                                content = xline.content
                                merge = []
                                prev_line = xline
                                # accumulate the following gap lines until the text looks like a TOC entry
                                for j in range(i + 1, len(gap)):
                                    yline = gap[j]
                                    # stop at the next section/index line or when the vertical jump
                                    # from the previous line exceeds the page line spacing
                                    if (
                                        type(yline.kind) in {Section, SecIndex}
                                        or yline.base_y > prev_line.base_y + vspace
                                    ):
                                        break
                                    content += " " + yline.content
                                    merge.append(yline)
                                    try:
                                        m = SECTION_RE.match(content, timeout=1)
                                    except Exception:
                                        logger.exception(f"regex timeout on content {content}")
                                        m = None
                                    if m and m.group("page"):
                                        logger.debug(f"found tdm entry merging {merge}")
                                        # one join_next() call per accumulated line
                                        for _ in merge:
                                            # logger.debug(f"merging")
                                            xline.join_next()
                                        break
                                    prev_line = yline
            # indexes cannot be empty here: the density test requires at least one index line
            logger.debug(f"last index line {indexes[-1]}")
            if index == SecIndex:
                # looking for section entries without numbering as first or last entries
                # such as Introduction, Preamble or Conclusion
                def check_special_section_entry(potential_entry):
                    # only lines belonging to this page are considered
                    if potential_entry and potential_entry.page == self.id:
                        m = SPECIAL_SECTION_ENTRY_RE.match(potential_entry.content)
                        if m:
                            logger.debug(f"special toc {m.group('dot')=} {m.group('page')=} {m.group('title')=}")
                            potential_entry.kind = SecIndex(m)
                            logger.info(f"Page {self.id} switched to section entry line {potential_entry}")

                # check_special_section_entry(indexes[0].prev)
                # check_special_section_entry(indexes[-1].next)
                # every line that is not already an index is tested, not only the first and last ones
                for line in self.lines:
                    logger.debug(f"Page {self.id} try0 special toc line kind={line.kind} {line}")
                    if type(line.kind) not in {SecIndex, OtherIndex}:
                        logger.debug(f"Page {self.id} try special toc line {line}")
                        check_special_section_entry(line)

    def collect(self, kind=Section):
        """Selects the lines of the `Page` whose `line.kind` is an instance of the given marker class.

        Returns:
            (iter): a lazy iterator over the matching lines, in page order."""

        def matching(candidates):
            for candidate in candidates:
                if isinstance(candidate.kind, kind):
                    yield candidate

        # the iterator over the current list of lines is taken right away, the filtering stays lazy
        return matching(iter(self.lines))

    @property
    def sections(self):
        """Iterator over the lines of the Page marked as section titles (`Section`)."""
        section_lines = self.collect(kind=Section)
        return section_lines

    @property
    def tdm(self):
        """Iterator over the lines of the Page marked as section TOC entries (`SecIndex`)."""
        toc_lines = self.collect(kind=SecIndex)
        return toc_lines

    def clean(self):
        """Drops from `self.lines` every line that has neither a `prev` nor a `next` line."""
        kept = []
        for candidate in self.lines:
            if not (candidate.prev or candidate.next):
                # fully unlinked: considered deleted
                continue
            kept.append(candidate)
        self.lines = kept

    def display(self, start=None, end=None):
        """Prints the lines of the Page, laying out runs of table cells row by row.

        Lines marked as `Cell` with column information are buffered and grouped into rows
        (a cell joins the current row when its `base_y` is less than 5 away from the last
        cell of that row). The buffered rows are printed as soon as a non-cell line is met,
        each cell being padded according to its column offsets scaled to 120 characters.
        Every other line is printed as is.

        Args:
            start (int): index of the first line to print (defaults to 0).
            end (int): index after the last line to print (defaults to the number of lines).
        """
        first = 0 if start is None else start
        last = len(self.lines) if end is None else end
        width_chars = 120

        def left_edge(cell):
            return cell.kind.cols[-1][0]

        def right_edge(cell):
            return cell.kind.cols[-1][1]

        def print_table(table_rows):
            cells = [cell for table_row in table_rows for cell in table_row]
            hi = max(right_edge(cell) for cell in cells)
            lo = min(left_edge(cell) for cell in cells)
            if hi == lo:
                # avoid a zero-width span when all the offsets are identical
                hi += 10

            def to_chars(offset):
                return (offset - lo) * width_chars / (hi - lo)

            for table_row in table_rows:
                ordered = sorted(table_row, key=left_edge)
                starts = [to_chars(left_edge(cell)) for cell in ordered]
                starts.append(width_chars)
                ends = [0]
                ends.extend(to_chars(right_edge(cell)) for cell in ordered)
                # half of the free space between the end of a cell and the start of the next one
                gaps = [int((begin - finish) / 2) for begin, finish in zip(starts, ends)]
                pieces = []
                for gap, cell in zip(gaps, ordered):
                    pad = " " * gap
                    pieces.append(f"|{pad}{cell.display()}{pad}|")
                txt = "".join(pieces)
                print(f"[{ordered[0].where}] {txt}")

        pending = []
        stop = min(last, len(self.lines))
        for position in range(first, stop):
            current = self.lines[position]
            if not (type(current.kind) is Cell and current.kind.cols):
                # a regular line ends the current table, if any
                if pending:
                    print_table(pending)
                    pending = []
                print(current)
                continue
            if pending and abs(pending[-1][-1].base_y - current.base_y) < 5:
                pending[-1].append(current)
            else:
                pending.append([current])
        if pending:
            print_table(pending)

    def display_html(self, start=None, end=None):
        """Renders the lines of the Page as an HTML document.

        * consecutive `Cell` lines are grouped into rows (a cell joins the current row when its
          `base_y` is less than 5 away from the last cell of that row) and emitted as a table;
        * runs of unmarked lines (`kind is None`) are joined into a single paragraph;
        * any other line is rendered through its own `display_html()`;
        * a footer with links to the previous and next pages is appended; at both ends of the
          document the link points to `overview.html` instead.

        The resulting HTML is also printed on the standard output.

        Args:
            start (int): index of the first line to render (defaults to 0).
            end (int): index after the last line to render (defaults to the number of lines).

        Returns:
            (str) : an HTML version of the Page
        """
        html_id = self.id + 1
        if start is None:
            start = 0
        if end is None:
            end = len(self.lines)
        rows: List[Any] = []
        text: List[Any] = []
        html_display = []

        def rows_display_html(rows):
            try:
                # validation pass only: fails early when a cell carries no usable column information
                # NOTE(review): unlike display(), cells without columns are not filtered out beforehand
                max(line.kind.cols[-1][1] for row in rows for line in row)
            except Exception as e:
                for i, row in enumerate(rows):
                    for j, line in enumerate(row):
                        logger.debug(
                            f"error {e} row {i} line {j} line={line} marker={type(line.kind)} {line.kind.cols}"
                        )
                raise

            trs: List[str] = []
            for row in rows:
                # cells of a row are emitted from left to right
                row = sorted(row, key=lambda line: line.kind.cols[-1][0])
                tds = "\n".join(line.display_html() for line in row)
                trs.append(make_html(tag="tr", text=tds, parent=True))
            table = make_html(text="\n".join(trs), tag="table", attval=[("border", "1")], parent=True)
            html_display.append(table)

        def text_display(texts):
            html_display.append(make_html(tag="p", text=" ".join(texts)))

        def nav_link(label, href):
            return make_html(text=label, tag="a", attval=[("href", href), ("target", "_self")])

        def nav_cell(link, left):
            return make_html(
                text=link,
                tag="div",
                attval=[("style", f"position:absolute; left:{left}; width:50%; height:100%;")],
                parent=True,
            )

        stop = min(end, len(self.lines))
        for i in range(start, stop):
            line = self.lines[i]

            if type(line.kind) is Cell:
                if text:
                    text_display(text)
                    text = []
                if rows and abs(rows[-1][-1].base_y - line.base_y) < 5:
                    rows[-1].append(line)
                else:
                    rows.append([line])
            elif line.kind is None:
                if rows:
                    rows_display_html(rows)
                    rows = []
                if not text:
                    # first line of a paragraph: gather the whole run of unmarked lines at once by
                    # following the `next` links; the following iterations find `text` already filled
                    c = i
                    next_line = line.next
                    text.append(line.content)
                    # `c` keeps the walk inside the [start, stop) window
                    while next_line and next_line.kind is None and c <= stop - 2:
                        text.append(next_line.content)
                        next_line = next_line.next
                        c += 1
            else:
                if rows:
                    rows_display_html(rows)
                    rows = []
                if text:
                    text_display(text)
                    text = []
                html_display.append(line.display_html())
        if rows:
            rows_display_html(rows)
        if text:
            text_display(text)

        # navigation footer: previous page on the left half, next page on the right half
        if (html_id - 1) > 0:
            href_1 = nav_link("Page précédente", f"./page_{html_id-1}.html")
        else:
            href_1 = nav_link("Page précédente", "./overview.html")
        if html_id < self.doc_len:
            href_2 = nav_link("Page suivante", f"./page_{html_id+1}.html")
        else:
            href_2 = nav_link("Page suivante", "./overview.html")
        footer_div = make_html(
            text="\n".join([nav_cell(href_1, "0%"), nav_cell(href_2, "50%")]),
            tag="div",
            attval=[("style", "width:300px;height:100px;position:relative")],
            parent=True,
        )
        html_display.append(footer_div)
        html_display = make_html(tag="html", text="\n".join(html_display), parent=True)
        # NOTE(review): the whole page is echoed on stdout; kept as callers may rely on it
        print(html_display)
        return html_display

`blobs: List[Any] = []` `instance-attribute` ⚓︎

`df` `property` ⚓︎

Returns information about the lines as a pandas DataFrame.

`doc = doc` `instance-attribute` ⚓︎

`doc_len = len(self.doc) if doc_len is None else doc_len` `instance-attribute` ⚓︎

`height = raw['height']` `instance-attribute` ⚓︎

`id = pageid` `instance-attribute` ⚓︎

`lines: List[Any] = []` `instance-attribute` ⚓︎

`links = links` `instance-attribute` ⚓︎

`most_common_line_spacing` `property` ⚓︎

`most_common_styles` `property` ⚓︎

`next = next` `instance-attribute` ⚓︎

`page_number = None` `instance-attribute` ⚓︎

`prev = prev` `instance-attribute` ⚓︎

`raw = raw` `instance-attribute` ⚓︎

`sections` `property` ⚓︎

Collects all section titles (Section) of the Page.

`tdm` `property` ⚓︎

Collects all the section TOC entries (SecIndex) of the Page.

`vspace` `property` ⚓︎

`width = raw['width']` `instance-attribute` ⚓︎

`words = doc.words[self.id] if doc.words is not None else None` `instance-attribute` ⚓︎

`__init__(pageid, raw, doc, links=None, prev=None, next=None, images=None, ocr=None, doc_len=None)` ⚓︎

Source code in pdfstruct/page.py

def __init__(self, pageid, raw, doc, links=None, prev=None, next=None, images=None, ocr=None, doc_len=None):
    """Builds the Page from its raw extraction and runs the whole line-processing pipeline.

    Steps: orientation, column detection, extraction of links, images and lines,
    column assignment, repair of dense index pages, table detection and line merging.

    Args:
        pageid: identifier of the page, also used to index `doc.words`.
        raw (dict): raw page content; the keys `width`, `height` and `blocks` are read here.
        doc: the owning document; `doc.words` is read, and `len(doc)` when `doc_len` is not given.
        links: optional links of the page; each item is read as `(target, rect)` where `rect`
            exposes `x0`, `x1`, `y0` and `y1`.
        prev: the previous Page, if any.
        next: the next Page, if any.
        images: optional extra images; each item is read as `(x0, y0, x1, y1, image)`.
        ocr: optional OCR engine; only referenced by a branch that is currently disabled.
        doc_len: number of pages of the document (defaults to `len(doc)`).
    """
    self.raw = raw
    self.id = pageid
    self.doc = doc
    self.doc_len = len(self.doc) if doc_len is None else doc_len
    self.page_number = None
    self.prev = prev
    self.next = next
    self.links = links

    logger.info(f"Processing page {pageid} {ocr=} {raw['width']=} {raw['height']=}")

    self.set_orientation()

    self.width = raw["width"]
    self.height = raw["height"]

    self.words = doc.words[self.id] if doc.words is not None else None

    logger.debug(f"Page {pageid} trying double column")
    col_sep = self.check_double_column()
    if col_sep:
        logger.info(f"Page {pageid} double column with sep={col_sep}")

    # we have an issue for landscape pages

    self.lines: List[Any] = []
    self.blobs: List[Any] = []

    # extracting links: index them by every (rounded) y coordinate they cover
    coord2links = defaultdict(list)
    if links:
        for link in links:
            coord = link[1]
            y0 = round(coord.y0)
            y1 = round(coord.y1)
            for y in range(y0, y1):
                coord2links[y].append(link)

    image2coords = defaultdict(list)

    nblocks = len(raw["blocks"])

    # extracting images

    for block in raw["blocks"]:
        if block["type"] != 0:
            # remove non text blocks
            # but keep them in a separate list
            blob = Blob(block)
            self.blobs.append(blob)
            if blob.image:
                bid = block["number"]
                outname = f"./images/page{pageid}_bid{bid}.png"
                pix = fitz.Pixmap(blob.image)
                pix.save(outname)

    if images:
        for image in images:
            blob = Blob(
                {
                    "bbox": image[0:4],
                    "ext": 0,
                    "width": image[2] - image[0],
                    "height": image[3] - image[1],
                    "image": image[4],
                }
            )
            self.blobs.append(blob)

    for blob in self.blobs:
        logger.debug(f"Page {pageid} blob bbox={blob.bbox} height={blob.height} width={blob.width}")
        # sample the vertical extent of the blob every 10 units
        for y in range(round(blob.y0) + 1, round(blob.y1), 10):
            image2coords[y].append(blob)
        # OCR of small images is deliberately disabled (`False and ...`)
        if False and ocr and 5 < blob.height < 30 and blob.image and len(self.blobs) < 20:
            # we try ocr to extract text from small images
            logger.info(
                f"Page {pageid} try ocr on small image blob height={blob.height} {ocr=} {type(blob.image)} len={len(blob.image)}"  # noqa: E501
            )
            try:
                res = ocr.readtext(blob.image)
                logger.info(f"Page {pageid} ocr {res=}")
                for bbox, text, score in res:
                    logger.debug(f"Page {pageid} blob ocr {text=} {score=} {bbox=}")
            except Exception:
                logger.exception("pb ocr")

    def sorted_blocks(blocks, col_sep=None):
        """Orders the blocks top to bottom (then left to right), column after column."""

        def local_sort(blocks):
            return sorted(sorted(blocks, key=lambda b: b["bbox"][0]), key=lambda b: b["bbox"][1])

        if not col_sep:
            return local_sort(blocks)
        if not isinstance(col_sep, list):
            col_sep = [col_sep]
        # each block goes to exactly one column according to its left edge; half-open intervals
        # make sure that a block starting exactly at 0 or on a separator is not silently dropped
        bounds = [float("-inf"), *col_sep, float("inf")]
        cols = [[b for b in blocks if low <= b["bbox"][0] < high] for low, high in zip(bounds, bounds[1:])]
        return chain.from_iterable(local_sort(col) for col in cols)

    def normalize(s: str):
        # apply unicode normalization for some characters such as non-breaking spaces
        # and to avoid some errors related to unicode characters
        # maybe overkill !
        return normalize_unicode("NFKD", s).encode("utf-8", errors="ignore").decode("utf-8")

    # extracting lines

    for block in sorted_blocks(raw["blocks"], col_sep=col_sep):
        bid = block["number"]
        block_bbox = block["bbox"]
        if block["type"] != 0:
            # already processed
            continue
        # keeping the original line and block ids
        for i, line in enumerate(block["lines"]):
            line["original_line_id"] = i
            line["original_block_id"] = bid
            # adding the words tuples for each line and checking words and spans are identical
            if self.words is not None:
                line["words"] = [w for w in self.words if w[5] == bid and w[6] == i]
                span_text = "".join(span["text"] for span in line["spans"]).strip()
                word_text = " ".join([w[4] for w in line["words"]])
                spans, words = re.split(r"\s+", span_text), re.split(r"\s+", word_text)
                if spans != words:
                    logger.info(
                        f"WARNING (PYMUPDF ISSUE): Page {self.id} Block {bid} Line {i} - SPANS AND WORDS DONT MATCH"
                    )

        try:
            # lines without spans are ignored; the others are ordered by baseline, then by x
            for line in sorted(
                sorted(
                    filter(lambda lline: len(lline["spans"]) > 0, block["lines"]),
                    key=lambda lline: lline["bbox"][0],
                ),
                key=lambda lline: round(lline["spans"][0]["origin"][1]),
            ):
                content = "".join(normalize(span["text"]) for span in line["spans"]).strip()
                if not content:
                    continue
                line = Line(content, raw=line, page=pageid, doc=None)
                logger.debug(
                    f"Page {self.id} block={bid} {block_bbox} line bbox={line.bbox} base={line.base_y} : |{line}|"
                )

                if nblocks > 2000 and round(line.base_y) in image2coords:
                    # there are cases where many textual annotations are attached to images
                    # we skip them to avoid some complexity issues in other processing steps
                    blobs = image2coords[round(line.base_y)]
                    found = False
                    for blob in blobs:
                        if line.x0 <= blob.x0 <= line.x1 or blob.x0 <= line.x0 <= blob.x1:
                            logger.warning(
                                f"Page {pageid} discard line {line.bbox} embedded in blob {blob.bbox} content={line}"  # noqa: E501
                            )
                            found = True
                            break
                    if found:
                        continue

                # only horizontal left-to-right lines are kept
                if tuple(line.orientation) != (1.0, 0.0):
                    logger.warning(
                        f"Page {pageid} discard line {line.bbox} because of orientation {line.orientation} content={line.content}"  # noqa: E501
                    )
                    continue

                # attach (once) the links overlapping the line
                if round(line.base_y) in coord2links:
                    logger.debug(f"Page {self.id} potential link for {line}: {coord2links[round(line.base_y)]}")
                    for link in coord2links[round(line.base_y)]:
                        if line.x0 <= link[1].x0 <= line.x1 or link[1].x0 <= line.x0 <= link[1].x1:
                            logger.info(f"Page {pageid} attach link to page {link[0]} to line {line}")
                            line.links.append(link)

                if self.lines and self.lines[-1].try_merge(line, doc=None, col_sep=col_sep):
                    pass
                else:
                    self.add_line(line)
        except Exception:
            # best effort: a failing block is logged and the extraction goes on with the next one
            logger.exception(f"Pb page {pageid} block={block['lines']}")

    # sanity diagnostics, once all the blocks have been processed
    if not self.lines:
        # could potentially be the sign of a problem
        logger.warning(f"Empty page {pageid}")
    elif len(self.lines) > 1000:
        contents = Counter([len(line.content) for line in self.lines])
        logger.warning(f"Long page {pageid} #lines={len(self.lines)} content distrib={contents.most_common()}")
        for i, line in enumerate(self.lines[:100]):
            logger.warning(f"line {i} raw={line.raw}")

    # assign column info for each line in where field
    # and sort lines in each column
    self.assign_column_info(col_sep)

    # TODO: ask Eric if this is a typo
    # assign column info for each line in where field
    # and sort lines in each column
    self.assign_column_info(col_sep)

    # check if the page is dense in indexes and try to fill gaps if necessary
    self.handle_dense_index_page()
    self.handle_dense_index_page(index=OtherIndex)

    # track potential tables
    self.detect_tables()

    # some lines not in potential tables could now be joined
    vspace = self.vspace

    # joining lines and updating the indexes, until a full pass does not merge anything
    while True:
        changed = False
        for line in self.lines:
            next_line = line.next_close_line
            if next_line and (line.vdelta(next_line) <= vspace) and line.potential_merge(next_line):
                line.join_next()
                changed = True
        if not changed:
            break
        self.handle_dense_index_page()
        self.handle_dense_index_page(index=OtherIndex)
        self.clean()

`add_line(line)` ⚓︎

The equivalent of an .append() list method : the Line is added at the end of the .lines attribute.

Source code in pdfstruct/page.py

def add_line(self, line):
    """Appends a `Line` to the `.lines` of the Page and chains it to the line that precedes it.

    The preceding line is the current last line of the Page or, when the Page is still empty,
    the last line of the closest previous Page that has lines.
    """
    line.page = self.id
    if not self.lines:
        # first line of the page: walk back through the previous pages
        earlier = self.prev
        while earlier:
            if not earlier.lines:
                earlier = earlier.prev
                continue
            line.prev = earlier.lines[-1]
            break
    else:
        line.prev = self.lines[-1]
    self.lines.append(line)
    predecessor = line.prev
    if predecessor:
        predecessor.next = line

`assign_column_info(col_sep)` ⚓︎

Assigns a column to each Line and sorts the lines per column. Lines that do not fit inside a single column (for instance lines crossing a column separator) are not given a column label.

Source code in pdfstruct/page.py

def assign_column_info(self, col_sep):
    """Assigns a column to each `Line` and sorts the lines per column.

    Without separators, the lines are simply sorted by (rounded) baseline then by `x0`.
    With separators, each line lying strictly inside a column gets its `where` attribute set to
    `col1`, `col2`, ...; the remaining lines (usually headers, footers or wide elements
    crossing a separator) are attached to the first column, or to the last one when they start at
    or below the bottom of the columns. Column lines vertically aligned with a group of such wide
    elements are moved out of their column and get `where = None`.

    In both cases the `prev`/`next` chaining of the lines and `self.lines` are rebuilt.

    Args:
        col_sep: a separator (x coordinate), a list of separators, or a falsy value when
            the page has a single column.

    Returns:
        the last separator when `col_sep` is set, None otherwise.
    """
    df = self.df
    df = df.assign(approx_base_y=df.base_y.round())
    prev_line = self.lines[0].prev if self.lines else None

    def reassign_lines(lines):
        # rebuild the doubly linked list, starting from the line preceding the page
        prev = prev_line
        logger.debug(f"reassigning lines prev_line={prev_line}")
        for line in lines:
            line.prev = prev
            if prev:
                prev.next = line
            prev = line
        if lines:
            # to ensure that the last page line has None has next line
            lines[-1].next = None
        self.lines = lines

    if not col_sep:
        lines = list(df.sort_values(by=["approx_base_y", "x0"]).line)
        reassign_lines(lines)
        return None

    if not isinstance(col_sep, list):
        col_sep = [col_sep]
    cols = []
    min_bound = 0
    label = "col1"
    # a plain set of index labels: testing the truth value of a pandas Index raises ValueError,
    # which used to break pages with more than one separator
    collected: Set[Any] = set()
    for i, sep in enumerate(col_sep):
        col = df[(df.x0 > min_bound) & (df.x1 < sep)]
        cols.append(col)
        for ll in col.line:
            ll.where = label
        min_bound = sep
        label = f"col{i+2}"
        collected.update(col.index)
    col = df[df.x0 > min_bound]
    cols.append(col)
    for ll in col.line:
        ll.where = label
    collected.update(col.index)
    other = df[~df.index.isin(list(collected))]
    # other elements are usually header and footer parts
    # but may also include wide tables covering both columns
    # in that case we need to retrieve parts in col1 and col2 that are vertically aligned with these elements
    # empty columns are ignored so that their NaN extrema do not hide the bounds of the other columns
    non_empty = [col for col in cols if len(col)]
    top = min((col.y0.min() for col in non_empty), default=float("nan"))
    bottom = max((col.y1.max() for col in non_empty), default=float("nan"))
    middle_other = other[(other.y0 > top) & (other.y1 < bottom)]
    middle_y0 = [y0 for y0, _ in middle_other.groupby("y0")]
    logger.debug(f"Page {self.id} middle other {middle_y0}")
    # group the y0 values that are less than 40 apart into vertical ranges
    range_y0: List[Any] = []
    for y0 in sorted(middle_y0):
        if not range_y0 or y0 > range_y0[-1][-1] + 40:
            range_y0.append((y0, y0))
        else:
            range_y0[-1] = (range_y0[-1][0], y0)
    # ranges made of a single y0 value are discarded
    range_y0 = [r for r in range_y0 if r[1] > r[0]]
    logger.debug(f"Page {self.id} middle other range {range_y0}")
    displace = []
    for col in cols:
        for i, row in col.iterrows():
            if any(r[0] <= row.y0 <= r[1] for r in range_y0):
                displace.append(i)
    if displace:
        logger.debug(f"Page {self.id} middle other displace {displace}")
        for i in displace:
            logger.debug(f"\t{df.loc[i].line}")
            df.loc[i].line.where = None
        cols = [col[~col.index.isin(displace)] for col in cols]
        other = pd.concat([other, df[df.index.isin(displace)]])

    # every "other" line ends up in exactly one column, so that no line is lost:
    # at or below the bottom of the columns -> last column, anything else -> first column
    below = other.y0 >= bottom
    cols[-1] = pd.concat([cols[-1], other[below]])
    cols[0] = pd.concat([cols[0], other[~below]])
    lines = []

    def extend(df2):
        if len(df2):
            lines.extend(list(df2.sort_values(by=["approx_base_y", "x0"]).line))

    for col in cols:
        extend(col)
    reassign_lines(lines)
    return col_sep[-1]

`check_double_column()` ⚓︎

Checks whether the page is laid out in two or three columns and returns the x coordinate(s) separating the columns; returns None otherwise.

Returns:

Type	Description
`float | list(float) | None`	the column separator (two columns), the two column separators (three columns), or None when no column layout is detected

Source code in pdfstruct/page.py

def check_double_column(self):
    """Checks whether the page is laid out in two or three columns.

    A vertical separator is accepted when less than 10% of the text lines (header and footer
    zones excluded) cross it. Candidates are tried around the middle of the page for two
    columns, then around the thirds of the page for three columns.

    Returns:
        (float | list(float) | None): the x coordinate of the separator for two columns,
            the list of the two separators for three columns, None when no column layout is found.
    """
    lines = []
    for block in self.raw["blocks"]:
        if block["type"] != 0:
            continue
        for line in block["lines"]:
            # only horizontal left-to-right lines are considered
            if tuple(line["dir"]) != (1.0, 0.0):
                continue
            content = "".join([span["text"] for span in line["spans"]])
            if not content or not re.search(r"\S+", content):
                # keep only the lines with some real content (not just whitespaces)
                continue
            bbox = line["bbox"]
            lines.append(bbox)
    if not lines:
        # no text at all: nothing to measure, avoid computing statistics on an empty frame
        return None
    df = pd.DataFrame(lines, columns=["x0", "y0", "x1", "y1"])
    df = df.assign(width=(df.x1 - df.x0))
    min_x0 = df.x0.min()
    max_x1 = df.x1.max()
    page_width = max_x1 - min_x0
    max_width = df.width.quantile(q=0.9)

    height = self.height
    width = self.width
    logger.debug(f"param double column: {len(df)=} {height=} {width=}")

    # remove potential header and footer lines
    df = df[(df.y0 > 0.05 * height) & (df.y1 < 0.95 * height)]

    logger.debug(
        f"param double column: {len(df)=} {min_x0=} {max_x1=} {max_width=} {0.5*page_width=} {page_width=} {0.8*self.width=}"  # noqa: E501
    )

    if len(df) < 5:
        logger.debug(self.raw["blocks"])

    if not len(df):
        # (almost) empty page
        return None

    # a separator may be crossed by at most 10% of the lines
    max_inter = 0.1 * len(df)
    # the text area is used as an alternative reference when most lines are narrower than half of it
    # and it spans most of the page
    narrow_lines = max_width < 0.5 * page_width and page_width > 0.8 * self.width

    # we search a vertical line around the middle of the page
    # with only a small intersection with the horizontal lines
    # (intersection may be non empty because of headers, footers, and some tables)
    potential_sep = 0.5 * self.width
    potential_seps = [
        potential_sep,
        potential_sep - 0.02 * self.width,
        potential_sep + 0.02 * self.width,
    ]
    if narrow_lines:
        potential_seps.append(min_x0 + 0.5 * page_width)
    for potential_sep in potential_seps:
        inter = df[(df.x0 < potential_sep) & (potential_sep < df.x1)]
        ninter = len(inter)
        logger.debug(f"try sep2 {potential_sep=} {ninter=}, {0.1*len(df)=}")
        if ninter < max_inter:
            return potential_sep

    # try for 3 columns
    potential_seps = [(0.33 * self.width, 0.66 * self.width)]
    if narrow_lines:
        potential_seps.append((min_x0 + 0.33 * page_width, min_x0 + 0.66 * page_width))
    for sepA, sepB in potential_seps:
        interA = df[(df.x0 < sepA) & (sepA < df.x1)]
        interB = df[(df.x0 < sepB) & (sepB < df.x1)]
        ninter = len(interA) + len(interB)
        logger.debug(f"try sep3 {sepA=} {sepB=} {ninter=}, {0.1*len(df)=}")
        if ninter < max_inter:
            logger.info(f"Page {self.id} has 3 columns with {sepA=} {sepB=} {ninter=}")
            return [sepA, sepB]

    return None

`check_loop()` ⚓︎

Source code in pdfstruct/page.py

def check_loop(self):
    """Tell whether the ``next`` links of ``self.lines`` point backwards.

    ``self.lines`` is walked in order while the ``id`` of every visited line is
    remembered. As soon as a line's ``next`` refers to a line whose ``id`` was
    already visited, a warning is logged and the walk stops.

    Returns:
        (bool): True if such a backward link was found, False otherwise.
    """
    visited_ids = set()
    for line in self.lines:
        successor = line.next
        if successor and successor.id in visited_ids:
            logger.warning(f"loop in lines page {self.id} at line {line} next={line.next}")
            return True
        visited_ids.add(line.id)
    logger.debug(f"no loop in lines page {self.id}")
    return False

`clean()` ⚓︎

Removes deleted lines

Source code in pdfstruct/page.py

def clean(self):
    """Removes deleted lines.

    A line counts as deleted when both its ``prev`` and its ``next`` are falsy;
    every other line is kept, in its original order. ``self.lines`` is replaced
    by a new list.
    """
    survivors = []
    for line in self.lines:
        if line.prev or line.next:
            survivors.append(line)
    self.lines = survivors

`collect(kind=Section)` ⚓︎

Given a marker class, collects all the lines of the Page whose line.kind is an instance of that class.

Returns:

Type	Description
`iter`	an iterator of all the occurrences of the marker in the Page.

Source code in pdfstruct/page.py

def collect(self, kind=Section):
    """Select the lines of the `Page` whose `line.kind` is an instance of `kind`.

    Args:
        kind: marker class (or tuple of classes) passed to `isinstance`.

    Returns:
        (iter): a lazy iterator over the matching lines, in page order.
    """
    matching = (candidate for candidate in self.lines if isinstance(candidate.kind, kind))
    return matching

`detect_tables()` ⚓︎

Detects tables on a page.

Algorithm:

Finding the holes between lines, keeping the ones that make a continuous line across the page
Retrieving the columns from the coordinates of the holes.
Filtering columns/holes and keeping the potential tables of the page
Going over each line one last time and the ones that are parts of a table become Cells.

Source code in pdfstruct/page.py

def detect_tables(self):
    """Detects tables on a page.

    **Algorithm**:

    * Finding the holes between lines, keeping the ones that make a continuous line across the page
    * Retrieving the columns from the coordinates of the holes.
    * Filtering columns/holes and keeping the potential tables of the page
    * Going over each line one last time and the ones that are parts of a table become `Cells`.

    The horizontal span covered by the rows of `self.df` is cut into `max_bin`
    equal bins; a "hole" is a bin a line does not cover and a "column" is a
    maximal run of bins that are not holes. Lines recognised as table parts
    get `line.kind = Cell(cols=...)`; nothing is returned.

    NOTE(review): this reads the `x0`, `x1`, `where` and `line` columns of
    `self.df` - presumably one row per line of the page; confirm against the
    `df` property.
    """
    df = self.df
    min_x0 = df.x0.min()
    max_x1 = df.x1.max()
    min_bin = 0
    # at least 80 bins, more when the longest line content has more than 80 characters
    max_bin = max(80, max(len(line.content) for line in self.lines)) if self.lines else 80
    # horizontal size of one bin
    # NOTE(review): delta is 0 when max_x1 == min_x0, and it is used as a divisor below
    delta = (max_x1 - min_x0) / max_bin
    logger.debug(f"Page {self.id} max bin {max_bin} {delta=}")

    def holes2cols(holes):
        """Return the bins of [min_bin, max_bin] that are not in `holes`, as a list of (first, last) runs."""
        cols = set(range(min_bin, max_bin + 1)).difference(holes)
        starts = []
        ends: List[Any] = []
        for x in sorted(cols):
            if ends and x == ends[-1] + 1:
                # x extends the current run
                ends[-1] = x
            else:
                # x opens a new run
                starts.append(x)
                ends.append(x)
        return list(zip(starts, ends))

    def col_intersect(cols1, cols2):
        """Return True when one column of `cols2` overlaps at least two distinct columns of `cols1`."""
        # bin -> position of the column of cols1 that covers it
        mask = {}
        for _id, (u1, v1) in enumerate(cols1):
            for i in range(u1, v1 + 1):
                mask[i] = _id
        for u2, v2 in cols2:
            masks = [mask[i] for i in range(u2, v2 + 1) if i in mask]
            if len(set(masks)) > 1:
                return True
        return False

    # each value of the "where" column is processed on its own (rows with a missing value form a group too)
    for g, df2 in df.groupby(by="where", dropna=False):
        logger.debug(f"Page {self.id} processing {g}")
        # running list of (row index, holes) for the rows that may belong to the table being built
        all_holes: List[Any] = []
        # row index -> holes, for every row that was part of a run of more than 3 compatible rows
        potential_tables = {}
        for i, row in df2.iterrows():
            line = df2.loc[i].line
            if line.kind and type(line.kind) is not Cell:
                # a line that already has another kind interrupts any table being built
                all_holes = []
                continue
            bin_x0 = int((row.x0 - min_x0) / delta)
            bin_x1 = int((row.x1 - min_x0) / delta)
            # every bin left or right of the line is a hole for this line
            holes = set(range(min_bin, bin_x0)).union(range(bin_x1 + 1, max_bin + 1))
            cols = holes2cols(holes)
            old_size = len(all_holes)
            # drop the earlier rows whose columns would be bridged by the current line
            all_holes = [xholes for xholes in all_holes if not col_intersect(holes2cols(xholes[1]), cols)]
            # narrow the holes of the remaining rows to those shared with the current line
            all_holes = [(j, _holes.intersection(holes)) for j, _holes in all_holes]
            # keep only the rows that still show more than one column
            all_holes = [xholes for xholes in all_holes if len(holes2cols(xholes[1])) > 1]
            # only referenced by the disabled debug statements below
            all_cols = [holes2cols(_holes) for _, _holes in all_holes]
            if old_size - len(all_holes) > 6:
                # a drastic size reduction in the number of potential cells in a table
                # may indicate that we have reached the end
                all_holes = []
            all_holes.append((i, holes))
            # debugging aid, permanently disabled by the leading `False`
            if False and len(all_holes) > 1:
                logger.debug(f"line {i} {df2.loc[i].line} : cols={holes2cols(holes)} all_cols={all_cols[-5:]}")
                pass
            if len(all_holes) > 3:
                # logger.debug(
                #    f"add table lines #={len(all_holes)} {i} {df2.loc[i].line} cols={cols} all_cols={all_cols}"
                # )
                for j, _holes in all_holes:
                    potential_tables[j] = _holes
        # second pass: turn the retained rows into Cells, in index order
        # NOTE(review): `prev` is tested by truthiness, so a previous index of 0 counts as "no previous row"
        prev = None
        for i in sorted(potential_tables):
            cols = holes2cols(potential_tables[i])
            if not cols:
                # not in a table (should be handled above)
                continue
            if not prev or i > prev + 1:
                # starting a new table: need additional checks
                # the row with index i + 1 must exist and start at most 10 units below line i
                j = i + 1
                line_i = df.loc[i].line
                # line_i_content = df.loc[i].content
                try:
                    line_j = df.loc[j].line
                    if line_j.y0 > line_i.y1 + 10:
                        continue
                except Exception:
                    # line_i is maybe the last one in the page !
                    continue
            logger.debug(f"page {self.id} line {i} in table cols={cols}")
            cell = df.loc[i].line
            # the columns stored on the Cell are recomputed from the line's own extent,
            # not taken from the narrowed holes kept in potential_tables
            bin_x0 = int((cell.x0 - min_x0) / delta)
            bin_x1 = int((cell.x1 - min_x0) / delta)
            holes = set(range(min_bin, bin_x0)).union(range(bin_x1 + 1, max_bin + 1))
            cols = holes2cols(holes)
            if not len(cols):
                logger.warning(f"empty col list for cell {cell} holes={holes}")
            cell.kind = Cell(cols=cols)
            prev = i

`display(start=None, end=None)` ⚓︎

Source code in pdfstruct/page.py

def display(self, start=None, end=None):
    """Print a plain-text view of the page on standard output.

    Lines ``start`` (inclusive, default 0) to ``end`` (exclusive, default: all
    lines) of ``self.lines`` are printed. Consecutive ``Cell`` lines that carry
    columns are grouped into rows (lines whose ``base_y`` differ by less than 5
    share a row) and printed as ``|``-delimited cells padded according to
    their last column; every other line is printed as is.
    """
    first = 0 if start is None else start
    last = len(self.lines) if end is None else end
    max_chars = 120
    pending: List[Any] = []

    def rows_display(rows):
        # overall range of the last column of every cell, used to scale offsets to max_chars
        last_cols = [line.kind.cols[-1] for row in rows for line in row]
        max_col_offset = max(col[1] for col in last_cols)
        min_col_offset = min(col[0] for col in last_cols)
        if max_col_offset == min_col_offset:
            # avoid a zero-width range
            max_col_offset += 10

        def char_offset(col_offset):
            return (col_offset - min_col_offset) * max_chars / (max_col_offset - min_col_offset)

        for row in rows:
            row = sorted(row, key=lambda line: line.kind.cols[-1][0])
            starts = [char_offset(line.kind.cols[-1][0]) for line in row]
            starts.append(max_chars)
            ends = [0]
            ends.extend(char_offset(line.kind.cols[-1][1]) for line in row)
            # half of the free space between the end of a cell and the start of the next one
            paddings = [int((s - e) / 2) for s, e in zip(starts, ends)]
            pieces = []
            for pad, line in zip(paddings, row):
                margin = " " * pad
                pieces.append(f"|{margin}{line.display()}{margin}|")
            txt = "".join(pieces)
            print(f"[{row[0].where}] {txt}")

    for i in range(first, min(last, len(self.lines))):
        line = self.lines[i]
        if not (type(line.kind) is Cell and line.kind.cols):
            # a non-table line closes the table being collected, if any
            if pending:
                rows_display(pending)
                pending = []
            print(line)
            continue
        if pending and abs(pending[-1][-1].base_y - line.base_y) < 5:
            pending[-1].append(line)
        else:
            pending.append([line])
    if pending:
        rows_display(pending)

`display_html(start=None, end=None)` ⚓︎

Returns:

Type	Description
`str`	an HTML version of the Page

Source code in pdfstruct/page.py

def display_html(self, start=None, end=None):
    """Render the page as an HTML document, print it and return it.

    Lines ``start`` (inclusive, default 0) to ``end`` (exclusive, default: all
    lines) of ``self.lines`` are rendered as follows:

    * consecutive ``Cell`` lines are grouped into rows (lines whose ``base_y``
      differ by less than 5 share a row) and emitted as one ``table`` element;
    * a run of lines without any kind is joined with spaces into one ``p``
      element;
    * any other line is emitted through its own ``display_html()``.

    A footer with "previous page" / "next page" links is appended; the links
    point to ``./overview.html`` at either end of the document.

    Returns:
        (str) : an HTML version of the Page (the value built by ``make_html``
        for the ``html`` element)
    """
    html_id = self.id + 1
    first = 0 if start is None else start
    last = len(self.lines) if end is None else end
    fragments: List[Any] = []
    table_rows: List[Any] = []
    paragraph: List[Any] = []

    def rows_display_html(rows):
        # The two reductions below only serve as an early check: they raise when
        # a cell has no column, after logging every collected line for debugging.
        try:
            max(line.kind.cols[-1][1] for row in rows for line in row)
        except Exception as e:
            for i, row in enumerate(rows):
                for j, line in enumerate(row):
                    logger.debug(
                        f"error {e} row {i} line {j} line={line} marker={type(line.kind)} {line.kind.cols}"
                    )
            raise
        min(line.kind.cols[-1][0] for row in rows for line in row)

        trs: List[str] = []
        for row in rows:
            # cells of a row are ordered by the start of their last column
            ordered = sorted(row, key=lambda line: line.kind.cols[-1][0])
            cells = "\n".join(line.display_html() for line in ordered)
            trs.append(make_html(tag="tr", text=cells, parent=True))
        fragments.append(make_html(text="\n".join(trs), tag="table", attval=[("border", "1")], parent=True))

    def flush_rows():
        if table_rows:
            rows_display_html(table_rows)
            table_rows.clear()

    def flush_paragraph():
        if paragraph:
            fragments.append(make_html(tag="p", text=" ".join(paragraph)))
            paragraph.clear()

    def nav_cell(label, target, style):
        anchor = make_html(text=label, tag="a", attval=[("href", target), ("target", "_self")])
        return make_html(text=anchor, tag="div", attval=[("style", style)], parent=True)

    stop = min(last, len(self.lines))
    for i in range(first, stop):
        line = self.lines[i]

        if type(line.kind) is Cell:
            flush_paragraph()
            if table_rows and abs(table_rows[-1][-1].base_y - line.base_y) < 5:
                table_rows[-1].append(line)
            else:
                table_rows.append([line])
            continue

        if line.kind is None:
            flush_rows()
            if not paragraph:
                # Collect this line and the kind-less lines chained after it in one go;
                # the following iterations find the paragraph already filled and skip.
                paragraph.append(line.content)
                visited = i
                follower = line.next
                while follower and follower.kind is None and visited <= stop - 2:
                    paragraph.append(follower.content)
                    follower = follower.next
                    visited += 1
            continue

        flush_rows()
        flush_paragraph()
        fragments.append(line.display_html())

    flush_rows()
    flush_paragraph()

    # navigation footer: left half goes back, right half goes forward
    footer_cells = []
    previous_target = f"./page_{html_id-1}.html" if (html_id - 1) > 0 else "./overview.html"
    footer_cells.append(
        nav_cell("Page précédente", previous_target, "position:absolute; left:0%; width:50%; height:100%;")
    )
    next_target = f"./page_{html_id+1}.html" if html_id < self.doc_len else "./overview.html"
    footer_cells.append(
        nav_cell("Page suivante", next_target, "position:absolute; left:50%; width:50%; height:100%;")
    )
    footer = make_html(
        text="\n".join(footer_cells),
        tag="div",
        attval=[("style", "width:300px;height:100px;position:relative")],
        parent=True,
    )
    fragments.append(footer)

    page_html = make_html(tag="html", text="\n".join(fragments), parent=True)
    print(page_html)
    return page_html

`handle_dense_index_page(index=SecIndex)` ⚓︎

Handles a page that is dense with index entries, looking for gaps between them to be corrected (should be rare).

Source code in pdfstruct/page.py

def handle_dense_index_page(self, index=SecIndex):
    """Handle a page that is dense with index entries, looking for gaps to be corrected (should be rare).

    The page is processed only when it holds more than 10 lines of kind `index`,
    or when such lines make up more than 70% of its lines. For every index line
    but the last one, the lines that follow it on the same page up to the next
    index line form a "gap". When `index` is `SecIndex`:

    * a gap line matching `SECTION_RE` with a `page` group becomes a `SecIndex`;
    * a gap line of kind `Section` is tentatively concatenated with the gap
      lines that follow it; when the concatenation matches `SECTION_RE` with a
      `page` group, `join_next()` is called on it once per concatenated line;
    * finally every line of the page that is neither a `SecIndex` nor an
      `OtherIndex` and matches `SPECIAL_SECTION_ENTRY_RE` becomes a `SecIndex`.

    Args:
        index: the index marker class to look for (`SecIndex` by default).
    """
    nlines = len(self.lines)
    vspace = self.vspace
    logger.debug(f"vspace {vspace}")
    indexes = list(self.collect(kind=index))
    logger.debug(
        f"handling potential dense index type={index} on page {self.id} #indexes={len(indexes)} #lines={nlines}"
    )
    # same density criterion as is_tdm_page
    if nlines and (len(indexes) > 10 or len(indexes) > 0.7 * nlines):
        logger.info(f"Page {self.id} handling dense page for index={index}")
        for line in indexes[:-1]:
            logger.debug(f"dense index {line}")
            # lines found after this index line, up to the next index line or the end of the page
            gap = []
            next_line = line.next
            while next_line and type(next_line.kind) is not index and next_line.page == line.page:
                logger.debug(f"expand index gap with <page{next_line.page}> {next_line}")
                gap.append(next_line)
                next_line = next_line.next
            if gap:
                content = " ".join(lline.content for lline in gap)
                logger.info(f"Page {self.id} gap in indexes for #lines={len(gap)} content={content}")
                # TO BE COMPLETED
                for i, xline in enumerate(gap):
                    if type(xline.kind) is index:
                        continue
                    if index == SecIndex:
                        # badly formatted index line
                        # (the match is given a 1 second timeout; any exception is logged and treated as no match)
                        try:
                            m = SECTION_RE.match(xline.content, timeout=1)
                        except Exception:
                            logger.exception(f"regex timeout on content {xline.content}")
                            m = None
                        if m and m.group("page"):
                            xline.kind = SecIndex(m)
                            logger.info(f"Page {self.id} switched line {xline} to index")
                            continue
                        # multi-line index entry (3-lines)
                        if type(xline.kind) is Section:
                            # NOTE(review): this value of next_line is not read again in this branch
                            next_line = xline.next
                            content = xline.content
                            # gap lines appended to xline's content so far
                            merge = []
                            prev_line = xline
                            for j in range(i + 1, len(gap)):
                                yline = gap[j]
                                # stop at the next section / index line, or when the
                                # vertical distance to the previous line exceeds vspace
                                if (
                                    type(yline.kind) in {Section, SecIndex}
                                    or yline.base_y > prev_line.base_y + vspace
                                ):
                                    break
                                content += " " + yline.content
                                merge.append(yline)
                                try:
                                    m = SECTION_RE.match(content, timeout=1)
                                except Exception:
                                    logger.exception(f"regex timeout on content {content}")
                                    m = None
                                if m and m.group("page"):
                                    logger.debug(f"found tdm entry merging {merge}")
                                    # one join_next() call per concatenated line
                                    # NOTE(review): xline.kind is not changed here, and the joined
                                    # lines stay in `gap`, so the enclosing loop still visits them
                                    for _ in merge:
                                        # logger.debug(f"merging")
                                        xline.join_next()
                                    break
                                prev_line = yline
        # indexes cannot be empty here: the density test above requires at least one index line
        logger.debug(f"last index line {indexes[-1]}")
        if index == SecIndex:
            # looking for section entries without numbering as first or last entries
            # such as Introduction, Preamble or Conclusion
            def check_special_section_entry(potential_entry):
                """Turn `potential_entry` into a `SecIndex` when it is on this page and matches `SPECIAL_SECTION_ENTRY_RE`."""
                if potential_entry and potential_entry.page == self.id:
                    # NOTE(review): no timeout is passed here, unlike the SECTION_RE matches above
                    m = SPECIAL_SECTION_ENTRY_RE.match(potential_entry.content)
                    if m:
                        logger.debug(f"special toc {m.group('dot')=} {m.group('page')=} {m.group('title')=}")
                        potential_entry.kind = SecIndex(m)
                        logger.info(f"Page {self.id} switched to section entry line {potential_entry}")

            # check_special_section_entry(indexes[0].prev)
            # check_special_section_entry(indexes[-1].next)
            # every non-index line of the page is tested, not only the neighbours of the first and last index
            for line in self.lines:
                logger.debug(f"Page {self.id} try0 special toc line kind={line.kind} {line}")
                if type(line.kind) not in {SecIndex, OtherIndex}:
                    logger.debug(f"Page {self.id} try special toc line {line}")
                    check_special_section_entry(line)

`is_tdm_page(index=SecIndex)` ⚓︎

Checks whether a Page is actually a Table of Contents. A Page qualifies when it holds more than 10 index lines, or when index lines make up more than 70% of its lines.

Returns:

Type	Description
`bool`	if True, the Page contains a Table of Contents.

Source code in pdfstruct/page.py

def is_tdm_page(self, index=SecIndex):
    """Checks whether a Page is actually a Table of Contents.

    A Page qualifies when it holds more than 10 lines of kind `index`, or when
    such lines make up more than 70% of its lines. A Page without any line
    never qualifies.

    Args:
        index: the index marker class to count (`SecIndex` by default).

    Returns:
        (bool): if True, the Page contains a Table of Contents.

    """
    nlines = len(self.lines)
    if not nlines:
        # return a real bool instead of leaking the integer 0 out of an `and` chain
        return False
    nindexes = sum(1 for _ in self.collect(kind=index))
    return nindexes > 10 or nindexes > 0.7 * nlines

`rotate()` ⚓︎

Source code in pdfstruct/page.py

def rotate(self):
    """Rotate the raw page description by a quarter turn, in place.

    The ``width`` and ``height`` of ``self.raw`` are swapped and the ``bbox`` of
    every block is remapped from ``(x0, y0, x1, y1)`` to
    ``(height - y1, x0, height - y0, x1)``, where ``height`` is the height
    before the swap. For blocks of type 0 the same mapping is applied to every
    line and span, each line gets the direction ``(1.0, 0.0)`` and each span
    origin ``(x, y)`` becomes ``(height - y, x)``.
    """
    page = self.raw
    old_width, old_height = page["width"], page["height"]
    page["width"] = old_height
    page["height"] = old_width

    def turned(box):
        return (old_height - box[3], box[0], old_height - box[1], box[2])

    for block in page["blocks"]:
        block["bbox"] = turned(block["bbox"])
        if block["type"] == 0:
            # only type-0 blocks carry lines and spans
            for line in block["lines"]:
                line["dir"] = (1.0, 0.0)
                line["bbox"] = turned(line["bbox"])
                for span in line["spans"]:
                    span["bbox"] = turned(span["bbox"])
                    origin_x, origin_y = span["origin"][0], span["origin"][1]
                    span["origin"] = (old_height - origin_y, origin_x)

`set_orientation()` ⚓︎

Source code in pdfstruct/page.py

def set_orientation(self):
    """Determine the page orientation from the direction of its text lines.

    The ``dir`` of every line of every type-0 block of ``self.raw`` is counted.
    When the most frequent direction is ``(0, -1)`` the page is rotated with
    ``self.rotate()`` and ``self.orientation`` is set to ``"landscape"``; in
    every other case, including a page without any text line, it is set to
    ``"portrait"``. The outcome is logged, at info level for a first page or
    when the orientation differs from the previous page's.
    """
    direction_counts: Any = Counter(
        tuple(text_line["dir"])
        for block in self.raw["blocks"]
        if block["type"] == 0
        for text_line in block["lines"]
    )
    if direction_counts:
        orientation = direction_counts.most_common(1)[0][0]
    else:
        # page with no textual content
        orientation = (0, 1)
    logger.debug(f"Page {self.id} main direction is {orientation}")

    sideways = orientation == (0, -1)
    if sideways:
        self.rotate()
    self.orientation = "landscape" if sideways else "portrait"

    if not self.prev:
        logger.info(f"Page {self.id} orientation is {self.orientation}")
    elif self.prev.orientation and self.prev.orientation != self.orientation:
        logger.info(f"Page {self.id} orientation switched from {self.prev.orientation} to {self.orientation}")
    else:
        logger.debug(f"Page {self.id} orientation is {self.orientation}")

Page⚓︎

blobs: List[Any] = [] instance-attribute ⚓︎

df property ⚓︎

doc = doc instance-attribute ⚓︎

doc_len = len(self.doc) if doc_len is None else doc_len instance-attribute ⚓︎

height = raw['height'] instance-attribute ⚓︎

id = pageid instance-attribute ⚓︎

lines: List[Any] = [] instance-attribute ⚓︎

links = links instance-attribute ⚓︎

most_common_line_spacing property ⚓︎

most_common_styles property ⚓︎

next = next instance-attribute ⚓︎

page_number = None instance-attribute ⚓︎

prev = prev instance-attribute ⚓︎

raw = raw instance-attribute ⚓︎

sections property ⚓︎

tdm property ⚓︎

vspace property ⚓︎

width = raw['width'] instance-attribute ⚓︎

words = doc.words[self.id] if doc.words is not None else None instance-attribute ⚓︎

__init__(pageid, raw, doc, links=None, prev=None, next=None, images=None, ocr=None, doc_len=None) ⚓︎

add_line(line) ⚓︎

assign_column_info(col_sep) ⚓︎

check_double_column() ⚓︎

check_loop() ⚓︎

clean() ⚓︎

collect(kind=Section) ⚓︎

detect_tables() ⚓︎

display(start=None, end=None) ⚓︎

display_html(start=None, end=None) ⚓︎

handle_dense_index_page(index=SecIndex) ⚓︎

is_tdm_page(index=SecIndex) ⚓︎

rotate() ⚓︎

set_orientation() ⚓︎

`blobs: List[Any] = []` `instance-attribute` ⚓︎

`df` `property` ⚓︎

`doc = doc` `instance-attribute` ⚓︎

`doc_len = len(self.doc) if doc_len is None else doc_len` `instance-attribute` ⚓︎

`height = raw['height']` `instance-attribute` ⚓︎

`id = pageid` `instance-attribute` ⚓︎

`lines: List[Any] = []` `instance-attribute` ⚓︎

`links = links` `instance-attribute` ⚓︎

`most_common_line_spacing` `property` ⚓︎

`most_common_styles` `property` ⚓︎

`next = next` `instance-attribute` ⚓︎

`page_number = None` `instance-attribute` ⚓︎

`prev = prev` `instance-attribute` ⚓︎

`raw = raw` `instance-attribute` ⚓︎

`sections` `property` ⚓︎

`tdm` `property` ⚓︎

`vspace` `property` ⚓︎

`width = raw['width']` `instance-attribute` ⚓︎

`words = doc.words[self.id] if doc.words is not None else None` `instance-attribute` ⚓︎

`__init__(pageid, raw, doc, links=None, prev=None, next=None, images=None, ocr=None, doc_len=None)` ⚓︎

`add_line(line)` ⚓︎

`assign_column_info(col_sep)` ⚓︎

`check_double_column()` ⚓︎

`check_loop()` ⚓︎

`clean()` ⚓︎

`collect(kind=Section)` ⚓︎

`detect_tables()` ⚓︎

`display(start=None, end=None)` ⚓︎

`display_html(start=None, end=None)` ⚓︎

`handle_dense_index_page(index=SecIndex)` ⚓︎

`is_tdm_page(index=SecIndex)` ⚓︎

`rotate()` ⚓︎

`set_orientation()` ⚓︎