Logical Sections and Paragraph Extraction (in progress)⚓︎

An optional class is dedicated to reconstructing the logical sections of the document and dividing them into paragraphs. To do so, the `extract_words` parameter must be set to `True` in the `from_pdf()` class method of the Collection or the Document.

# word extraction has to be enabled when the PDF is loaded
doc = Document.from_pdf("my_pdf_file.pdf", extract_words=True)

# the index of the logical section
index = 2
logical = doc.get_logical_section(index=index)
print(logical.title)

# paragraphs and lists of the section, ordered by the position of their first line
for para in logical.get_all_paragraphs():
    print(para.content)
    print(para.words)

Logical Section⚓︎

Source code in pdfstruct/logical_section.py

class LogicalSection:
    """The lines, lists and paragraphs found between a section title and the next one.

    Attributes set by the constructor:
        title: the line object used as the section title.
        doc: the document the section belongs to.
        id: position of ``title`` in the ``sections`` sequence.
        next_section: the title following this one in ``sections``, or ``None``.
        lines: the lines collected by ``get_section_lines`` (empty on failure).
        page_start / page_end: page of the title / page of the last collected line.
        pages: sorted, de-duplicated pages of the collected lines.
        lists / paragraphs: results of ``detect_lists`` / ``detect_paragraphs``.
    """

    def __init__(self, doc, title, sections):
        """Collect the section's lines, then detect its lists and paragraphs.

        Args:
            doc: document object exposing ``pages``.
            title: title line of the section; must be an element of ``sections``.
            sections: ordered sequence of all section titles.

        Raises:
            ValueError: if ``title`` is not in ``sections``.
        """
        self.title = title
        self.page_start = title.page
        self.doc = doc
        self.id = sections.index(title)
        self.next_section = sections[self.id + 1] if self.id < len(sections) - 1 else None
        try:
            self.lines = self.get_section_lines()
        except Exception:
            logger.exception(f"pb collecting lines for section {title}")
            # Fall back to an empty section; without this, self.lines would be
            # unset and the very next statement would raise AttributeError.
            self.lines = []
        logger.info(f"got all lines #={len(self.lines)}")
        if self.lines:
            # sometimes titles follow each other directly
            self.page_end = self.lines[-1].page
            self.pages = sorted(list(set([line.page for line in self.lines])))
            logger.info("detecting lists")
            self.lists = self.detect_lists()
            logger.info("detecting paragraphs")
            self.paragraphs = self.detect_paragraphs()
            logger.info("done")
        else:
            self.page_end = self.page_start
            self.pages = [self.page_start]
            self.lists, self.paragraphs = [], []

    def get_section_lines(self):
        """Return the lines belonging to this section.

        When a next section exists, lines are gathered page by page from the
        title (included) up to the next title (excluded); lines whose ``kind``
        is exactly a ``Header`` or ``Footer`` are dropped, and a page yielding
        2000 lines or more is skipped with a warning. For the last section,
        every line after the title, up to the end of the document, is returned
        (the title itself is excluded).

        Returns:
            A list of line objects; empty if the title is not found on its page.

        Raises:
            ValueError: if the next section's title is not found on its page.
        """
        # can't use a while loop with line.next because lines are indexed via their page, not the whole document
        # so for now, we go through the pages of the document and collect the lines in chunks
        section_page = self.doc.pages[self.page_start]

        logger.debug(f"section {self.title} starting on page {section_page.id}")

        page_lines = section_page.lines

        try:
            section_pageindex = page_lines.index(self.title)
        except ValueError:
            msg = "\n".join(str(p) for p in page_lines)
            logger.error(f"section {self.title} not found in page {self.page_start} with lines={msg}")
            return []

        if self.next_section is not None:
            start_pageid = self.page_start
            end_pageid = self.next_section.page
            logger.debug(
                f"collecting lines until next section {self.next_section} from page {start_pageid} to {end_pageid}"
            )

            lines: List[Any] = []
            for pageid in range(start_pageid, end_pageid + 1):
                logger.debug(f"collecting on page {pageid}")
                p = self.doc.pages[pageid]
                _lines = p.lines
                # Trim the end first: when both titles share a page, the start
                # index computed on the full page is still valid afterwards.
                if pageid == end_pageid:
                    _lines = _lines[: p.lines.index(self.next_section)]
                if pageid == start_pageid:
                    _lines = _lines[section_pageindex:]
                _lines = [_line for _line in _lines if type(_line.kind) not in {Header, Footer}]
                if len(_lines) < 2000:
                    lines.extend(_lines)
                else:
                    logger.warning(f"skipping page {pageid} because of too many lines {len(_lines)}")
                logger.debug(f"#lines={len(lines)} #llines={len(_lines)}")
        else:
            # it's the last section of the document
            logger.debug("collecting lines on all remaining pages")
            lines = []
            # Pages must be sliced by page index (page_start); section_pageindex
            # is the title's position among the lines of its page.
            for offset, page in enumerate(self.doc.pages[self.page_start :]):
                # On the title's own page only what follows the title belongs
                # here; earlier lines were already taken by the previous section.
                candidates = page.lines[section_pageindex + 1 :] if offset == 0 else page.lines
                lines.extend(line for line in candidates if line != self.title)
        return lines

    def display(self):
        """Print every collected line of the section."""
        for line in self.lines:
            print(line)

    def detect_lists(self, thr=3):
        """Detect bullet/numbered lists among the section lines.

        A list starts at a line ending with ":" whose ``next`` line matches
        ``pats["list_start"]``. The intro is either that single line or, when
        the line looks like the continuation of a paragraph, whatever
        ``collect_intro`` gathers from the preceding lines. A line found at
        index 0 or 1 of its page and matching the segmenter of the most recent
        list is treated as a continuation of that list on a new page.

        Args:
            thr: tolerance applied to spacing comparisons; also forwarded to
                ``collect_intro`` and ``collect_list_items``.

        Returns:
            A list of ``TextList`` objects.
        """
        # TODO : there are still some issues with nested lists, as they are detected twice.
        doc = self.doc
        lst = self.lines

        final_ponct = re.compile(r"([\.\?\!…])")
        pages = []
        fulls = []
        for i, line in enumerate(lst):
            page = doc.pages[line.page]
            # NOTE(review): for i == 0 without line.prev, lst[i - 1] is the LAST
            # line of the section; the `i == 0` test below keeps this harmless
            # for capitalised lines.
            prev_line = line.prev if line.prev else lst[i - 1]
            prev_spacing = line.prev_spacing if line.prev else abs((round(line.y0) - round(lst[i - 1].y1)))
            prev_delta_spacing = prev_spacing - page.most_common_line_spacing
            # potential start of list
            if line.content.endswith(":") and line.next:
                m = re.match(pats["list_start"], line.next.content)
                if m:
                    pages.append(line.page)
                    rest = lst[i + 2 :]
                    # if the entirety of the intro is in the current line, we only need to collect the following lines
                    if line.content[0].isupper() and (
                        prev_delta_spacing > thr
                        # compare rounded coordinates (the original rounded the
                        # boolean result of the comparison instead)
                        or round(prev_line.x0) > round(line.x0) + 3
                        or i == 0
                        # [-1:] instead of [-1]: tolerate a previous line with empty content
                        or (prev_line.x1 < line.x1 and final_ponct.match(prev_line.content[-1:]))
                    ):
                        intro = [line]
                    else:
                        # otherwise, we must capture the lines before and add it to the 'intro paragraph'.
                        before = list(reversed(lst[:i]))
                        intro = collect_intro(line, before, page, thr)
                    items = collect_list_items(m, line, rest, page, thr)
                    fulls.append(TextList(intro=intro, items=items, m=m, doc=self.doc))
            # potential rest of list but on another page
            if pages and fulls:
                last = fulls[-1]
                try:
                    line_index = page.lines.index(line)
                    if re.match(last.segmenter, line.content) and line_index <= 1:
                        rest = lst[i + 1 :]
                        other_items = collect_list_items(last.match, line, rest, page, thr, newpage=True)
                        last.add_items(other_items)
                except Exception:
                    # Best effort: a failed continuation check must not abort
                    # detection, but leave a trace instead of hiding it.
                    logger.debug("list continuation check failed", exc_info=True)
        return fulls

    def detect_paragraphs(self, thr=3):
        """Group the section's plain text lines into paragraphs.

        Lines with ``kind is None`` that do not belong to a detected list are
        first gathered into runs of consecutive lines. Inside a run, a
        paragraph ends after a line when the gap to the next line exceeds the
        page's most common line spacing by more than ``thr`` AND the line
        either ends with one of ``. ? ! … ;`` or starts more than 20 units to
        the right of the next line.

        Args:
            thr: tolerance on the line-spacing difference.

        Returns:
            A list of ``Paragraph`` objects.
        """
        doc = self.doc
        all_lists_lines = [line for lst in self.lists for line in lst.lines]

        def filter_lines():
            # Runs of consecutive plain text lines; any other line closes the current run.
            groups = []
            group = []
            for line in self.lines:
                if line.kind is None and line not in all_lists_lines:
                    group.append(line)
                elif group:
                    groups.append(group)
                    group = []
            if group:
                groups.append(group)
            return groups

        paragraphs = []
        final_ponct = re.compile(r"[\.\?\!…;]")
        for lst in filter_lines():
            paras = []
            para = []
            # NOTE(review): spacing reference is the page of the run's first
            # line, even when the run continues on later pages.
            page = doc.pages[lst[0].page]
            for i, line in enumerate(lst):
                para.append(line)
                if i == len(lst) - 1:
                    paras.append(para)
                    break
                next_line = lst[i + 1]
                line_spacing = abs((round(next_line.y0) - round(line.y1)))
                diff = line_spacing - page.most_common_line_spacing
                # [-1:] instead of [-1]: a line with empty content simply has no final punctuation
                if diff > thr and (final_ponct.match(line.content[-1:]) or line.x0 > next_line.x0 + 20):
                    paras.append(para)
                    para = []
            paragraphs.extend([Paragraph(p, doc) for p in paras])
        return paragraphs

    def get_all_paragraphs(self):
        """Return paragraphs and lists together, ordered by the position of their first line in ``self.lines``.

        Raises:
            ValueError: if the first line of an object is not in ``self.lines``.
        """
        all_objs = self.paragraphs + self.lists

        all_objs.sort(key=lambda x: self.lines.index(x.lines[0]))

        return all_objs

`doc = doc` `instance-attribute` ⚓︎

`id = sections.index(title)` `instance-attribute` ⚓︎

`lines = self.get_section_lines()` `instance-attribute` ⚓︎

`lists = self.detect_lists()` `instance-attribute` ⚓︎

`next_section = sections[self.id + 1] if self.id < len(sections) - 1 else None` `instance-attribute` ⚓︎

`page_end = self.lines[-1].page` `instance-attribute` ⚓︎

`page_start = title.page` `instance-attribute` ⚓︎

`pages = sorted(list(set([line.page for line in self.lines])))` `instance-attribute` ⚓︎

`paragraphs = self.detect_paragraphs()` `instance-attribute` ⚓︎

`title = title` `instance-attribute` ⚓︎

`__init__(doc, title, sections)` ⚓︎

Source code in pdfstruct/logical_section.py

def __init__(self, doc, title, sections):
    """Collect the section's lines, then detect its lists and paragraphs.

    Args:
        doc: document object exposing ``pages``.
        title: title line of the section; must be an element of ``sections``.
        sections: ordered sequence of all section titles.

    Raises:
        ValueError: if ``title`` is not in ``sections``.
    """
    self.title = title
    self.page_start = title.page
    self.doc = doc
    self.id = sections.index(title)
    self.next_section = sections[self.id + 1] if self.id < len(sections) - 1 else None
    try:
        self.lines = self.get_section_lines()
    except Exception:
        logger.exception(f"pb collecting lines for section {title}")
        # Fall back to an empty section; without this, self.lines would be
        # unset and the very next statement would raise AttributeError.
        self.lines = []
    logger.info(f"got all lines #={len(self.lines)}")
    if self.lines:
        # sometimes titles follow each other directly
        self.page_end = self.lines[-1].page
        self.pages = sorted(list(set([line.page for line in self.lines])))
        logger.info("detecting lists")
        self.lists = self.detect_lists()
        logger.info("detecting paragraphs")
        self.paragraphs = self.detect_paragraphs()
        logger.info("done")
    else:
        self.page_end = self.page_start
        self.pages = [self.page_start]
        self.lists, self.paragraphs = [], []

`detect_lists(thr=3)` ⚓︎

Source code in pdfstruct/logical_section.py

def detect_lists(self, thr=3):
    """Detect bullet/numbered lists among the section lines.

    A list starts at a line ending with ":" whose ``next`` line matches
    ``pats["list_start"]``. The intro is either that single line or, when the
    line looks like the continuation of a paragraph, whatever
    ``collect_intro`` gathers from the preceding lines. A line found at index
    0 or 1 of its page and matching the segmenter of the most recent list is
    treated as a continuation of that list on a new page.

    Args:
        thr: tolerance applied to spacing comparisons; also forwarded to
            ``collect_intro`` and ``collect_list_items``.

    Returns:
        A list of ``TextList`` objects.
    """
    # TODO : there are still some issues with nested lists, as they are detected twice.
    doc = self.doc
    lst = self.lines

    final_ponct = re.compile(r"([\.\?\!…])")
    pages = []
    fulls = []
    for i, line in enumerate(lst):
        page = doc.pages[line.page]
        # NOTE(review): for i == 0 without line.prev, lst[i - 1] is the LAST
        # line of the section; the `i == 0` test below keeps this harmless for
        # capitalised lines.
        prev_line = line.prev if line.prev else lst[i - 1]
        prev_spacing = line.prev_spacing if line.prev else abs((round(line.y0) - round(lst[i - 1].y1)))
        prev_delta_spacing = prev_spacing - page.most_common_line_spacing
        # potential start of list
        if line.content.endswith(":") and line.next:
            m = re.match(pats["list_start"], line.next.content)
            if m:
                pages.append(line.page)
                rest = lst[i + 2 :]
                # if the entirety of the intro is in the current line, we only need to collect the following lines
                if line.content[0].isupper() and (
                    prev_delta_spacing > thr
                    # compare rounded coordinates (the original rounded the
                    # boolean result of the comparison instead)
                    or round(prev_line.x0) > round(line.x0) + 3
                    or i == 0
                    # [-1:] instead of [-1]: tolerate a previous line with empty content
                    or (prev_line.x1 < line.x1 and final_ponct.match(prev_line.content[-1:]))
                ):
                    intro = [line]
                else:
                    # otherwise, we must capture the lines before and add it to the 'intro paragraph'.
                    before = list(reversed(lst[:i]))
                    intro = collect_intro(line, before, page, thr)
                items = collect_list_items(m, line, rest, page, thr)
                fulls.append(TextList(intro=intro, items=items, m=m, doc=self.doc))
        # potential rest of list but on another page
        if pages and fulls:
            last = fulls[-1]
            try:
                line_index = page.lines.index(line)
                if re.match(last.segmenter, line.content) and line_index <= 1:
                    rest = lst[i + 1 :]
                    other_items = collect_list_items(last.match, line, rest, page, thr, newpage=True)
                    last.add_items(other_items)
            except Exception:
                # Best effort: a failed continuation check must not abort
                # detection, but leave a trace instead of hiding it.
                logger.debug("list continuation check failed", exc_info=True)
    return fulls

`detect_paragraphs(thr=3)` ⚓︎

Source code in pdfstruct/logical_section.py

def detect_paragraphs(self, thr=3):
    """Group the section's plain text lines into paragraphs.

    Lines with ``kind is None`` that do not belong to a detected list are
    first gathered into runs of consecutive lines. Inside a run, a paragraph
    ends after a line when the gap to the next line exceeds the page's most
    common line spacing by more than ``thr`` AND the line either ends with one
    of ``. ? ! … ;`` or starts more than 20 units to the right of the next
    line.

    Args:
        thr: tolerance on the line-spacing difference.

    Returns:
        A list of ``Paragraph`` objects.
    """
    doc = self.doc
    all_lists_lines = [line for lst in self.lists for line in lst.lines]

    def filter_lines():
        # Runs of consecutive plain text lines; any other line closes the current run.
        groups = []
        group = []
        for line in self.lines:
            if line.kind is None and line not in all_lists_lines:
                group.append(line)
            elif group:
                groups.append(group)
                group = []
        if group:
            groups.append(group)
        return groups

    paragraphs = []
    final_ponct = re.compile(r"[\.\?\!…;]")
    for lst in filter_lines():
        paras = []
        para = []
        # NOTE(review): spacing reference is the page of the run's first line,
        # even when the run continues on later pages.
        page = doc.pages[lst[0].page]
        for i, line in enumerate(lst):
            para.append(line)
            if i == len(lst) - 1:
                paras.append(para)
                break
            next_line = lst[i + 1]
            line_spacing = abs((round(next_line.y0) - round(line.y1)))
            diff = line_spacing - page.most_common_line_spacing
            # [-1:] instead of [-1]: a line with empty content simply has no final punctuation
            if diff > thr and (final_ponct.match(line.content[-1:]) or line.x0 > next_line.x0 + 20):
                paras.append(para)
                para = []
        paragraphs.extend([Paragraph(p, doc) for p in paras])
    return paragraphs

`display()` ⚓︎

Source code in pdfstruct/logical_section.py

def display(self):
    """Print each collected line of the section on its own output line."""
    for entry in self.lines:
        print(entry)

`get_all_paragraphs()` ⚓︎

Source code in pdfstruct/logical_section.py

def get_all_paragraphs(self):
    """Return paragraphs and lists merged into one list, ordered by where their first line sits in ``self.lines``.

    Raises:
        ValueError: if the first line of an object is not in ``self.lines``.
    """
    position_of = self.lines.index
    return sorted(self.paragraphs + self.lists, key=lambda obj: position_of(obj.lines[0]))

`get_section_lines()` ⚓︎

Source code in pdfstruct/logical_section.py

def get_section_lines(self):
    """Return the lines belonging to this section.

    When a next section exists, lines are gathered page by page from the title
    (included) up to the next title (excluded); lines whose ``kind`` is exactly
    a ``Header`` or ``Footer`` are dropped, and a page yielding 2000 lines or
    more is skipped with a warning. For the last section, every line after the
    title, up to the end of the document, is returned (the title itself is
    excluded).

    Returns:
        A list of line objects; empty if the title is not found on its page.

    Raises:
        ValueError: if the next section's title is not found on its page.
    """
    # can't use a while loop with line.next because lines are indexed via their page, not the whole document
    # so for now, we go through the pages of the document and collect the lines in chunks
    section_page = self.doc.pages[self.page_start]

    logger.debug(f"section {self.title} starting on page {section_page.id}")

    page_lines = section_page.lines

    try:
        section_pageindex = page_lines.index(self.title)
    except ValueError:
        msg = "\n".join(str(p) for p in page_lines)
        logger.error(f"section {self.title} not found in page {self.page_start} with lines={msg}")
        return []

    if self.next_section is not None:
        start_pageid = self.page_start
        end_pageid = self.next_section.page
        logger.debug(
            f"collecting lines until next section {self.next_section} from page {start_pageid} to {end_pageid}"
        )

        lines: List[Any] = []
        for pageid in range(start_pageid, end_pageid + 1):
            logger.debug(f"collecting on page {pageid}")
            p = self.doc.pages[pageid]
            _lines = p.lines
            # Trim the end first: when both titles share a page, the start
            # index computed on the full page is still valid afterwards.
            if pageid == end_pageid:
                _lines = _lines[: p.lines.index(self.next_section)]
            if pageid == start_pageid:
                _lines = _lines[section_pageindex:]
            _lines = [_line for _line in _lines if type(_line.kind) not in {Header, Footer}]
            if len(_lines) < 2000:
                lines.extend(_lines)
            else:
                logger.warning(f"skipping page {pageid} because of too many lines {len(_lines)}")
            logger.debug(f"#lines={len(lines)} #llines={len(_lines)}")
    else:
        # it's the last section of the document
        logger.debug("collecting lines on all remaining pages")
        lines = []
        # Pages must be sliced by page index (page_start); section_pageindex is
        # the title's position among the lines of its page.
        for offset, page in enumerate(self.doc.pages[self.page_start :]):
            # On the title's own page only what follows the title belongs here;
            # earlier lines were already taken by the previous section.
            candidates = page.lines[section_pageindex + 1 :] if offset == 0 else page.lines
            lines.extend(line for line in candidates if line != self.title)
    return lines

Paragraph⚓︎

Source code in pdfstruct/paragraph.py

class Paragraph:
    """A run of consecutive lines treated as one paragraph.

    The bounding coordinates are stored as small dicts pairing a value with
    the page it was read from: ``x0``/``y0`` come from the first line,
    ``x1``/``y1`` from the last one.
    """

    def __init__(self, lst, doc):
        """Build the paragraph from its lines.

        Args:
            lst: non-empty sequence of line objects.
            doc: owning document, forwarded to ``get_words``.
        """
        first, last = lst[0], lst[-1]
        self.lines = lst
        self.x0 = {"x0": first.x0, "page": first.page}
        self.y0 = {"y0": first.y0, "page": first.page}
        self.x1 = {"x1": last.x1, "page": last.page}
        self.y1 = {"y1": last.y1, "page": last.page}
        self.content = clean_para(lst)
        self.pages = list(set(line.page for line in lst))
        # TODO : add section id
        self.words = self.get_words(doc)

    @property
    def is_normal(self):
        """A paragraph that is well formed and does not contain unusual characteristics."""
        return is_normal_para(self.lines)

    def get_words(self, doc):
        """Return one dict per word of the paragraph's lines.

        Every entry of ``raw["words"]`` (for each ``raw`` in ``line.raws``)
        whose fifth field is a substring of the line content is kept, as a
        dict with keys ``x0``, ``y0``, ``x1``, ``y1`` and ``word`` built from
        its first five fields. ``doc`` is accepted but not used.
        """
        keys = ("x0", "y0", "x1", "y1", "word")
        # TODO: check if the strings are the same (will probably need to add options to clean_para())
        return [
            dict(zip(keys, w[0:5]))
            for line in self.lines
            for raw in line.raws
            for w in raw["words"]
            if w[4] in line.content
        ]

`content = clean_para(lst)` `instance-attribute` ⚓︎

`is_normal` `property` ⚓︎

A paragraph that is well formed and does not contain unusual characteristics.

`lines = lst` `instance-attribute` ⚓︎

`pages = list(set([line.page for line in lst]))` `instance-attribute` ⚓︎

`words = self.get_words(doc)` `instance-attribute` ⚓︎

`x0 = {'x0': lst[0].x0, 'page': lst[0].page}` `instance-attribute` ⚓︎

`x1 = {'x1': lst[-1].x1, 'page': lst[-1].page}` `instance-attribute` ⚓︎

`y0 = {'y0': lst[0].y0, 'page': lst[0].page}` `instance-attribute` ⚓︎

`y1 = {'y1': lst[-1].y1, 'page': lst[-1].page}` `instance-attribute` ⚓︎

`__init__(lst, doc)` ⚓︎

Source code in pdfstruct/paragraph.py

def __init__(self, lst, doc):
    """Build the paragraph from its lines.

    ``x0``/``y0`` are read from the first line and ``x1``/``y1`` from the last
    one; each is stored as a dict pairing the value with its page.

    Args:
        lst: non-empty sequence of line objects.
        doc: owning document, forwarded to ``get_words``.
    """
    first, last = lst[0], lst[-1]
    self.lines = lst
    self.x0 = {"x0": first.x0, "page": first.page}
    self.y0 = {"y0": first.y0, "page": first.page}
    self.x1 = {"x1": last.x1, "page": last.page}
    self.y1 = {"y1": last.y1, "page": last.page}
    self.content = clean_para(lst)
    self.pages = list(set(line.page for line in lst))
    # TODO : add section id
    self.words = self.get_words(doc)

`get_words(doc)` ⚓︎

Source code in pdfstruct/paragraph.py

def get_words(self, doc):
    """Return one dict per word of the paragraph's lines.

    Every entry of ``raw["words"]`` (for each ``raw`` in ``line.raws``) whose
    fifth field is a substring of the line content is kept, as a dict with
    keys ``x0``, ``y0``, ``x1``, ``y1`` and ``word`` built from its first five
    fields. ``doc`` is accepted but not used.
    """
    keys = ("x0", "y0", "x1", "y1", "word")
    # TODO: check if the strings are the same (will probably need to add options to clean_para())
    return [
        dict(zip(keys, w[0:5]))
        for line in self.lines
        for raw in line.raws
        for w in raw["words"]
        if w[4] in line.content
    ]

TextList⚓︎

A class used to detect bullet point lists. Is still in progress.

Source code in pdfstruct/lists.py

class TextList:
    """A class used to detect bullet point lists. Is still in progress."""

    def __init__(self, doc, m, intro=None, items=None):
        """Build a list from its intro lines and its items.

        Args:
            doc: owning document; its pages are read by ``get_words``.
            m: regex match that captured the list start; must expose the named
                groups ``list_symbols`` and ``num``.
            intro: lines introducing the enumeration (``None`` means none).
            items: iterable of items, each a list of lines (``None`` means none).
        """
        # The signature advertises None defaults; normalise them so that the
        # concatenation and iteration below do not fail.
        intro = [] if intro is None else intro
        items = [] if items is None else items
        self.doc = doc
        self.intro = intro
        """The introductory text before the enumeration"""
        self.intro_content = clean_para(intro)
        self.items: List[ListItem] = [ListItem(item) for item in items]
        """The items of the list"""
        self.lines = intro + [line for item in self.items for line in item.lines]
        """All the lines of the list"""
        self.match = m
        """The regex Match object that captured the list."""
        groups = m.groupdict()
        # The bullet point symbol used to separate list items
        symbol = groups["list_symbols"]
        # The numeration used to separate list items.
        numbering = groups["num"]
        self.segmenter = symbol if symbol else numbering
        self.segmenter_type = "symbol" if symbol else "numbering"
        # NOTE(review): self.lines is computed before this filter, so it still
        # holds the lines of any item dropped here.
        self.check_items()
        self.content = self.intro_content + "\n" + "\n".join([item.content for item in self.items])
        self.words = self.get_words(self.doc)

        # TODO : add coordinates (bbox)

    def add_item(self, item):
        """Wrap ``item`` (a list of lines) in a ``ListItem`` and append it."""
        item = ListItem(item)
        self.items.append(item)

    def add_items(self, items):
        """Append several items, then re-run ``check_items``.

        NOTE(review): ``lines``, ``content`` and ``words`` are not recomputed
        here, so they do not reflect the added items.
        """
        for item in items:
            self.add_item(item)
        self.check_items()

    def remove_item(self, item):
        """Remove ``item`` from the list and return the remaining items."""
        self.items.remove(item)
        return self.items

    def check_items(self):
        """Drop every item whose first character does not match the segmenter.

        The item list is filtered in one pass and updated in place: removing
        elements while iterating over the same list skips the element that
        follows each removal. The segmenter is text captured from the
        document, so it is escaped before being used as a pattern (a "*" or
        "(" bullet would otherwise be an invalid regex). Items with empty
        content are dropped instead of raising IndexError.
        """
        starts_with_segmenter = re.compile(re.escape(self.segmenter)).match
        self.items[:] = [item for item in self.items if starts_with_segmenter(item.content[:1])]

    @property
    def is_normal(self):
        """A list that is well formed and does not contain unusual characteristics."""
        for line in self.intro:
            if re.match(pats["list_start"], line.content):
                return False
        if not self.items or len(self.lines) > 25 or self.content[0].islower():
            return False
        return True

    def get_words(self, doc):
        """Return one dict per word belonging to the list's lines.

        A word of the line's page is kept when its sixth and seventh fields
        equal the line's ``original_block_id`` / ``original_line_id`` and its
        fifth field is a substring of the line content. Each result has keys
        ``x0``, ``y0``, ``x1``, ``y1`` and ``word``.
        """
        keys = ("x0", "y0", "x1", "y1", "word")
        # TODO: check if the strings are the same (will probably need to add options to clean_para())
        return [
            dict(zip(keys, w[0:5]))
            for line in self.lines
            for w in doc.pages[line.page].words
            if w[5] == line.raw["original_block_id"]
            and w[6] == line.raw["original_line_id"]
            and w[4] in line.content
        ]

`content = self.intro_content + '\n' + '\n'.join([item.content for item in self.items])` `instance-attribute` ⚓︎

`doc = doc` `instance-attribute` ⚓︎

`intro = intro` `instance-attribute` ⚓︎

The introductory text before the enumeration

`intro_content = clean_para(intro)` `instance-attribute` ⚓︎

`is_normal` `property` ⚓︎

A list that is well formed and does not contain unusual characteristics.

`items: List[ListItem] = [ListItem(item) for item in items]` `instance-attribute` ⚓︎

The items of the list

`lines = intro + [line for item in self.items for line in item.lines]` `instance-attribute` ⚓︎

All the lines of the list

`match = m` `instance-attribute` ⚓︎

The regex Match object that captured the list.

`segmenter = symbol if symbol else numbering` `instance-attribute` ⚓︎

`segmenter_type = 'symbol' if symbol else 'numbering'` `instance-attribute` ⚓︎

`words = self.get_words(self.doc)` `instance-attribute` ⚓︎

`__init__(doc, m, intro=None, items=None)` ⚓︎

Source code in pdfstruct/lists.py

def __init__(self, doc, m, intro=None, items=None):
    """Build a list from its intro lines and its items.

    Args:
        doc: owning document; its pages are read by ``get_words``.
        m: regex match that captured the list start; must expose the named
            groups ``list_symbols`` and ``num``.
        intro: lines introducing the enumeration (``None`` means none).
        items: iterable of items, each a list of lines (``None`` means none).
    """
    # The signature advertises None defaults; normalise them so that the
    # concatenation and iteration below do not fail.
    intro = [] if intro is None else intro
    items = [] if items is None else items
    self.doc = doc
    self.intro = intro
    """The introductory text before the enumeration"""
    self.intro_content = clean_para(intro)
    self.items: List[ListItem] = [ListItem(item) for item in items]
    """The items of the list"""
    self.lines = intro + [line for item in self.items for line in item.lines]
    """All the lines of the list"""
    self.match = m
    """The regex Match object that captured the list."""
    groups = m.groupdict()
    # The bullet point symbol used to separate list items
    symbol = groups["list_symbols"]
    # The numeration used to separate list items.
    numbering = groups["num"]
    self.segmenter = symbol if symbol else numbering
    self.segmenter_type = "symbol" if symbol else "numbering"
    # NOTE(review): self.lines is computed before this filter, so it still
    # holds the lines of any item dropped here.
    self.check_items()
    self.content = self.intro_content + "\n" + "\n".join([item.content for item in self.items])
    self.words = self.get_words(self.doc)

`add_item(item)` ⚓︎

Source code in pdfstruct/lists.py

def add_item(self, item):
    """Wrap ``item`` (a list of lines) in a ``ListItem`` and append it to ``self.items``."""
    wrapped = ListItem(item)
    self.items.append(wrapped)

`add_items(items)` ⚓︎

Source code in pdfstruct/lists.py

def add_items(self, items):
    """Add every element of ``items`` through ``add_item``, then run ``check_items`` once."""
    for entry in items:
        self.add_item(entry)
    self.check_items()

`check_items()` ⚓︎

Source code in pdfstruct/lists.py

def check_items(self):
    """Drop every item whose first character does not match the segmenter.

    The item list is filtered in one pass and updated in place: removing
    elements while iterating over the same list skips the element that follows
    each removal. The segmenter is text captured from the document, so it is
    escaped before being used as a pattern (a "*" or "(" bullet would
    otherwise be an invalid regex). Items with empty content are dropped
    instead of raising IndexError.
    """
    starts_with_segmenter = re.compile(re.escape(self.segmenter)).match
    self.items[:] = [item for item in self.items if starts_with_segmenter(item.content[:1])]

`get_words(doc)` ⚓︎

Source code in pdfstruct/lists.py

def get_words(self, doc):
    """Return one dict per word belonging to the list's lines.

    A word of the line's page is kept when its sixth and seventh fields equal
    the line's ``original_block_id`` / ``original_line_id`` and its fifth
    field is a substring of the line content. Each result has keys ``x0``,
    ``y0``, ``x1``, ``y1`` and ``word``.
    """
    keys = ("x0", "y0", "x1", "y1", "word")
    # TODO: check if the strings are the same (will probably need to add options to clean_para())
    return [
        dict(zip(keys, w[0:5]))
        for line in self.lines
        for w in doc.pages[line.page].words
        if w[5] == line.raw["original_block_id"]
        and w[6] == line.raw["original_line_id"]
        and w[4] in line.content
    ]

`remove_item(item)` ⚓︎

Source code in pdfstruct/lists.py

def remove_item(self, item):
    """Remove the first occurrence of ``item`` from ``self.items`` and return that list.

    Raises:
        ValueError: if ``item`` is not in the list.
    """
    remaining = self.items
    remaining.remove(item)
    return remaining

Bases: TextList

A class to store list items and their clean textual content.

Source code in pdfstruct/lists.py

class ListItem(TextList):
    """A class to store list items and their clean textual content."""

    def __init__(self, item):
        """Store the item's lines and their cleaned text.

        The parent constructor is not called; only the attributes below are set.

        Args:
            item: the lines making up this list item.
        """
        # Neighbour links start unset.
        self.prev = self.next = None
        self.lines: List[Any] = item
        self.content: str = clean_para(item)

`content: str = clean_para(self.lines)` `instance-attribute` ⚓︎

`lines: List[Any] = item` `instance-attribute` ⚓︎

`next = None` `instance-attribute` ⚓︎

`prev = None` `instance-attribute` ⚓︎

`__init__(item)` ⚓︎

Source code in pdfstruct/lists.py

def __init__(self, item):
    """Store the item's lines and their cleaned text.

    Args:
        item: the lines making up this list item.
    """
    # Neighbour links start unset.
    self.prev = self.next = None
    self.lines: List[Any] = item
    self.content: str = clean_para(item)

Logical Sections and Paragraph Extraction (in progress)⚓︎

Logical Section⚓︎

doc = doc instance-attribute ⚓︎

id = sections.index(title) instance-attribute ⚓︎

lines = self.get_section_lines() instance-attribute ⚓︎

lists = self.detect_lists() instance-attribute ⚓︎

next_section = sections[self.id + 1] if self.id < len(sections) - 1 else None instance-attribute ⚓︎

page_end = self.lines[-1].page instance-attribute ⚓︎

page_start = title.page instance-attribute ⚓︎

pages = sorted(list(set([line.page for line in self.lines]))) instance-attribute ⚓︎

paragraphs = self.detect_paragraphs() instance-attribute ⚓︎

title = title instance-attribute ⚓︎

__init__(doc, title, sections) ⚓︎

detect_lists(thr=3) ⚓︎

detect_paragraphs(thr=3) ⚓︎

display() ⚓︎

get_all_paragraphs() ⚓︎

get_section_lines() ⚓︎

Paragraph⚓︎

content = clean_para(lst) instance-attribute ⚓︎

is_normal property ⚓︎

lines = lst instance-attribute ⚓︎

pages = list(set([line.page for line in lst])) instance-attribute ⚓︎

words = self.get_words(doc) instance-attribute ⚓︎

x0 = {'x0': lst[0].x0, 'page': lst[0].page} instance-attribute ⚓︎

x1 = {'x1': lst[-1].x1, 'page': lst[-1].page} instance-attribute ⚓︎

y0 = {'y0': lst[0].y0, 'page': lst[0].page} instance-attribute ⚓︎

y1 = {'y1': lst[-1].y1, 'page': lst[-1].page} instance-attribute ⚓︎

__init__(lst, doc) ⚓︎

get_words(doc) ⚓︎

TextList⚓︎

content = self.intro_content + '\n' + '\n'.join([item.content for item in self.items]) instance-attribute ⚓︎

doc = doc instance-attribute ⚓︎

intro = intro instance-attribute ⚓︎

intro_content = clean_para(intro) instance-attribute ⚓︎

is_normal property ⚓︎

items: List[ListItem] = [ListItem(item) for item in items] instance-attribute ⚓︎

lines = intro + [line for item in self.items for line in item.lines] instance-attribute ⚓︎

match = m instance-attribute ⚓︎

segmenter = symbol if symbol else numbering instance-attribute ⚓︎

segmenter_type = 'symbol' if symbol else 'numbering' instance-attribute ⚓︎

words = self.get_words(self.doc) instance-attribute ⚓︎

__init__(doc, m, intro=None, items=None) ⚓︎

add_item(item) ⚓︎

add_items(items) ⚓︎

check_items() ⚓︎

get_words(doc) ⚓︎

remove_item(item) ⚓︎

content: str = clean_para(self.lines) instance-attribute ⚓︎

lines: List[Any] = item instance-attribute ⚓︎

next = None instance-attribute ⚓︎

prev = None instance-attribute ⚓︎

__init__(item) ⚓︎

`doc = doc` `instance-attribute` ⚓︎

`id = sections.index(title)` `instance-attribute` ⚓︎

`lines = self.get_section_lines()` `instance-attribute` ⚓︎

`lists = self.detect_lists()` `instance-attribute` ⚓︎

`next_section = sections[self.id + 1] if self.id < len(sections) - 1 else None` `instance-attribute` ⚓︎

`page_end = self.lines[-1].page` `instance-attribute` ⚓︎

`page_start = title.page` `instance-attribute` ⚓︎

`pages = sorted(list(set([line.page for line in self.lines])))` `instance-attribute` ⚓︎

`paragraphs = self.detect_paragraphs()` `instance-attribute` ⚓︎

`title = title` `instance-attribute` ⚓︎

`__init__(doc, title, sections)` ⚓︎

`detect_lists(thr=3)` ⚓︎

`detect_paragraphs(thr=3)` ⚓︎

`display()` ⚓︎

`get_all_paragraphs()` ⚓︎

`get_section_lines()` ⚓︎

`content = clean_para(lst)` `instance-attribute` ⚓︎

`is_normal` `property` ⚓︎

`lines = lst` `instance-attribute` ⚓︎

`pages = list(set([line.page for line in lst]))` `instance-attribute` ⚓︎

`words = self.get_words(doc)` `instance-attribute` ⚓︎

`x0 = {'x0': lst[0].x0, 'page': lst[0].page}` `instance-attribute` ⚓︎

`x1 = {'x1': lst[-1].x1, 'page': lst[-1].page}` `instance-attribute` ⚓︎

`y0 = {'y0': lst[0].y0, 'page': lst[0].page}` `instance-attribute` ⚓︎

`y1 = {'y1': lst[-1].y1, 'page': lst[-1].page}` `instance-attribute` ⚓︎

`__init__(lst, doc)` ⚓︎

`get_words(doc)` ⚓︎

`content = self.intro_content + '\n' + '\n'.join([item.content for item in self.items])` `instance-attribute` ⚓︎

`doc = doc` `instance-attribute` ⚓︎

`intro = intro` `instance-attribute` ⚓︎

`intro_content = clean_para(intro)` `instance-attribute` ⚓︎

`is_normal` `property` ⚓︎

`items: List[ListItem] = [ListItem(item) for item in items]` `instance-attribute` ⚓︎

`lines = intro + [line for item in self.items for line in item.lines]` `instance-attribute` ⚓︎

`match = m` `instance-attribute` ⚓︎

`segmenter = symbol if symbol else numbering` `instance-attribute` ⚓︎

`segmenter_type = 'symbol' if symbol else 'numbering'` `instance-attribute` ⚓︎

`words = self.get_words(self.doc)` `instance-attribute` ⚓︎

`__init__(doc, m, intro=None, items=None)` ⚓︎

`add_item(item)` ⚓︎

`add_items(items)` ⚓︎

`check_items()` ⚓︎

`get_words(doc)` ⚓︎

`remove_item(item)` ⚓︎

`content: str = clean_para(self.lines)` `instance-attribute` ⚓︎

`lines: List[Any] = item` `instance-attribute` ⚓︎

`next = None` `instance-attribute` ⚓︎

`prev = None` `instance-attribute` ⚓︎

`__init__(item)` ⚓︎