class Corpus:
    """Apply Pdfstruct on a large corpus of files and extract statistics on the
    detected elements: sections, styles, models...

    Attributes:
        directory: source directory scanned for ``*.pdf`` files.
        destdir: destination directory for the produced statistics files.
        files: list of ``Collection`` objects, one per PDF.
        statdict: frequencies of the detected pdf elements.
    """

    def __init__(self, directory, files=None, destdir=None, stats=None):
        # The source directory.
        self.directory = directory
        # The destination of the produced statistics file (defaults to source).
        self.destdir = destdir if destdir else directory
        # One Collection per PDF found in `directory`.
        # NOTE(review): recursive=True has no effect without "**" in the
        # pattern, so this scan is non-recursive — confirm intent before
        # changing the pattern.
        self.files = (
            files
            if files is not None
            else [Collection.from_pdf(file) for file in glob.glob(self.directory + "/*.pdf", recursive=True)]
        )
        # A dictionary containing the frequencies of the pdf elements, either
        # rebuilt from precomputed per-document stats or computed from files.
        self.statdict = self.from_stats(stats) if stats else self.get_statdict()

    @staticmethod
    def _empty_statdict() -> Dict[str, Any]:
        """Return a fresh accumulator for the element statistics."""
        return {
            "TOC Models": {},
            "Section Models": {},
            "Caption TOC Models": {},
            "Caption Models": {},
            "Section Titles": [],
            "Section Styles": [],
        }

    @staticmethod
    def _add_frequencies(counter: Dict[str, int], pairs) -> None:
        """Accumulate (model, frequency) pairs into `counter`.

        Frequencies may arrive as strings (e.g. read back from a file), so
        they are coerced with int().
        """
        for model, frequency in pairs:
            counter[model] = counter.get(model, 0) + int(frequency)

    @staticmethod
    def _finalize(data: Dict[str, Any]) -> Dict[str, Any]:
        """Collapse the raw title/style lists into (value, count) pairs sorted
        by decreasing count, keeping only values seen more than once.

        Style dicts are reduced to the tuple of their values so they become
        hashable and countable.
        """
        titles = Counter(data["Section Titles"]).most_common()
        data["Section Titles"] = [t for t in titles if t[1] > 1]
        styles = Counter(tuple(dic.values()) for dic in data["Section Styles"]).most_common()
        data["Section Styles"] = [t for t in styles if t[1] > 1]
        return data

    def from_stats(self, stats):
        """Merge precomputed per-document stat dictionaries into one statdict.

        Args:
            stats: iterable of dicts shaped like `statdict`; falsy entries
                (e.g. None placeholders for failed documents) are skipped.

        Returns:
            The merged and finalized statistics dictionary.
        """
        data = self._empty_statdict()
        for stat in stats:
            if not stat:
                continue
            for key, value in stat.items():
                if isinstance(value, list):
                    # Titles / styles: concatenate, they are counted later.
                    data[key].extend(value)
                else:
                    # Model frequency dicts: sum per model.
                    self._add_frequencies(data[key], value.items())
        return self._finalize(data)

    def get_statdict(self):
        """Compute the statdict by walking every document of every collection."""
        data = self._empty_statdict()
        for col in self.files:
            for doc in col.docs:
                sections = list(doc.sections)
                data["Section Titles"].extend(section.kind.title for section in sections)
                data["Section Styles"].extend(section.most_common_styles for section in sections)
                models = [
                    ("TOC Models", doc.tdm_models()),
                    ("Section Models", doc.section_models()),
                    ("Caption TOC Models", doc.other_index_models()),
                    ("Caption Models", doc.other_models()),
                ]
                for key, pairs in models:
                    self._add_frequencies(data[key], pairs)
        return self._finalize(data)

    def make_df(self, key):
        """Return a two-column (key, "Frequency") DataFrame for one statdict
        entry; dict entries become (key, value) rows, list entries are used
        as records directly."""
        value = self.statdict[key]
        records = list(value.items()) if isinstance(value, dict) else value
        return pd.DataFrame.from_records(records, columns=[key, "Frequency"])

    def get_stats(self):
        """Write every statdict entry to its own sheet of
        <destdir>/stats/stats.xlsx, creating the directory if needed."""
        dest = self.destdir + "/stats"
        # exist_ok avoids the exists()/makedirs() race of the original.
        os.makedirs(dest, exist_ok=True)
        with pd.ExcelWriter(f"{dest}/stats.xlsx") as writer:
            for stat in self.statdict:
                self.make_df(stat).to_excel(writer, sheet_name=stat)

    def make_global_df(self, marker):
        """Returns a bigger df with each row being a detected marker (section,
        caption...). It is more complete than the statdict because it contains
        more info (style and metadata).
        The available choices are "sections", "tdm", "captions" and
        "caption_tdm".

        Raises:
            ValueError: if marker is "cells", which is not a marker property.
        """
        if marker == "cells":
            raise ValueError("'cells' is not a valid property")
        rows = []
        for col in self.files:
            for doc in col.docs:
                meta = doc.metadata
                for item in getattr(doc, marker):
                    styles = item.most_common_styles
                    rows.append([
                        doc.filename,
                        item.kind.title,
                        item.kind.model,
                        item.kind.level,
                        styles["font"],
                        styles["size"],
                        styles["flags"],
                        styles["color"],
                        meta["title"],
                        meta["format"],
                        meta["author"],
                        meta["creator"],
                        meta["producer"],
                    ])
        columns = [
            "filename",
            "text",
            "model",
            "level",
            "font",
            "size",
            "flags",
            "color",
            "pdf_title",
            "format",
            "author",
            "creator",
            "producer",
        ]
        return pd.DataFrame(rows, columns=columns)

    @staticmethod
    def get_grouped_frequencies(columns, df):
        """Returns an aggregated df with frequencies with the chosen features
        (model, font, size...). Features must be a list of strings.
        It is meant to be used after make_global_df().

        Marked @staticmethod: the original had no `self`, so an instance call
        would have bound the instance to `columns`; class-level calls are
        unaffected.
        """
        return df.groupby(columns).size().reset_index(name="frequency").sort_values(["frequency"], ascending=False)