Skip to content

Corpus

A class that applies Pdfstruct to a large corpus of files and extracts statistics on the detected elements: sections, styles, models...

Source code in pdfstruct/corpus.py
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
class Corpus:
    """Apply Pdfstruct to a large corpus of files and extract statistics on the detected elements:
    sections, styles, models..."""

    def __init__(self, directory, files=None, destdir=None, stats=None):
        """Build the corpus and compute its statistics dictionary.

        Args:
            directory: The source directory holding the PDF files.
            files: Already-built collections. When ``None``, one collection is created with
                ``Collection.from_pdf`` for each ``*.pdf`` found in ``directory``.
            destdir: Directory used by ``get_stats``; falls back to ``directory`` when falsy.
            stats: Optional iterable of statistics dictionaries. When truthy, it is merged
                with ``from_stats`` instead of analysing ``files``.
        """
        self.directory = directory
        """The source directory"""
        self.destdir = destdir if destdir else directory
        """The destination of the produced statistics file."""

        # NOTE: ``recursive=True`` only changes how a "**" component is expanded; with this
        # pattern only the PDFs located directly inside ``directory`` are matched.
        self.files = (
            files
            if files is not None
            else [Collection.from_pdf(file) for file in glob.glob(self.directory + "/*.pdf", recursive=True)]
        )
        if stats:
            self.statdict = self.from_stats(stats)
        else:
            self.statdict = self.get_statdict()
            """A dictionary containing the frequencies of the pdf elements"""

    @staticmethod
    def _empty_statdict():
        """Return a fresh accumulator holding one empty entry per reported statistic."""
        data: Dict[str, Any] = {
            "TOC Models": {},
            "Section Models": {},
            "Caption TOC Models": {},
            "Caption Models": {},
            "Section Titles": [],
            "Section Styles": [],
        }
        return data

    @staticmethod
    def _keep_repeated(data):
        """Turn the raw title and style lists into ``(value, count)`` pairs.

        Pairs are ordered from most to least frequent and values seen only once are dropped.
        Styles are dictionaries, so their values are converted to tuples to make them countable.
        """
        titles = Counter(data["Section Titles"]).most_common()
        data["Section Titles"] = [entry for entry in titles if entry[1] > 1]
        styles = Counter([tuple(dic.values()) for dic in data["Section Styles"]]).most_common()
        data["Section Styles"] = [entry for entry in styles if entry[1] > 1]
        return data

    def from_stats(self, stats):
        """Merge several statistics dictionaries into a single one.

        Args:
            stats: Iterable of dictionaries keyed like the accumulator (``"TOC Models"``,
                ``"Section Titles"``...). Falsy entries are skipped. List values are
                concatenated; mapping values are summed per model.

        Returns:
            The merged dictionary, with titles and styles reduced to repeated values.

        Raises:
            KeyError: If an entry uses a key that is not part of the accumulator.
        """
        data = self._empty_statdict()
        for stat in stats:
            if not stat:
                continue
            for key, value in stat.items():
                if isinstance(value, list):
                    data[key].extend(value)
                    continue
                for model, frequency in value.items():
                    if model not in data[key]:
                        data[key][model] = int(frequency)
                    else:
                        data[key][model] += int(frequency)
        return self._keep_repeated(data)

    def get_statdict(self):
        """Compute the statistics dictionary from every document of ``self.files``.

        Returns:
            A dictionary with the summed model frequencies and the repeated section
            titles and styles.
        """
        data = self._empty_statdict()
        for col in self.files:
            for doc in col.docs:
                models = [
                    ("TOC Models", doc.tdm_models()),
                    ("Section Models", doc.section_models()),
                    ("Caption TOC Models", doc.other_index_models()),
                    ("Caption Models", doc.other_models()),
                ]
                # Materialise once: the sections are walked twice below.
                sections = list(doc.sections)
                data["Section Titles"].extend(section.kind.title for section in sections)
                data["Section Styles"].extend(section.most_common_styles for section in sections)
                for key, pairs in models:
                    for model, frequency in pairs:
                        if model not in data[key]:
                            data[key][model] = int(frequency)
                        else:
                            data[key][model] += int(frequency)
        return self._keep_repeated(data)

    def make_df(self, key):
        """Return the statistic stored under ``key`` as a two-column DataFrame.

        The first column is named after ``key`` and the second one is ``"Frequency"``.
        """
        entry = self.statdict[key]
        records = list(entry.items()) if isinstance(entry, dict) else entry
        return pd.DataFrame.from_records(records, columns=[key, "Frequency"])

    def get_stats(self):
        """Write every statistic to ``<destdir>/stats/stats.xlsx``, one sheet per statistic."""
        dest = self.destdir + "/stats"
        # exist_ok avoids the check-then-create race of a separate os.path.exists() test.
        os.makedirs(dest, exist_ok=True)
        with pd.ExcelWriter(f"{dest}/stats.xlsx") as writer:
            for stat in self.statdict:
                self.make_df(stat).to_excel(writer, sheet_name=stat)

    def make_global_df(self, marker):
        """Returns a bigger df with each row being a detected marker (section, caption...).
        It is more complete than the statdict because it contains more info (style and metadata).
        The available choices are "sections", "tdm", "captions" and "caption_tdm".

        Args:
            marker: Name of the document attribute to iterate over.

        Raises:
            Exception: If ``marker`` is ``"cells"``.
        """
        if marker == "cells":
            raise Exception("'cells' is not a valid property")
        columns = [
            "filename",
            "text",
            "model",
            "level",
            "font",
            "size",
            "flags",
            "color",
            "pdf_title",
            "format",
            "author",
            "creator",
            "producer",
        ]
        rows = []
        for col in self.files:
            for doc in col.docs:
                meta = doc.metadata
                filename = doc.filename
                for item in getattr(doc, marker):
                    styles = item.most_common_styles
                    rows.append(
                        [
                            filename,
                            item.kind.title,
                            item.kind.model,
                            item.kind.level,
                            styles["font"],
                            styles["size"],
                            styles["flags"],
                            styles["color"],
                            meta["title"],
                            meta["format"],
                            meta["author"],
                            meta["creator"],
                            meta["producer"],
                        ]
                    )
        return pd.DataFrame(rows, columns=columns)

    @staticmethod
    def get_grouped_frequencies(columns, df):
        """Returns an aggregated df with frequencies with the chosen features (model, font, size... ).
        ``columns`` must be a list of strings.
        It is meant to be used after make_global_df().

        Declared as a static method so that it can be called on the class as well as on an instance.
        """
        return df.groupby(columns).size().reset_index(name="frequency").sort_values(["frequency"], ascending=False)

destdir = destdir if destdir else directory instance-attribute ⚓︎

The destination of the produced statistics file.

directory = directory instance-attribute ⚓︎

The source directory

get_grouped_frequencies(columns, df) ⚓︎

Returns an aggregated df of frequencies grouped by the chosen features (model, font, size...). The `columns` argument must be a list of strings naming those features. It is meant to be used on the df returned by make_global_df().

Source code in pdfstruct/corpus.py
169
170
171
172
173
def get_grouped_frequencies(columns, df):
    """Count the rows of ``df`` for each combination of the chosen features (model, font, size...).

    ``columns`` must be a list of strings. The result has one row per combination, a
    "frequency" column holding the count, and is sorted by decreasing frequency.
    It is meant to be used after make_global_df().
    """
    group_sizes = df.groupby(columns).size()
    frequency_table = group_sizes.reset_index(name="frequency")
    return frequency_table.sort_values(["frequency"], ascending=False)

make_global_df(marker) ⚓︎

Returns a bigger df in which each row is a detected marker (section, caption...). It is more complete than the statdict because it contains more info (style and metadata). The available choices are "sections", "tdm", "captions" and "caption_tdm".

Source code in pdfstruct/corpus.py
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
def make_global_df(self, marker):
    """Build a df holding one row per detected marker (section, caption...).

    Each row combines the marker's text, model and level with its most common style
    (font, size, flags, color) and the metadata of the document it comes from, which
    makes it more complete than the statdict.
    The available choices are "sections", "tdm", "captions" and "caption_tdm".
    """
    if marker == "cells":
        raise Exception("'cells' is not a valid property")

    header = [
        "filename",
        "text",
        "model",
        "level",
        "font",
        "size",
        "flags",
        "color",
        "pdf_title",
        "format",
        "author",
        "creator",
        "producer",
    ]
    style_keys = ("font", "size", "flags", "color")
    meta_keys = ("title", "format", "author", "creator", "producer")

    records = []
    for collection in self.files:
        for document in collection.docs:
            info = document.metadata
            source_name = document.filename
            for entry in getattr(document, marker):
                # Same left-to-right order as the header above.
                record = [source_name, entry.kind.title, entry.kind.model, entry.kind.level]
                style = entry.most_common_styles
                record.extend(style[name] for name in style_keys)
                record.extend(info[name] for name in meta_keys)
                records.append(record)

    return pd.DataFrame(records, columns=header)