class Corpus:
    """Apply Pdfstruct on a large corpus of files and extract statistics on the
    detected elements: sections, styles, models...

    Attributes:
        directory: source directory scanned for ``*.pdf`` files.
        destdir: destination directory for the produced statistics files.
        files: list of ``Collection`` objects, one per PDF.
        statdict: frequencies of the detected pdf elements.
    """

    def __init__(self, directory, files=None, destdir=None, stats=None):
        # The source directory.
        self.directory = directory
        # The destination of the produced statistics file (defaults to source).
        self.destdir = destdir if destdir else directory
        # One Collection per PDF found in `directory`.
        # NOTE(review): recursive=True has no effect without "**" in the
        # pattern, so this scan is non-recursive — confirm intent before
        # changing the pattern.
        self.files = (
            files
            if files is not None
            else [Collection.from_pdf(file) for file in glob.glob(self.directory + "/*.pdf", recursive=True)]
        )
        # A dictionary containing the frequencies of the pdf elements, either
        # rebuilt from precomputed per-document stats or computed from files.
        self.statdict = self.from_stats(stats) if stats else self.get_statdict()

    @staticmethod
    def _empty_statdict() -> Dict[str, Any]:
        """Return a fresh accumulator for the element statistics."""
        return {
            "TOC Models": {},
            "Section Models": {},
            "Caption TOC Models": {},
            "Caption Models": {},
            "Section Titles": [],
            "Section Styles": [],
        }

    @staticmethod
    def _add_frequencies(counter: Dict[str, int], pairs) -> None:
        """Accumulate (model, frequency) pairs into `counter`.

        Frequencies may arrive as strings (e.g. read back from a file), so
        they are coerced with int().
        """
        for model, frequency in pairs:
            counter[model] = counter.get(model, 0) + int(frequency)

    @staticmethod
    def _finalize(data: Dict[str, Any]) -> Dict[str, Any]:
        """Collapse the raw title/style lists into (value, count) pairs sorted
        by decreasing count, keeping only values seen more than once.

        Style dicts are reduced to the tuple of their values so they become
        hashable and countable.
        """
        titles = Counter(data["Section Titles"]).most_common()
        data["Section Titles"] = [t for t in titles if t[1] > 1]
        styles = Counter(tuple(dic.values()) for dic in data["Section Styles"]).most_common()
        data["Section Styles"] = [t for t in styles if t[1] > 1]
        return data

    def from_stats(self, stats):
        """Merge precomputed per-document stat dictionaries into one statdict.

        Args:
            stats: iterable of dicts shaped like `statdict`; falsy entries
                (e.g. None placeholders for failed documents) are skipped.

        Returns:
            The merged and finalized statistics dictionary.
        """
        data = self._empty_statdict()
        for stat in stats:
            if not stat:
                continue
            for key, value in stat.items():
                if isinstance(value, list):
                    # Titles / styles: concatenate, they are counted later.
                    data[key].extend(value)
                else:
                    # Model frequency dicts: sum per model.
                    self._add_frequencies(data[key], value.items())
        return self._finalize(data)

    def get_statdict(self):
        """Compute the statdict by walking every document of every collection."""
        data = self._empty_statdict()
        for col in self.files:
            for doc in col.docs:
                sections = list(doc.sections)
                data["Section Titles"].extend(section.kind.title for section in sections)
                data["Section Styles"].extend(section.most_common_styles for section in sections)
                models = [
                    ("TOC Models", doc.tdm_models()),
                    ("Section Models", doc.section_models()),
                    ("Caption TOC Models", doc.other_index_models()),
                    ("Caption Models", doc.other_models()),
                ]
                for key, pairs in models:
                    self._add_frequencies(data[key], pairs)
        return self._finalize(data)

    def make_df(self, key):
        """Return a two-column (key, "Frequency") DataFrame for one statdict
        entry; dict entries become (key, value) rows, list entries are used
        as records directly."""
        value = self.statdict[key]
        records = list(value.items()) if isinstance(value, dict) else value
        return pd.DataFrame.from_records(records, columns=[key, "Frequency"])

    def get_stats(self):
        """Write every statdict entry to its own sheet of
        <destdir>/stats/stats.xlsx, creating the directory if needed."""
        dest = self.destdir + "/stats"
        # exist_ok avoids the exists()/makedirs() race of the original.
        os.makedirs(dest, exist_ok=True)
        with pd.ExcelWriter(f"{dest}/stats.xlsx") as writer:
            for stat in self.statdict:
                self.make_df(stat).to_excel(writer, sheet_name=stat)

    def make_global_df(self, marker):
        """Returns a bigger df with each row being a detected marker (section,
        caption...). It is more complete than the statdict because it contains
        more info (style and metadata).
        The available choices are "sections", "tdm", "captions" and
        "caption_tdm".

        Raises:
            ValueError: if marker is "cells", which is not a marker property.
        """
        if marker == "cells":
            raise ValueError("'cells' is not a valid property")
        rows = []
        for col in self.files:
            for doc in col.docs:
                meta = doc.metadata
                for item in getattr(doc, marker):
                    styles = item.most_common_styles
                    rows.append([
                        doc.filename,
                        item.kind.title,
                        item.kind.model,
                        item.kind.level,
                        styles["font"],
                        styles["size"],
                        styles["flags"],
                        styles["color"],
                        meta["title"],
                        meta["format"],
                        meta["author"],
                        meta["creator"],
                        meta["producer"],
                    ])
        columns = [
            "filename",
            "text",
            "model",
            "level",
            "font",
            "size",
            "flags",
            "color",
            "pdf_title",
            "format",
            "author",
            "creator",
            "producer",
        ]
        return pd.DataFrame(rows, columns=columns)

    @staticmethod
    def get_grouped_frequencies(columns, df):
        """Returns an aggregated df with frequencies with the chosen features
        (model, font, size...). Features must be a list of strings.
        It is meant to be used after make_global_df().

        Marked @staticmethod: the original had no `self`, so an instance call
        would have bound the instance to `columns`; class-level calls are
        unaffected.
        """
        return df.groupby(columns).size().reset_index(name="frequency").sort_values(["frequency"], ascending=False)