Code
# install necessary packages
import requests
#import zipfile
import json
import os
from collections import Counter
import yaml
import seaborn as sns
import matplotlib.pyplot as plt
from pprint import pprint
#from urllib import request
This page offers an overview of the latest content of the HTR-United catalog. The visualizations are updated regularly. Feel free to check that the HTR-United catalog version listed below corresponds to the latest version available for the catalog (here).
HTR-United is a catalog that lists highly documented training datasets used for automatic transcription or segmentation models. HTR-United standardizes dataset descriptions using a schema, offers guidelines for organizing data repositories, and provides tools for quality control and continuous documentation. It’s an open and transparent ecosystem hosted on GitHub, designed for easy maintenance. HTR-United was created to help projects quickly access diverse ground truth data for training models on smaller collections.
This page is only dedicated to a generic oversight of the content of the catalog, mainly in the form of plots. If you want to browse the datasets listed in the catalog, there is a more suitable interface for that here.
# install necessary packages
import requests
#import zipfile
import json
import os
from collections import Counter
import yaml
import seaborn as sns
import matplotlib.pyplot as plt
from pprint import pprint
#from urllib import request
# function to build the bar plots
def make_bar_plot(counted_data, title, xlabel):
    """Draw a sorted bar chart from a Counter-like mapping.

    Parameters:
        counted_data: mapping of label -> count (e.g. a collections.Counter).
        title: title displayed above the chart.
        xlabel: label for the x axis (the y axis is always 'Counts').

    Side effect: renders the chart with matplotlib (plt.show()).
    """
    # Sort labels by descending count so the tallest bars come first
    sorted_data = sorted(counted_data.items(), key=lambda x: x[1], reverse=True)
    labels = [item[0] for item in sorted_data]
    counts = [item[1] for item in sorted_data]

    # Create a bar plot using Seaborn (hue=labels gives one color per bar)
    sns.barplot(x=labels, y=counts, hue=labels)

    # Add labels and rotation for better visibility
    plt.xlabel(xlabel)
    plt.ylabel('Counts')
    plt.xticks(rotation=45, ha='right')

    # Add a title
    plt.title(title)

    # Ensures labels are not cut off
    plt.tight_layout()
    plt.show()
# function to change how the lists are shown
def pretty_list_of_projects(most_common_projects):
    """Format (project, count) pairs as an indented, human-readable list.

    Parameters:
        most_common_projects: iterable of (name, count) tuples, typically
            the result of Counter.most_common().

    Returns:
        One tab-indented line per project ("\tName - N datasets."),
        joined by newlines, without a trailing newline. Empty input
        yields an empty string.
    """
    lines = [
        f"\t{name.strip()} - {count} datasets."
        for name, count in most_common_projects
    ]
    return "\n".join(lines)
# fetch latest version of the catalog
url_latest = "https://api.github.com/repos/htr-united/htr-united/releases/latest"
# timeout prevents the notebook from hanging forever on network issues
r = requests.get(url_latest, timeout=30)
if r.status_code == 200:
    # html_url looks like https://github.com/HTR-United/htr-united/releases/tag/vX.Y.Z
    github_url = r.json().get("html_url", "")
    # the tag (last URL segment) is the catalog version, e.g. "v0.1.170"
    htr_united_version = github_url.split("/")[-1]
    if github_url:
        # let's build the url to the latest yaml file (raw.githubusercontent serves the file content)
        github_url = github_url.replace("/releases/tag/", "/").replace("/github.com/", "/raw.githubusercontent.com/") + "/htr-united.yml"
        r_yml = requests.get(github_url, timeout=30)
        if r_yml.status_code == 200:
            # now let's download the yaml file
            with open("htr-united.yml", "w", encoding="utf8") as fh:
                fh.write(r_yml.text)
            print("We are currently computing \nthe content of HTR-United's \ncatalog", htr_united_version)
        else:
            print("Couldn't connect to", github_url, "got status code", r_yml.status_code)
else:
    print("Couldn't connect to", url_latest, "got status code", r.status_code)
# Load the YAML catalog into a Python object (list of dataset entries)
yaml_file_path = "htr-united.yml"
json_file_path = "htr-united.json"

if os.path.exists(yaml_file_path):
    # safe_load avoids arbitrary object construction from the YAML file
    with open(yaml_file_path, "r", encoding="utf8") as yaml_file:
        data = yaml.safe_load(yaml_file)
# at this point we could remove yaml_file_path but I keep it for history.
We are currently computing
the content of HTR-United's
catalog v0.1.170
# collect every language code declared by the catalog entries
# (an entry's "language" field is a list, so occurrences accumulate)
languages = []
for entry in data:
    entry_languages = entry.get("language")
    if entry_languages:
        languages.extend(entry_languages)

counted_lgges = Counter(languages)

print(f"There are {len(counted_lgges)} languages represented in the catalog.")
print(f"The top 5 languages represented are:")
for cl in counted_lgges.most_common(5):
    print(f"{cl[0]} = {cl[1]} times")

make_bar_plot(counted_lgges, title='Language Distribution', xlabel="Languages")
There are 27 languages represented in the catalog.
The top 5 languages represented are:
fra = 47 times
lat = 24 times
deu = 12 times
eng = 10 times
grc = 9 times
# collect the script descriptors (dicts with an "iso" key) from every entry
scripts_dict = []
for entry in data:
    entry_scripts = entry.get("script")
    if entry_scripts:
        scripts_dict.extend(entry_scripts)

# keep only the ISO 15924 code of each script descriptor
scripts = [sd["iso"] for sd in scripts_dict]

counted_scripts = Counter(scripts)

print(f"There are {len(counted_scripts)} scripts represented in the catalog.")
print(f"The top 5 scripts represented are:")
for cs in counted_scripts.most_common(5):
    print(f"{cs[0]} = {cs[1]} times")

make_bar_plot(counted_scripts, title='Script Distribution', xlabel="Scripts")
There are 8 scripts represented in the catalog.
The top 5 scripts represented are:
Latn = 94 times
Grek = 10 times
Arab = 4 times
Deva = 2 times
Goth = 2 times
# count writing types (manuscript / typed / mixed) declared per entry
script_types = [entry.get("script-type") for entry in data if entry.get("script-type")]
counted_script_types = Counter(script_types)

pprint(counted_script_types)
make_bar_plot(counted_script_types, title='Writing type Distribution', xlabel="Writing Type")
Counter({'only-manuscript': 61,
'only-typed': 35,
'mainly-manuscript': 8,
'evenly-mixed': 3})
# count the transcription software used to produce each dataset
softwares = [entry.get("production-software") for entry in data if entry.get("production-software")]
counted_softwares = Counter(softwares)

pprint(counted_softwares)
make_bar_plot(counted_softwares, title='Software Distribution', xlabel="Software")
Counter({'eScriptorium + Kraken': 62,
'Transkribus': 21,
'Unknown [Automatically filled]': 11,
'Unknown': 5,
'Calfa Vision': 2,
'Kraken + VGG Image Annotator (VIA)': 1,
'Aletheia': 1,
'eScriptorium + Kraken + Transkribus': 1,
'Transkribus, own': 1,
'Pivan': 1,
'Callico': 1})
# nb of datasets in the catalog (titles deduplicated)
nb_of_datasets = len({entry.get("title") for entry in data if entry.get("title")})

# nb of projects contributing to the catalog (and signaled)
projects = [entry.get("project-name", "unknown") for entry in data if entry.get("project-name")]

project_counts = Counter(projects)
nb_of_projects = len(project_counts)
biggest_project_contributers = project_counts.most_common(5)
nb_of_datasets_from_projects = len([entry.get("title") for entry in data if entry.get("project-name")])

# time span covered by the catalog (years; negative = BCE)
times = [entry.get("time") for entry in data if entry.get("time")]
earliest = min(int(time['notBefore']) for time in times)
latest = max(int(time['notAfter']) for time in times)

# let's see the result
print(
    f"There are {nb_of_datasets} datasets listed in the catalog",
    f"At least {nb_of_projects} identified projects contributed a total of {nb_of_datasets_from_projects} datasets listed in the catalog",
    f"The projects which contributed the most to the catalog are:",
    pretty_list_of_projects(biggest_project_contributers),
    f"The catalog covers a period going from the year {earliest} to the year {latest}",
    sep="\n"
)
There are 107 datasets listed in the catalog
At least 47 identified projects contributed a total of 86 datasets listed in the catalog
The projects which contributed the most to the catalog are:
FoNDUE - 13 datasets.
CREMMA - 8 datasets.
Gallicorpora - 5 datasets.
HTRomance - 5 datasets.
HTR Winter School 2022, Vienna - 4 datasets.
The catalog covers a period going from the year -250 to the year 2023
# totals per volume metric across the whole catalog
metrics = {
    'images': 0,
    'characters': 0,
    'lines': 0,
    'files': 0,
    'pages': 0,
    'regions': 0}

# accumulate each dataset's reported volumes; .get(..., 0) keeps an
# unexpected metric name from raising KeyError (it is created on the fly)
for entry in data:
    for vol in entry.get("volume", []):
        metrics[vol["metric"]] = metrics.get(vol["metric"], 0) + vol["count"]

pprint(metrics)
{'characters': 66442496,
'files': 26036,
'images': 150,
'lines': 1721869,
'pages': 27647,
'regions': 155926}