Skip to content

Commit e023792

Browse files
committed
Add LoadCodepoints::BuiltInFrequenciesList()
It provides a list of available frequency data sets + the set of codepoints each one covers.
1 parent 643ca12 commit e023792

File tree

7 files changed

+52
-24
lines changed

7 files changed

+52
-24
lines changed

MODULE.bazel

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ bazel_dep(name = "brotli", version = "1.2.0")
1919

2020
# Frequency Data
2121
bazel_dep(name = "ift_encoder_data", version = "git")
22-
git_override(module_name = "ift_encoder_data", remote = "https://github.com/w3c/ift-encoder-data.git", commit = "317ea02ac68d45004aa842f831c65ed33c891701")
22+
git_override(module_name = "ift_encoder_data", remote = "https://github.com/w3c/ift-encoder-data.git", commit = "8f50ef507feac8f8bdf6c39a92f95b7c991dcfda")
2323

2424
# Non Bazel Modules
2525
http_archive = use_repo_rule("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")

util/BUILD

Lines changed: 3 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -22,17 +22,6 @@ cc_proto_library(
2222
],
2323
)
2424

25-
proto_library(
26-
name = "unicode_count_proto",
27-
srcs = ["unicode_count.proto"],
28-
)
29-
30-
cc_proto_library(
31-
name = "unicode_count_cc_proto",
32-
visibility = ["//visibility:public"],
33-
deps = [":unicode_count_proto"],
34-
)
35-
3625
proto_library(
3726
name = "common_proto",
3827
srcs = ["common.proto"],
@@ -178,12 +167,13 @@ cc_library(
178167
"load_codepoints.h",
179168
],
180169
deps = [
181-
":unicode_count_cc_proto",
182170
"//common",
183171
"//ift/freq",
184172
"@abseil-cpp//absl/status:statusor",
185173
"@abseil-cpp//absl/strings",
186174
"@harfbuzz",
175+
"@ift_encoder_data//:codepoint_count_cc_proto",
176+
"@ift_encoder_data//:metadata_cc_proto",
187177
"@riegeli//riegeli/bytes:fd_reader",
188178
"@riegeli//riegeli/records:record_reader",
189179
],
@@ -284,12 +274,12 @@ cc_binary(
284274
"generate_riegeli_test_data.cc",
285275
],
286276
deps = [
287-
":unicode_count_cc_proto",
288277
"@abseil-cpp//absl/flags:flag",
289278
"@abseil-cpp//absl/flags:parse",
290279
"@abseil-cpp//absl/status",
291280
"@riegeli//riegeli/bytes:fd_writer",
292281
"@riegeli//riegeli/records:record_writer",
282+
"@ift_encoder_data//:codepoint_count_cc_proto",
293283
],
294284
)
295285

util/generate_riegeli_test_data.cc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,10 @@
99
#include "absl/strings/str_cat.h"
1010
#include "riegeli/bytes/fd_writer.h"
1111
#include "riegeli/records/record_writer.h"
12-
#include "util/unicode_count.pb.h"
12+
#include "codepoint_count.pb.h"
1313

1414
using absl::StrCat;
15+
using ift_encoder_data::CodepointCount;
1516

1617
ABSL_FLAG(std::string, output_path, "", "Path to write the output file.");
1718

util/load_codepoints.cc

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
#include "load_codepoints.h"
22

3-
#include <algorithm>
43
#include <filesystem>
54
#include <fstream>
65
#include <iostream>
@@ -11,20 +10,26 @@
1110
#include "absl/strings/str_cat.h"
1211
#include "absl/strings/strip.h"
1312
#include "common/font_data.h"
13+
#include "common/int_set.h"
1414
#include "hb.h"
1515
#include "ift/freq/unicode_frequencies.h"
1616
#include "riegeli/bytes/fd_reader.h"
1717
#include "riegeli/records/record_reader.h"
18-
#include "util/unicode_count.pb.h"
18+
#include "codepoint_count.pb.h"
19+
#include "metadata.pb.h"
1920

2021
using absl::Status;
2122
using absl::StatusOr;
2223
using absl::StrCat;
2324
using absl::string_view;
25+
using absl::flat_hash_map;
26+
using common::CodepointSet;
2427
using common::FontData;
2528
using common::hb_blob_unique_ptr;
2629
using common::make_hb_blob;
2730
using ift::freq::UnicodeFrequencies;
31+
using ift_encoder_data::CodepointCount;
32+
using ift_encoder_data::DatasetMetadata;
2833

2934
namespace util {
3035

@@ -210,4 +215,27 @@ StatusOr<UnicodeFrequencies> LoadBuiltInFrequencies(const char* name) {
210215
return LoadFrequenciesFromRiegeli(path.c_str());
211216
}
212217

218+
// Builds a map from each data set file name listed in the metadata file to
// the set of codepoints recorded for that file.
//
// Returns NotFoundError if the metadata file cannot be opened, and
// InternalError if its contents fail to parse as a DatasetMetadata message.
StatusOr<flat_hash_map<std::string, common::CodepointSet>>
219+
BuiltInFrequenciesList() {
220+
// Relative path, so it is resolved against the process's current working
// directory. NOTE(review): presumably this matches the Bazel runfiles layout
// for the ift_encoder_data module - confirm.
std::string path = "../ift_encoder_data+/data/metadata.binpb";
221+
// Binary mode: the file is parsed below as a serialized protobuf message.
std::ifstream in(path, std::ios::binary);
222+
if (!in.is_open()) {
223+
return absl::NotFoundError(StrCat("Metadata file ", path, " was not found."));
224+
}
225+
DatasetMetadata metadata;
226+
if (!metadata.ParseFromIstream(&in)) {
227+
return absl::InternalError(StrCat("Failed to parse metadata file ", path));
228+
}
229+
230+
flat_hash_map<std::string, common::CodepointSet> result;
231+
// One map entry per file record in the metadata; if two records share a
// file_name the later one replaces the earlier one.
for (const auto& file : metadata.files()) {
232+
CodepointSet codepoints;
233+
for (uint32_t cp : file.codepoints()) {
234+
codepoints.insert(cp);
235+
}
236+
result[file.file_name()] = std::move(codepoints);
237+
}
238+
return result;
239+
}
240+
213241
} // namespace util

util/load_codepoints.h

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#include <vector>
66

77
#include "absl/container/btree_set.h"
8+
#include "absl/container/flat_hash_map.h"
89
#include "absl/status/statusor.h"
910
#include "common/font_data.h"
1011
#include "common/font_helper.h"
@@ -43,13 +44,18 @@ absl::StatusOr<common::FontData> LoadFile(const char* path);
4344
absl::StatusOr<ift::freq::UnicodeFrequencies> LoadFrequenciesFromRiegeli(
4445
const char* path);
4546

46-
// Loads frequency data from https://github.com/w3c/ift-encoder-data
47+
// Loads frequency data from https://github.com/w3c/ift-encoder-data
4748
//
4849
// name is the file name to load.
4950
// Append "@*" to the name to load all sharded files for a name.
5051
absl::StatusOr<ift::freq::UnicodeFrequencies> LoadBuiltInFrequencies(
5152
const char* name);
5253

54+
// Returns a list of all built-in frequency data sets and the codepoints
55+
// they cover.
56+
absl::StatusOr<absl::flat_hash_map<std::string, common::CodepointSet>>
57+
BuiltInFrequenciesList();
58+
5359
// Given a filepath if it ends with @* this will expand the path into
5460
// the list of paths matching the pattern: <path>-?????-of-?????
5561
// Otherwise returns just the input path.

util/load_codepoints_test.cc

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,4 +164,13 @@ TEST_F(LoadCodepointsTest, LoadBuiltInFrequencies) {
164164
EXPECT_EQ(result->CoveredCodepoints().size(), 1363);
165165
}
166166

167+
// Verifies that BuiltInFrequenciesList() succeeds, returns a non-empty map,
// and includes a "Script_latin.riegeli" entry whose codepoint set is
// non-empty and contains 'Q'.
TEST_F(LoadCodepointsTest, BuiltInFrequenciesList) {
168+
auto result = util::BuiltInFrequenciesList();
169+
// ASSERT (not EXPECT) so the dereferences below never run on an error status.
ASSERT_TRUE(result.ok()) << result.status();
170+
EXPECT_FALSE(result->empty());
171+
EXPECT_TRUE(result->contains("Script_latin.riegeli"));
172+
EXPECT_FALSE((*result)["Script_latin.riegeli"].empty());
173+
EXPECT_TRUE((*result)["Script_latin.riegeli"].contains('Q'));
174+
}
175+
167176
} // namespace util

util/unicode_count.proto

Lines changed: 0 additions & 6 deletions
This file was deleted.

0 commit comments

Comments
 (0)