Skip to content

Commit ce31bc8

Browse files
committed
Add a customizable quality level to the auto config generator.
The quality level is a numeric value which controls the performance vs quality tradeoff. Lower values maximize performance, higher values maximize segmentation quality (at the cost of longer analysis times).
1 parent 6a86dba commit ce31bc8

File tree

5 files changed

+202
-43
lines changed

5 files changed

+202
-43
lines changed

util/auto_segmenter_config.cc

Lines changed: 145 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,29 @@ namespace util {
2727

2828
static constexpr uint32_t kMinimumGroupSize = 4;
2929

30+
// Quality levels for the auto generated segmenter config. Lower levels favour
// analysis speed, higher levels favour segmentation quality.
//
// Settings selected by each level:
//
// Quality | bigrams | find conditions | init brotli | non init brotli | init font merge threshold | opt cut off | preprocess merging | preprocess threshold
// 1       | No      | No              | 0           | 0               | 60%                       | 5%          | Yes                | 5%
// 2       | Yes     | No              | 0           | 0               | 55%                       | 4%          | Yes                | 4%
// 3       | Yes     | Yes             | 0           | 0               | 50%                       | 3%          | Yes                | 3%
// 4       | Yes     | Yes             | 0           | 9               | 45%                       | 2%          | Yes                | 2%
// 5       | Yes     | Yes             | 9           | 9               | 40%                       | 1%          | Yes                | 1%
// 6       | Yes     | Yes             | 9           | 11              | 30%                       | 0.5%        | Yes                | 0.5%
// 7       | Yes     | Yes             | 11          | 11              | 25%                       | 0.5%        | Yes                | 0.05%
// 8       | Yes     | Yes             | 11          | 11              | 25%                       | 0.5%        | No                 | na
//
// Column legend:
// - "init brotli" is brotli_quality_for_initial_font_merging.
// - "non init brotli" is brotli_quality.
// - "init font merge threshold" is initial_font_merge_probability_threshold.
// - "opt cut off" is optimization_cutoff_fraction.
// - "preprocess threshold" is preprocess_merging_probability_threshold.
enum Quality {
  ONE = 1,
  TWO,
  THREE,
  FOUR,
  FIVE,
  SIX,
  SEVEN,
  EIGHT,

  // Aliases for the bounds of the supported range.
  MIN = ONE,
  MAX = EIGHT,
};
52+
3053
// TODO(garretrieger): define a very basic set of quality levels first (see next TODO),
3154
// start with just a lowest and highest to set the upper and lower bounds for quality
3255
// settings (maybe also a mid point). To begin use number of codepoints to select quality
@@ -464,24 +487,140 @@ static Status ApplyPrimaryScript(
464487
return absl::OkStatus();
465488
}
466489

490+
static void ApplyQualityLevelTo(Quality quality, HeuristicConfiguration& config) {
491+
config.set_min_patch_size(2500);
492+
}
493+
494+
// Applies the quality dependent settings to a CostConfiguration: whether
// bigrams are used and the optimization cutoff fraction. The minimum group
// size is the same at every level.
static void ApplyQualityLevelTo(Quality quality, CostConfiguration& config) {
  config.set_min_group_size(kMinimumGroupSize);

  // Only the lowest quality level runs without bigrams.
  config.set_use_bigrams(quality != ONE);

  // Levels six and above, as well as any unrecognized value, share the
  // smallest cutoff.
  double cutoff_fraction = 0.005;
  switch (quality) {
    case ONE:
      cutoff_fraction = 0.05;
      break;
    case TWO:
      cutoff_fraction = 0.04;
      break;
    case THREE:
      cutoff_fraction = 0.03;
      break;
    case FOUR:
      cutoff_fraction = 0.02;
      break;
    case FIVE:
      cutoff_fraction = 0.01;
      break;
    default:
      break;
  }
  config.set_optimization_cutoff_fraction(cutoff_fraction);
}
515+
516+
// Applies the quality dependent settings to a single merge group.
//
// Merge groups without a cost config are left untouched. For the others this
// sets the preprocess merging group size and probability threshold, and, when
// the cost config has an initial font merge threshold, the initial font merge
// probability threshold.
static void ApplyQualityLevelTo(Quality quality, MergeGroup& merge_group) {
  if (!merge_group.has_cost_config()) {
    return;
  }

  // Preprocess merging is used at levels one through seven. Any other value
  // gets a group size of 1 and no probability threshold.
  static constexpr double kPreprocessThresholds[] = {
      0.05, 0.04, 0.03, 0.02, 0.01, 0.005, 0.0005,  // ONE .. SEVEN
  };
  const bool preprocess_merging = (quality >= ONE && quality <= SEVEN);
  if (preprocess_merging) {
    merge_group.set_preprocess_merging_group_size(kMinimumGroupSize);
    merge_group.set_preprocess_merging_probability_threshold(
        kPreprocessThresholds[quality - ONE]);
  } else {
    merge_group.set_preprocess_merging_group_size(1);
    merge_group.clear_preprocess_merging_probability_threshold();
  }

  auto* cost = merge_group.mutable_cost_config();
  if (!cost->has_initial_font_merge_threshold()) {
    return;
  }

  // Levels one through six have their own threshold, everything else uses 0.25.
  static constexpr double kInitFontThresholds[] = {
      0.60, 0.55, 0.50, 0.45, 0.40, 0.30,  // ONE .. SIX
  };
  const bool has_own_threshold = (quality >= ONE && quality <= SIX);
  cost->set_initial_font_merge_probability_threshold(
      has_own_threshold ? kInitFontThresholds[quality - ONE] : 0.25);
}
551+
552+
// Applies the quality dependent settings to a whole SegmenterConfig: the top
// level fields first, then the base heuristic config, the base cost config and
// every merge group already present in 'config'.
static void ApplyQualityLevelTo(Quality quality, SegmenterConfig& config) {
  config.set_preprocess_merging_group_size_for_ungrouped(kMinimumGroupSize);

  // The two lowest levels move unmapped glyphs to the init font instead of
  // finding conditions for them.
  const bool skip_find_conditions = (quality == ONE || quality == TWO);
  config.set_unmapped_glyph_handling(skip_find_conditions ? MOVE_TO_INIT_FONT
                                                          : FIND_CONDITIONS);

  // Unrecognized levels fall back to the highest brotli quality in both cases.
  int brotli_quality = 11;
  if (quality >= ONE && quality <= THREE) {
    brotli_quality = 0;
  } else if (quality == FOUR || quality == FIVE) {
    brotli_quality = 9;
  }
  config.set_brotli_quality(brotli_quality);

  int init_font_brotli_quality = 11;
  if (quality >= ONE && quality <= FOUR) {
    init_font_brotli_quality = 0;
  } else if (quality == FIVE || quality == SIX) {
    init_font_brotli_quality = 9;
  }
  config.set_brotli_quality_for_initial_font_merging(init_font_brotli_quality);

  ApplyQualityLevelTo(quality, *config.mutable_base_heuristic_config());
  ApplyQualityLevelTo(quality, *config.mutable_base_cost_config());

  for (auto& group : *config.mutable_merge_groups()) {
    ApplyQualityLevelTo(quality, group);
  }
}
604+
467605
absl::StatusOr<SegmenterConfig> AutoSegmenterConfig::GenerateConfig(
468-
hb_face_t* face, std::optional<std::string> primary_script) {
606+
hb_face_t* face, std::optional<std::string> primary_script, std::optional<int> quality_level) {
469607
SegmenterConfig config;
470608
config.set_generate_table_keyed_segments(true);
471609
config.set_generate_feature_segments(true);
472-
config.set_unmapped_glyph_handling(FIND_CONDITIONS);
473610
config.set_condition_analysis_mode(CLOSURE_AND_DEP_GRAPH);
474611

475612
auto* base_plan = config.mutable_base_segmentation_plan();
476613
base_plan->set_jump_ahead(2);
477614
base_plan->set_use_prefetch_lists(true);
478615

479-
config.mutable_ungrouped_config()->set_min_patch_size(2500);
480-
481616
// Collect codepoints
482617
auto freq_list = TRY(BuiltInFrequenciesList());
483618
CodepointSet unicodes = FontHelper::ToCodepointsSet(face);
484619
uint32_t cp_count = unicodes.size();
620+
Quality quality = cp_count > 2000 ? MIN : MAX;
621+
if (quality_level.has_value() && quality_level.value() >= ONE && quality_level.value() <= MAX) {
622+
quality = static_cast<Quality>(quality_level.value());
623+
}
485624

486625
// Detect scripts by intersection with frequency data
487626
btree_set<std::string> detected_scripts = DetectScripts(freq_list, unicodes);
@@ -491,18 +630,6 @@ absl::StatusOr<SegmenterConfig> AutoSegmenterConfig::GenerateConfig(
491630
// (including accounting for pairs only within merge groups), and then select
492631
// the cutoffs and premerging to keep the number of brotli ops within a
493632
// specific range.
494-
auto* base_cost = config.mutable_base_cost_config();
495-
base_cost->set_use_bigrams(true);
496-
base_cost->set_min_group_size(
497-
kMinimumGroupSize); // as recommended by the spec.
498-
config.set_preprocess_merging_group_size_for_ungrouped(kMinimumGroupSize);
499-
base_cost->set_optimization_cutoff_fraction(0.01);
500-
501-
if (cp_count > 2000) {
502-
config.set_brotli_quality(9);
503-
} else {
504-
config.set_brotli_quality(11);
505-
}
506633

507634
TRYV(ApplyPrimaryScript(freq_list, primary_script.value_or("Script_latin"),
508635
detected_scripts));
@@ -515,20 +642,15 @@ absl::StatusOr<SegmenterConfig> AutoSegmenterConfig::GenerateConfig(
515642
mg->set_name(ScriptName(script));
516643
auto* cost = mg->mutable_cost_config();
517644

518-
// TODO(garretrieger): use a heuristic to select probability threshold based
519-
// on estimated number of brotli ops (assuming O(n^2) on codepoints in the
520-
// group).
521-
mg->set_preprocess_merging_group_size(kMinimumGroupSize);
522-
mg->set_preprocess_merging_probability_threshold(0.001);
523-
524645
cost->set_built_in_freq_data_name(script);
525646
if (script == primary_script_file) {
526647
// TODO(garretrieger): customize this threshold based on the quality level
527648
cost->set_initial_font_merge_threshold(-60);
528-
cost->set_initial_font_merge_probability_threshold(0.40);
529649
}
530650
}
531651

652+
ApplyQualityLevelTo(quality, config);
653+
532654
return config;
533655
}
534656

util/auto_segmenter_config.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,8 @@ class AutoSegmenterConfig {
2121
// Defaults to "Script_latin" if not provided.
2222
static absl::StatusOr<SegmenterConfig> GenerateConfig(
2323
hb_face_t* face,
24-
std::optional<std::string> primary_script = std::nullopt);
24+
std::optional<std::string> primary_script = std::nullopt,
25+
std::optional<int> quality_level = std::nullopt);
2526

2627
// Returns the base script for a given language.
2728
// For example, "Language_fr" -> "Script_latin".

util/auto_segmenter_config_test.cc

Lines changed: 31 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -89,53 +89,49 @@ TEST_F(AutoSegmenterConfigTest, Roboto_UnspecifiedPrimary) {
8989
ASSERT_EQ(config_string, R"(unmapped_glyph_handling: FIND_CONDITIONS
9090
generate_table_keyed_segments: true
9191
brotli_quality: 11
92+
brotli_quality_for_initial_font_merging: 11
93+
base_heuristic_config {
94+
min_patch_size: 2500
95+
}
9296
base_cost_config {
9397
use_bigrams: true
9498
min_group_size: 4
95-
optimization_cutoff_fraction: 0.01
96-
}
97-
ungrouped_config {
98-
min_patch_size: 2500
99+
optimization_cutoff_fraction: 0.005
99100
}
100101
preprocess_merging_group_size_for_ungrouped: 4
101102
merge_groups {
102103
name: "Cyrillic"
103-
preprocess_merging_group_size: 4
104-
preprocess_merging_probability_threshold: 0.001
104+
preprocess_merging_group_size: 1
105105
cost_config {
106106
built_in_freq_data_name: "Script_cyrillic.riegeli"
107107
}
108108
}
109109
merge_groups {
110110
name: "Greek"
111-
preprocess_merging_group_size: 4
112-
preprocess_merging_probability_threshold: 0.001
111+
preprocess_merging_group_size: 1
113112
cost_config {
114113
built_in_freq_data_name: "Script_greek.riegeli"
115114
}
116115
}
117116
merge_groups {
118117
name: "Latin"
119-
preprocess_merging_group_size: 4
120-
preprocess_merging_probability_threshold: 0.001
118+
preprocess_merging_group_size: 1
121119
cost_config {
122120
built_in_freq_data_name: "Script_latin.riegeli"
123121
initial_font_merge_threshold: -60
124-
initial_font_merge_probability_threshold: 0.4
122+
initial_font_merge_probability_threshold: 0.25
125123
}
126124
}
127125
merge_groups {
128126
name: "Symbols"
129-
preprocess_merging_group_size: 4
130-
preprocess_merging_probability_threshold: 0.001
127+
preprocess_merging_group_size: 1
131128
cost_config {
132129
built_in_freq_data_name: "Script_symbols.riegeli"
133130
}
134131
}
135132
merge_groups {
136133
name: "Fallback"
137-
preprocess_merging_group_size: 4
138-
preprocess_merging_probability_threshold: 0.001
134+
preprocess_merging_group_size: 1
139135
cost_config {
140136
built_in_freq_data_name: "fallback.riegeli"
141137
}
@@ -275,5 +271,25 @@ TEST_F(AutoSegmenterConfigTest, LanguageMappingsExist) {
275271
}
276272
}
277273

274+
// Checks that an explicitly requested quality level is used for the generated
// config, by comparing the settings produced for levels 1 and 8.
TEST_F(AutoSegmenterConfigTest, QualityLevelForcing) {
  // Level 1.
  auto lowest = AutoSegmenterConfig::GenerateConfig(face_.get(), std::nullopt, 1);
  ASSERT_TRUE(lowest.ok()) << lowest.status();

  const auto& lowest_cost = lowest->base_cost_config();
  EXPECT_EQ(lowest->brotli_quality(), 0);
  EXPECT_EQ(lowest->unmapped_glyph_handling(), MOVE_TO_INIT_FONT);
  EXPECT_FALSE(lowest_cost.use_bigrams());
  EXPECT_EQ(lowest->brotli_quality_for_initial_font_merging(), 0);
  EXPECT_EQ(lowest_cost.optimization_cutoff_fraction(), 0.05);

  // Level 8.
  auto highest = AutoSegmenterConfig::GenerateConfig(face_.get(), std::nullopt, 8);
  ASSERT_TRUE(highest.ok()) << highest.status();

  const auto& highest_cost = highest->base_cost_config();
  EXPECT_EQ(highest->brotli_quality(), 11);
  EXPECT_EQ(highest->unmapped_glyph_handling(), FIND_CONDITIONS);
  EXPECT_TRUE(highest_cost.use_bigrams());
  EXPECT_EQ(highest->brotli_quality_for_initial_font_merging(), 11);
  EXPECT_EQ(highest_cost.optimization_cutoff_fraction(), 0.005);
}
293+
278294
} // namespace
279295
} // namespace util

util/closure_glyph_keyed_segmenter_util.cc

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
#include <google/protobuf/text_format.h>
22

33
#include <cstdint>
4-
#include <cstdio>
54
#include <iostream>
65
#include <vector>
6+
#include <chrono>
77

88
#include "absl/container/btree_map.h"
99
#include "absl/container/flat_hash_map.h"
@@ -43,6 +43,9 @@ ABSL_FLAG(
4343
"Path to a text proto file containing the configuration for the segmenter. "
4444
"Should contain a single SegmenterConfig message.");
4545

46+
// Quality level forwarded to the auto config generator. LoadConfig() only
// forwards values greater than zero.
ABSL_FLAG(int, auto_config_quality, 0,
          "The quality level to use when auto_config is enabled. "
          "A value of 0 means auto pick. "
          "Valid values are 1-8.");
48+
4649
ABSL_FLAG(bool, auto_config, false,
4750
"If set the segmenter configuration will be automatically generated "
4851
"based on the input font.");
@@ -95,8 +98,12 @@ using util::SegmenterConfigUtil;
9598

9699
static StatusOr<SegmenterConfig> LoadConfig(hb_face_t* font) {
97100
if (absl::GetFlag(FLAGS_auto_config)) {
101+
std::optional<int> quality_level = std::nullopt;
102+
if (absl::GetFlag(FLAGS_auto_config_quality) > 0) {
103+
quality_level = absl::GetFlag(FLAGS_auto_config_quality);
104+
}
98105
return AutoSegmenterConfig::GenerateConfig(
99-
font, absl::GetFlag(FLAGS_primary_script));
106+
font, absl::GetFlag(FLAGS_primary_script), quality_level);
100107
}
101108

102109
FontData config_text =
@@ -143,7 +150,7 @@ static Status Analysis(hb_face_t* font,
143150
group_index++;
144151
}
145152

146-
std::cerr << "total_cost_across_groups = " << overall_cost << std::endl;
153+
std::cerr << "total_cost_across_groups = " << (uint64_t) overall_cost << std::endl;
147154

148155
return absl::OkStatus();
149156
}
@@ -224,8 +231,13 @@ static Status Main(const std::vector<char*> args) {
224231
ClosureGlyphSegmenter segmenter(
225232
config.brotli_quality(), config.brotli_quality_for_initial_font_merging(),
226233
config.unmapped_glyph_handling(), config.condition_analysis_mode());
234+
235+
auto start_time = std::chrono::high_resolution_clock::now();
227236
GlyphSegmentation segmentation = TRY(segmenter.CodepointToGlyphSegments(
228237
font.get(), init_segment, segments, merge_groups));
238+
auto end_time = std::chrono::high_resolution_clock::now();
239+
std::chrono::duration<double> duration = end_time - start_time;
240+
std::cerr << "CodepointToGlyphSegments took: " << duration.count() << " seconds" << std::endl;
229241

230242
if (absl::GetFlag(FLAGS_output_segmentation_plan)) {
231243
SegmentationPlan plan = segmentation.ToSegmentationPlanProto();

0 commit comments

Comments
 (0)