Unverified Commit de555239 authored by Jonathan Kunstwald

Complete 3.

parent b7c3aa66
@@ -174,6 +174,20 @@ struct ngram_statistics
{
    std::unordered_map<uint64_t, ngram_info> ngram_occurence; // key: combined n-gram hash, value: "info" - occurrence count & latest index
    asr::flat_linear_map<unsigned, unsigned> ngram_count_counts; // key: M, value: how many n-grams occur M times
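
    // fills ngram_count_counts ("count of counts") from ngram_occurence:
    // e.g. if exactly 4 distinct n-grams each occur 3 times, the entry for key 3 ends up as 4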
    void compute_count_counts()
    {
        CC_ASSERT(ngram_count_counts._nodes.empty() && "re-ran");

        // iterate over all unique n-grams
        for (auto const& val : ngram_occurence)
        {
            // get the entry in the count-counts map corresponding to all n-grams which appeared this often
            // default value: 0 (second argument)
            unsigned& occurence_count = ngram_count_counts.get_value(val.second.num_occurences, 0);
            // increment it by one
            occurence_count++;
        }
    }
};
/// searches for contiguous occurrences of (a, b, c) in the token hashes and returns the count
@@ -235,15 +249,80 @@ void compute_ngram_stats(cc::span<uint64_t const> token_hashes, ngram_statistics
        info.latest_index = i;
    }

    out_stats.compute_count_counts();
}

struct trigram_postfixes
{
    uint64_t hash_b;
    uint64_t hash_c;
};
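
// note: a trigram whose latest occurrence starts at index i spans tokens i, i + 1 and i + 2,
// so the lookups at latest_index + 1 and latest_index + 2 below stay within that trigram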
// get the (non-unique) postfixes (b, c) of all unique trigrams (a, b, c)
cc::alloc_vector<trigram_postfixes> get_trigram_postfixes(cc::span<uint64_t const> token_hashes, ngram_statistics const& stats)
{
    cc::alloc_vector<trigram_postfixes> res;
    res.reserve(stats.ngram_occurence.size());

    for (auto const& trigram : stats.ngram_occurence)
    {
        auto const first_token_i = trigram.second.latest_index;
        res.push_back(trigram_postfixes{token_hashes[first_token_i + 1], token_hashes[first_token_i + 2]});
    }

    return res;
}
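
// for each postfix (b, c), the inner loop below computes  sum over all u in the vocabulary of count(u, b, c)
// (presumably "the formula" referred to in its comment); that sum is then added to the bigram count of (b, c)
// and to the unigram count of c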
void compute_summed_sub_trigram_stats(cc::span<trigram_postfixes const> postfixes,
                                      cc::span<uint64_t const> vocab_hashes,
                                      ngram_statistics const& prev_trigrams,
                                      ngram_statistics& out_stats_bigrams,
                                      ngram_statistics& out_stats_unigrams)
{
    for (trigram_postfixes const& postfix : postfixes)
    {
        unsigned num_postfix_occurence = 0u;
        unsigned latest_trigram_occurence = unsigned(-1);

        // inner sum over u, just like in the formula
        for (uint64_t const word_hash_u : vocab_hashes)
        {
            uint64_t const combined_trigram_hash = asr::hash_combine(word_hash_u, postfix.hash_b, postfix.hash_c);

            // search prev_trigrams for this hash; if found, add its occurrences to the bigram occurrences
            auto const map_result = prev_trigrams.ngram_occurence.find(combined_trigram_hash);
            if (map_result != prev_trigrams.ngram_occurence.end())
            {
                num_postfix_occurence += map_result->second.num_occurences;
                latest_trigram_occurence = map_result->second.latest_index;
            }
        }

        if (num_postfix_occurence == 0u)
            continue;

        // by nature of the loop, we do not repeat bigrams (assuming vocabulary entries are unique)
        {
            uint64_t const bigram_hash = asr::hash_combine(postfix.hash_b, postfix.hash_c);
            ngram_info& info = out_stats_bigrams.ngram_occurence[bigram_hash];

            CC_ASSERT(latest_trigram_occurence != unsigned(-1) && "programmer error");
            info.num_occurences += num_postfix_occurence; // += just to make sure
            info.latest_index = latest_trigram_occurence;
        }
        {
            // unigrams are repeated
            uint64_t const unigram_hash = postfix.hash_c;
            ngram_info& info = out_stats_unigrams.ngram_occurence[unigram_hash];

            CC_ASSERT(latest_trigram_occurence != unsigned(-1) && "programmer error");
            info.num_occurences += num_postfix_occurence;
            info.latest_index = latest_trigram_occurence;
        }
    }

    out_stats_bigrams.compute_count_counts();
    out_stats_unigrams.compute_count_counts();
}
using ngram_with_info_t = std::pair<uint64_t, ngram_info>;
@@ -257,6 +336,7 @@ std::vector<ngram_with_info_t> get_sorted_frequent_ngrams(ngram_statistics const
    return linear_infos;
}
}
int main()
@@ -394,5 +474,45 @@ int main()
    printf("2. e) out-of-vocabulary (OOV) rate: %.2f %%\n", oov_rate * 100.f);

    // 3.
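    // plan: for every unique trigram (a, b, c) over the vocabulary, take its postfix (b, c) and sum the trigram
    // counts over all possible first words u to obtain bigram and unigram counts; then cross-check by counting
    // bigrams and unigrams directly from the token stream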
    // first - "naive" summing over already extracted trigrams
    {
        printf("3. - computing naive sum over (vocab) trigrams to obtain bigram and unigram frequencies..\n");
        fflush(stdout);

        auto const trigram_postfixes = get_trigram_postfixes(corpus_tokens_over_vocab.ordered_token_hashes, trigram_stats_over_vocab);

        ngram_statistics bigram_stats_naive;
        ngram_statistics unigram_stats_naive;

        printf("3. - naive sum bigrams/unigrams - looping %zu x %zu times\n", trigram_postfixes.size(), vocab_tokens.ordered_token_hashes.size());
        fflush(stdout);

        compute_summed_sub_trigram_stats(trigram_postfixes, vocab_tokens.ordered_token_hashes, trigram_stats_over_vocab, bigram_stats_naive, unigram_stats_naive);

        auto const bigram_frequency = get_sorted_frequent_ngrams(bigram_stats_naive);
        auto const unigram_frequency = get_sorted_frequent_ngrams(unigram_stats_naive);

        printf("3. naive summed bigrams: got %zu bigrams in total\n", bigram_frequency.size());
        printf("3. naive summed unigrams: got %zu unigrams in total\n", unigram_frequency.size());
    }

    // second - recompute
    {
        printf("3. - recomputing bigrams and unigrams..\n");
        fflush(stdout);

        ngram_statistics bigram_stats;
        compute_ngram_stats(corpus_tokens_over_vocab.ordered_token_hashes, bigram_stats, 2);

        ngram_statistics unigram_stats;
        compute_ngram_stats(corpus_tokens_over_vocab.ordered_token_hashes, unigram_stats, 1);

        auto const bigram_frequency = get_sorted_frequent_ngrams(bigram_stats);
        auto const unigram_frequency = get_sorted_frequent_ngrams(unigram_stats);

        printf("3. recomputed bigrams: got %zu bigrams in total\n", bigram_frequency.size());
        printf("3. recomputed unigrams: got %zu unigrams in total\n", unigram_frequency.size());
    }
}
}