Unverified Commit de555239 authored by Jonathan Kunstwald

Complete 3.

parent b7c3aa66
@@ -174,6 +174,20 @@ struct ngram_statistics
{
    std::unordered_map<uint64_t, ngram_info> ngram_occurence; // key: combined n-gram hash, value: "info" - occurrence count & latest index
    asr::flat_linear_map<unsigned, unsigned> ngram_count_counts; // key: M, value: how many n-grams occur M times
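
    // fills ngram_count_counts ("count of counts") from ngram_occurence:
    // e.g. if exactly 4 distinct n-grams each occur 3 times, the entry for key 3 ends up as 4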
    void compute_count_counts()
    {
        CC_ASSERT(ngram_count_counts._nodes.empty() && "re-ran");

        // iterate over all unique n-grams
        for (auto const& val : ngram_occurence)
        {
            // get the entry in the count-counts map corresponding to all n-grams which appeared this often
            // default value: 0 (second argument)
            unsigned& occurence_count = ngram_count_counts.get_value(val.second.num_occurences, 0);
            // increment it by one
            occurence_count++;
        }
    }
};
/// searches for contiguous occurrences of (a, b, c) in the token hashes and returns the count
@@ -235,15 +249,80 @@ void compute_ngram_stats(cc::span<uint64_t const> token_hashes, ngram_statistics
        info.latest_index = i;
    }

    out_stats.compute_count_counts();
}

struct trigram_postfixes
{
    uint64_t hash_b;
    uint64_t hash_c;
};
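
// note: a trigram whose latest occurrence starts at index i spans tokens i, i + 1 and i + 2,
// so the lookups at latest_index + 1 and latest_index + 2 below stay within that trigram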
// get the (non-unique) postfixes (b, c) of all unique trigrams (a, b, c)
cc::alloc_vector<trigram_postfixes> get_trigram_postfixes(cc::span<uint64_t const> token_hashes, ngram_statistics const& stats)
{
    cc::alloc_vector<trigram_postfixes> res;
    res.reserve(stats.ngram_occurence.size());

    for (auto const& trigram : stats.ngram_occurence)
    {
        auto const first_token_i = trigram.second.latest_index;
        res.push_back(trigram_postfixes{token_hashes[first_token_i + 1], token_hashes[first_token_i + 2]});
    }

    return res;
}
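
// for each postfix (b, c), the inner loop below computes  sum over all u in the vocabulary of count(u, b, c)
// (presumably "the formula" referred to in its comment); that sum is then added to the bigram count of (b, c)
// and to the unigram count of c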
void compute_summed_sub_trigram_stats(cc::span<trigram_postfixes const> postfixes,
                                      cc::span<uint64_t const> vocab_hashes,
                                      ngram_statistics const& prev_trigrams,
                                      ngram_statistics& out_stats_bigrams,
                                      ngram_statistics& out_stats_unigrams)
{
    for (trigram_postfixes const& postfix : postfixes)
    {
        unsigned num_postfix_occurence = 0u;
        unsigned latest_trigram_occurence = unsigned(-1);

        // inner sum over u, just like in the formula
        for (uint64_t const word_hash_u : vocab_hashes)
        {
            uint64_t const combined_trigram_hash = asr::hash_combine(word_hash_u, postfix.hash_b, postfix.hash_c);

            // search prev_trigrams for this hash; if found, add its occurrences to the bigram occurrences
            auto const map_result = prev_trigrams.ngram_occurence.find(combined_trigram_hash);
            if (map_result != prev_trigrams.ngram_occurence.end())
            {
                num_postfix_occurence += map_result->second.num_occurences;
                latest_trigram_occurence = map_result->second.latest_index;
            }
        }

        if (num_postfix_occurence == 0u)
            continue;

        // by nature of the loop, we do not repeat bigrams (assuming vocabulary entries are unique)
        {
            uint64_t const bigram_hash = asr::hash_combine(postfix.hash_b, postfix.hash_c);
            ngram_info& info = out_stats_bigrams.ngram_occurence[bigram_hash];

            CC_ASSERT(latest_trigram_occurence != unsigned(-1) && "programmer error");
            info.num_occurences += num_postfix_occurence; // += just to make sure
            info.latest_index = latest_trigram_occurence;
        }
        {
            // unigrams are repeated
            uint64_t const unigram_hash = postfix.hash_c;
            ngram_info& info = out_stats_unigrams.ngram_occurence[unigram_hash];

            CC_ASSERT(latest_trigram_occurence != unsigned(-1) && "programmer error");
            info.num_occurences += num_postfix_occurence;
            info.latest_index = latest_trigram_occurence;
        }
    }

    out_stats_bigrams.compute_count_counts();
    out_stats_unigrams.compute_count_counts();
}
using ngram_with_info_t = std::pair<uint64_t, ngram_info>;
@@ -257,6 +336,7 @@ std::vector<ngram_with_info_t> get_sorted_frequent_ngrams(ngram_statistics const
    return linear_infos;
}
}
int main()
@@ -394,5 +474,45 @@ int main()
    printf("2. e) out-of-vocabulary (OOV) rate: %.2f %%\n", oov_rate * 100.f);

    // 3.
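    // plan: for every unique trigram (a, b, c) over the vocabulary, take its postfix (b, c) and sum the trigram
    // counts over all possible first words u to obtain bigram and unigram counts; then cross-check by counting
    // bigrams and unigrams directly from the token stream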
    // first - "naive" summing over already extracted trigrams
    {
        printf("3. - computing naive sum over (vocab) trigrams to obtain bigram and unigram frequencies..\n");
        fflush(stdout);

        auto const trigram_postfixes = get_trigram_postfixes(corpus_tokens_over_vocab.ordered_token_hashes, trigram_stats_over_vocab);

        ngram_statistics bigram_stats_naive;
        ngram_statistics unigram_stats_naive;

        printf("3. - naive sum bigrams/unigrams - looping %zu x %zu times\n", trigram_postfixes.size(), vocab_tokens.ordered_token_hashes.size());
        fflush(stdout);

        compute_summed_sub_trigram_stats(trigram_postfixes, vocab_tokens.ordered_token_hashes, trigram_stats_over_vocab, bigram_stats_naive, unigram_stats_naive);

        auto const bigram_frequency = get_sorted_frequent_ngrams(bigram_stats_naive);
        auto const unigram_frequency = get_sorted_frequent_ngrams(unigram_stats_naive);

        printf("3. naive summed bigrams: got %zu bigrams in total\n", bigram_frequency.size());
        printf("3. naive summed unigrams: got %zu unigrams in total\n", unigram_frequency.size());
    }

    // second - recompute
    {
        printf("3. - recomputing bigrams and unigrams..\n");
        fflush(stdout);

        ngram_statistics bigram_stats;
        compute_ngram_stats(corpus_tokens_over_vocab.ordered_token_hashes, bigram_stats, 2);

        ngram_statistics unigram_stats;
        compute_ngram_stats(corpus_tokens_over_vocab.ordered_token_hashes, unigram_stats, 1);

        auto const bigram_frequency = get_sorted_frequent_ngrams(bigram_stats);
        auto const unigram_frequency = get_sorted_frequent_ngrams(unigram_stats);

        printf("3. recomputed bigrams: got %zu bigrams in total\n", bigram_frequency.size());
        printf("3. recomputed unigrams: got %zu unigrams in total\n", unigram_frequency.size());
    }
}
}