module Memo::Vocab

Overview

Vocabulary extraction and word-level semantic search

Extended Modules

Defined in:

memo/vocab.cr

Constant Summary

MAX_WORD_LENGTH = 30

Maximum word length to include

MIN_WORD_LENGTH = 3

Minimum word length to include

STOPWORDS = Set {"a", "an", "and", "are", "as", "at", "be", "by", "for", "from", "has", "he", "in", "is", "it", "its", "of", "on", "that", "the", "to", "was", "were", "will", "with", "the", "this", "but", "they", "have", "had", "what", "when", "where", "who", "which", "why", "how", "all", "each", "every", "both", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "can", "just", "should", "now", "also", "been", "being", "do", "does", "did", "doing", "would", "could", "might", "must", "shall", "about", "above", "after", "again", "against", "any", "because", "before", "below", "between", "during", "into", "through", "under", "until", "up", "down", "out", "off", "over", "then", "once", "here", "there", "these", "those", "am", "if", "or", "while", "your", "you", "we", "our", "my", "me", "him", "her", "his", "them", "their", "she", "i", "us"}

Common English stopwords to filter out

Instance Method Summary

Instance Method Detail

def clear(db : DB::Database, service_id : Int64) #

Clear all vocabulary for a service


[View source]
def count(db : DB::Database, service_id : Int64) : Int64 #

Get vocabulary count for a service


[View source]
def extract_terms(text : String) : Array(WordFrequency) #

Extract unique terms from text with frequency counts

Tokenizes the text, normalizes words, and filters out:

  • Stopwords
  • Words shorter than MIN_WORD_LENGTH
  • Words longer than MAX_WORD_LENGTH
  • Numbers-only tokens

Returns an array of WordFrequency sorted by count (descending).


[View source]
def extract_terms_batch(texts : Array(String)) : Array(WordFrequency) #

Extract terms from multiple texts, combining frequencies


[View source]
def get_existing_words(db : DB::Database, words : Array(String), service_id : Int64) : Set(String) #

Get existing words from vocab for a service

Returns the set of words that already have embeddings.


[View source]
def search(db : DB::Database, query_embedding : Array(Float64), service_id : Int64, limit : Int32 = 10, min_score : Float64 = 0.5) : Array(Result) #

Search vocabulary for similar words

Compares the query embedding against the stored word embeddings. Returns results ranked by cosine similarity.


[View source]
def store_batch(db : DB::Database, words : Array(String), embeddings : Array(Array(Float64)), frequencies : Array(Int32), service_id : Int64) #

Store a batch of word embeddings

Words should be lowercase and already filtered. Embeddings are stored with frequency counts.


[View source]
def store_word(db : DB::Database, word : String, embedding : Array(Float64), frequency : Int32, service_id : Int64) #

Store a single word with embedding


[View source]
def update_frequencies(db : DB::Database, word_freqs : Array(WordFrequency), service_id : Int64) #

Update frequencies for existing words (each word's stored frequency is incremented by its count)


[View source]