module Memo::Vocab

Overview

Vocabulary extraction and word-level semantic search

Extended Modules

Memo::Vocab

Defined in:

memo/vocab.cr

Constant Summary

MAX_WORD_LENGTH = 30: Maximum word length to include
MIN_WORD_LENGTH = 3: Minimum word length to include
STOPWORDS = Set {"a", "an", "and", "are", "as", "at", "be", "by", "for", "from", "has", "he", "in", "is", "it", "its", "of", "on", "that", "the", "to", "was", "were", "will", "with", "the", "this", "but", "they", "have", "had", "what", "when", "where", "who", "which", "why", "how", "all", "each", "every", "both", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "can", "just", "should", "now", "also", "been", "being", "do", "does", "did", "doing", "would", "could", "might", "must", "shall", "about", "above", "after", "again", "against", "any", "because", "before", "below", "between", "during", "into", "through", "under", "until", "up", "down", "out", "off", "over", "then", "once", "here", "there", "these", "those", "am", "if", "or", "while", "your", "you", "we", "our", "my", "me", "him", "her", "his", "them", "their", "she", "i", "us"}: Common English stopwords to filter out

Instance Method Summary

#clear(db : DB::Database, service_id : Int64)
Clear all vocabulary for a service
#count(db : DB::Database, service_id : Int64) : Int64
Get vocabulary count for a service
#extract_terms(text : String) : Array(WordFrequency)
Extract unique terms from text with frequency counts
#extract_terms_batch(texts : Array(String)) : Array(WordFrequency)
Extract terms from multiple texts, combining frequencies
#get_existing_words(db : DB::Database, words : Array(String), service_id : Int64) : Set(String)
Get existing words from vocab for a service
#search(db : DB::Database, query_embedding : Array(Float64), service_id : Int64, limit : Int32 = 10, min_score : Float64 = 0.5) : Array(Result)
Search vocabulary for similar words
#store_batch(db : DB::Database, words : Array(String), embeddings : Array(Array(Float64)), frequencies : Array(Int32), service_id : Int64)
Store a batch of word embeddings
#store_word(db : DB::Database, word : String, embedding : Array(Float64), frequency : Int32, service_id : Int64)
Store a single word with embedding
#update_frequencies(db : DB::Database, word_freqs : Array(WordFrequency), service_id : Int64)
Update frequencies for existing words (increment by count)

Instance Method Detail

def clear(db : DB::Database, service_id : Int64) #

Clear all vocabulary for a service

[View source]

def count(db : DB::Database, service_id : Int64) : Int64 #

Get vocabulary count for a service

[View source]

def extract_terms(text : String) : Array(WordFrequency) #

Extract unique terms from text with frequency counts

Tokenizes text, normalizes words, and filters out:

- Stopwords
- Words shorter than MIN_WORD_LENGTH
- Words longer than MAX_WORD_LENGTH
- Numbers-only tokens

Returns an array of WordFrequency sorted by count (descending).

[View source]

def extract_terms_batch(texts : Array(String)) : Array(WordFrequency) #

Extract terms from multiple texts, combining frequencies

[View source]

def get_existing_words(db : DB::Database, words : Array(String), service_id : Int64) : Set(String) #

Get existing words from vocab for a service

Returns the set of words that already have embeddings

[View source]

def search(db : DB::Database, query_embedding : Array(Float64), service_id : Int64, limit : Int32 = 10, min_score : Float64 = 0.5) : Array(Result) #

Search vocabulary for similar words

Compares query embedding against stored word embeddings. Returns results ranked by cosine similarity.

[View source]

def store_batch(db : DB::Database, words : Array(String), embeddings : Array(Array(Float64)), frequencies : Array(Int32), service_id : Int64) #

Store a batch of word embeddings

Words should be lowercase and already filtered. Embeddings are stored with frequency counts.

[View source]

def store_word(db : DB::Database, word : String, embedding : Array(Float64), frequency : Int32, service_id : Int64) #

Store a single word with embedding

[View source]

def update_frequencies(db : DB::Database, word_freqs : Array(WordFrequency), service_id : Int64) #

Update frequencies for existing words (increment by count)

[View source]

CrystalDoc.info

memo

module Memo::Vocab

Overview

Extended Modules

Defined in:

Constant Summary

Instance Method Summary

Instance Method Detail