# frozen_string_literal: true

# Text chunking helpers: sentence-based, sliding-window (word-based) and
# paragraph-based.
#
# Sentence-based chunking needs the pragmatic_segmenter gem:
#   gem install pragmatic_segmenter
# The gem is loaded lazily inside sentence_based_chunking so that the other
# two strategies work without it.

# Sentence-Based Chunking
#
# Splits +text+ into sentences with PragmaticSegmenter and groups consecutive
# sentences into chunks, joining the sentences of a chunk with a single space.
#
# @param text [String, nil] text to split
# @param max_sentences_per_chunk [Integer] positive number of sentences per chunk
# @return [Array<String>] chunks; the last one may hold fewer sentences
# @raise [ArgumentError] if max_sentences_per_chunk is not a positive Integer
def sentence_based_chunking(text, max_sentences_per_chunk)
  unless max_sentences_per_chunk.is_a?(Integer) && max_sentences_per_chunk.positive?
    raise ArgumentError, 'max_sentences_per_chunk must be a positive integer'
  end
  # Nothing to segment; also avoids loading the gem for blank input.
  return [] if text.to_s.strip.empty?

  require 'pragmatic_segmenter' # deliberate lazy load, see file header

  sentences = PragmaticSegmenter::Segmenter.new(text: text).segment
  sentences.each_slice(max_sentences_per_chunk).map { |group| group.join(' ') }
end

# Sliding Window Chunking
#
# Splits +text+ on whitespace and emits windows of +chunk_size+ words, each
# window starting +chunk_size - overlap_size+ words after the previous one.
# Every word ends up in at least one chunk: the final window may be shorter
# than +chunk_size+, and a text shorter than +chunk_size+ yields one chunk.
#
# @param text [String] text to split
# @param chunk_size [Integer] positive number of words per window
# @param overlap_size [Integer] words shared by consecutive windows,
#   0 <= overlap_size < chunk_size
# @return [Array<String>] chunks, words joined by a single space
# @raise [ArgumentError] if chunk_size or overlap_size is out of range
def sliding_window_chunking(text, chunk_size, overlap_size)
  unless chunk_size.is_a?(Integer) && chunk_size.positive?
    raise ArgumentError, 'chunk_size must be a positive integer'
  end
  unless overlap_size.is_a?(Integer) && overlap_size >= 0 && overlap_size < chunk_size
    raise ArgumentError, 'overlap_size must be an integer in 0...chunk_size'
  end

  words = text.split
  return [] if words.empty?

  step_size = chunk_size - overlap_size
  chunks = []
  start = 0
  loop do
    chunks << words[start, chunk_size].join(' ')
    # Stop once a window has reached the last word; this keeps the tail
    # words that a fixed range of start offsets would skip.
    break if start + chunk_size >= words.size

    start += step_size
  end
  chunks
end

# Paragraph-Based Chunking
#
# Splits +text+ at blank lines. A blank line may use LF or CRLF line endings
# and may contain spaces or tabs. Each paragraph is stripped of surrounding
# whitespace and empty paragraphs are dropped.
#
# @param text [String] text to split
# @return [Array<String>] non-empty, stripped paragraphs
def paragraph_based_chunking(text)
  text.split(/(?:\r?\n[ \t]*){2,}/).map(&:strip).reject(&:empty?)
end

# Main function to choose the chunking method
#
# @param text [String] text to split
# @param method [String, Symbol] 'sentence', 'sliding' or 'paragraph'
# @param param1 [Integer, nil] sentences per chunk ('sentence') or
#   chunk size in words ('sliding'); unused for 'paragraph'
# @param param2 [Integer, nil] overlap size in words ('sliding' only)
# @return [Array<String>] chunks produced by the selected strategy
# @raise [RuntimeError] if +method+ is not one of the known names
def chunk_text(text, method, param1 = nil, param2 = nil)
  case method.to_s
  when 'sentence'
    sentence_based_chunking(text, param1)
  when 'sliding'
    sliding_window_chunking(text, param1, param2)
  when 'paragraph'
    paragraph_based_chunking(text)
  else
    raise "Unknown method: #{method}. Use 'sentence', 'sliding', or 'paragraph'."
  end
end

# Example usage (runs only when this file is executed directly)
if __FILE__ == $PROGRAM_NAME
  path = 'drinks.txt'
  # Read the file directly instead of shelling out to `cat`. A missing file
  # still results in empty text plus a message on stderr.
  text =
    if File.file?(path)
      File.read(path)
    else
      warn "#{path}: No such file"
      ''
    end

  method = 'sentence' # Choose 'sentence', 'sliding', or 'paragraph'
  param1 = 2 # Number of sentences for sentence-based, chunk size for sliding window
  param2 = 1 # Overlap size for sliding window (not used for other methods)

  chunks = chunk_text(text, method, param1, param2)
  # chunks.each { |chunk| puts chunk; puts "\n" }
  pp chunks
end