import json
import re
import sys

import nltk
import pdfplumber
from nltk.tokenize import sent_tokenize

# Ensure the necessary NLTK data is available (download silently if missing)
nltk.download('punkt', quiet=True)


# Sentence-Based Chunking
def sentence_based_chunking(text, max_sentences_per_chunk, split_newlines=None):
    sentences = []
    if split_newlines is not None and split_newlines > 0:
        # Create a regex pattern for the specified number of newlines
        newline_pattern = r'(\n{%d,})' % split_newlines
        parts = re.split(newline_pattern, text)
        for part in parts:
            if re.match(newline_pattern, part):
                # Keep the newline run itself so chunk boundaries stay visible
                # (check before stripping, which would erase the newlines)
                sentences.append(part)
                continue
            part = part.strip()
            if part:  # Ignore empty parts
                sentences.extend(sent_tokenize(part))
    else:
        sentences = [sentence.strip() for sentence in sent_tokenize(text) if sentence.strip()]

    # Create chunks of up to max_sentences_per_chunk sentences each
    chunks = [' '.join(sentences[i:i + max_sentences_per_chunk])
              for i in range(0, len(sentences), max_sentences_per_chunk)]
    return chunks


# Sliding Window Chunking
def sliding_window_chunking(text, chunk_size=50, overlap_size=10):
    words = text.split()
    if not words:
        return []
    # Guard against a non-positive stride when overlap_size >= chunk_size
    step = max(chunk_size - overlap_size, 1)
    chunks = []
    for i in range(0, len(words), step):
        chunks.append(' '.join(words[i:i + chunk_size]))
        if i + chunk_size >= len(words):
            break  # The last window already reached the end of the text
    return chunks


# Paragraph-Based Chunking
def paragraph_based_chunking(text):
    paragraphs = text.split('\n\n')
    return [paragraph.strip() for paragraph in paragraphs if paragraph.strip()]


# Page-Based Chunking
def page_based_chunking(pages):
    return [page.strip() for page in pages if page.strip()]


# Main function to choose the chunking method
def chunk_text(text, method, param1=None, param2=None):
    if method == 'sentence':
        max_sentences_per_chunk = param1 if param1 is not None else 1
        split_newlines = param2 if param2 is not None else 0
        return sentence_based_chunking(text, max_sentences_per_chunk, split_newlines)
    elif method == 'sliding':
        chunk_size = param1 if param1 is not None else 50    # Default chunk size
        overlap_size = param2 if param2 is not None else 10  # Default overlap size
        return sliding_window_chunking(text, chunk_size, overlap_size)
    elif method == 'paragraph':
        return paragraph_based_chunking(text)
    elif method == 'page':
        return page_based_chunking(text)
    else:
        raise ValueError(f"Unknown method: {method}. Use 'sentence', 'sliding', 'paragraph', or 'page'.")


# Main script: extract text from the PDF and chunk it
if len(sys.argv) < 3:
    print("Usage: python extract_text.py <pdf_path> <method> [param1] [param2]")
    sys.exit(1)

pdf_path = sys.argv[1]
chunking_method = sys.argv[2]
param1 = int(sys.argv[3]) if len(sys.argv) > 3 else None  # Optional, method-specific
param2 = int(sys.argv[4]) if len(sys.argv) > 4 else None  # Optional, method-specific

try:
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() can return None for empty pages; normalize to ""
        all_text = [page.extract_text() or "" for page in pdf.pages]
except Exception as e:
    print(f"Error processing PDF: {e}")
    sys.exit(1)

if chunking_method == 'page':
    # Page chunking consumes the per-page list directly
    chunks = chunk_text(all_text, chunking_method)
else:
    full_text = "\n".join(filter(None, all_text))
    chunks = chunk_text(full_text, chunking_method, param1, param2)

# Print chunks as JSON
print(json.dumps(chunks, indent=2))
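
# Example invocations, showing how param1/param2 map onto each method's
# arguments (the file name "report.pdf" is hypothetical):
#   python extract_text.py report.pdf sentence 3 2   # 3 sentences per chunk, split on runs of 2+ newlines
#   python extract_text.py report.pdf sliding 50 10  # 50-word windows with a 10-word overlap
#   python extract_text.py report.pdf paragraph      # split on blank lines
#   python extract_text.py report.pdf page           # one chunk per PDF page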