import json
import re
import sys

import nltk
import pdfplumber
from nltk.tokenize import sent_tokenize

# Ensure the necessary NLTK data is available (download silently if missing)
nltk.download('punkt', quiet=True)


# Sentence-Based Chunking
def sentence_based_chunking(text, max_sentences_per_chunk, split_newlines=None):
    sentences = []
    if split_newlines is not None and split_newlines > 0:
        # Create a regex pattern for the specified number of newlines
        newline_pattern = r'(\n{%d,})' % split_newlines
        parts = re.split(newline_pattern, text)
        for part in parts:
            if re.match(newline_pattern, part):
                # Keep the newline run itself so chunk boundaries stay visible
                # (check before stripping, which would erase the newlines)
                sentences.append(part)
                continue
            part = part.strip()
            if part:  # Ignore empty parts
                sentences.extend(sent_tokenize(part))
    else:
        sentences = [sentence.strip() for sentence in sent_tokenize(text) if sentence.strip()]

    # Create chunks of up to max_sentences_per_chunk sentences each
    chunks = [' '.join(sentences[i:i + max_sentences_per_chunk])
              for i in range(0, len(sentences), max_sentences_per_chunk)]
    return chunks


# Sliding Window Chunking
def sliding_window_chunking(text, chunk_size=50, overlap_size=10):
    words = text.split()
    if not words:
        return []
    # Guard against a non-positive stride when overlap_size >= chunk_size
    step = max(chunk_size - overlap_size, 1)
    chunks = []
    for i in range(0, len(words), step):
        chunks.append(' '.join(words[i:i + chunk_size]))
        if i + chunk_size >= len(words):
            break  # The last window already reached the end of the text
    return chunks


# Paragraph-Based Chunking
def paragraph_based_chunking(text):
    paragraphs = text.split('\n\n')
    return [paragraph.strip() for paragraph in paragraphs if paragraph.strip()]


# Page-Based Chunking
def page_based_chunking(pages):
    return [page.strip() for page in pages if page.strip()]


# Main function to choose the chunking method
def chunk_text(text, method, param1=None, param2=None):
    if method == 'sentence':
        max_sentences_per_chunk = param1 if param1 is not None else 1
        split_newlines = param2 if param2 is not None else 0
        return sentence_based_chunking(text, max_sentences_per_chunk, split_newlines)
    elif method == 'sliding':
        chunk_size = param1 if param1 is not None else 50    # Default chunk size
        overlap_size = param2 if param2 is not None else 10  # Default overlap size
        return sliding_window_chunking(text, chunk_size, overlap_size)
    elif method == 'paragraph':
        return paragraph_based_chunking(text)
    elif method == 'page':
        return page_based_chunking(text)
    else:
        raise ValueError(f"Unknown method: {method}. Use 'sentence', 'sliding', 'paragraph', or 'page'.")


# Main script: extract text from the PDF and chunk it
if len(sys.argv) < 3:
    print("Usage: python extract_text.py <pdf_path> <method> [param1] [param2]")
    sys.exit(1)

pdf_path = sys.argv[1]
chunking_method = sys.argv[2]
param1 = int(sys.argv[3]) if len(sys.argv) > 3 else None  # Optional, method-specific
param2 = int(sys.argv[4]) if len(sys.argv) > 4 else None  # Optional, method-specific

try:
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() can return None for empty pages; normalize to ""
        all_text = [page.extract_text() or "" for page in pdf.pages]
except Exception as e:
    print(f"Error processing PDF: {e}")
    sys.exit(1)

if chunking_method == 'page':
    # Page chunking consumes the per-page list directly
    chunks = chunk_text(all_text, chunking_method)
else:
    full_text = "\n".join(filter(None, all_text))
    chunks = chunk_text(full_text, chunking_method, param1, param2)

# Print chunks as JSON
print(json.dumps(chunks, indent=2))
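
# Example invocations, showing how param1/param2 map onto each method's
# arguments (the file name "report.pdf" is hypothetical):
#   python extract_text.py report.pdf sentence 3 2   # 3 sentences per chunk, split on runs of 2+ newlines
#   python extract_text.py report.pdf sliding 50 10  # 50-word windows with a 10-word overlap
#   python extract_text.py report.pdf paragraph      # split on blank lines
#   python extract_text.py report.pdf page           # one chunk per PDF page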