import json
import sys
import os
import pdfplumber
from docx import Document
from bs4 import BeautifulSoup
import markdown
from striprtf.striprtf import rtf_to_text
from openpyxl import load_workbook
from pptx import Presentation
from nltk.tokenize import sent_tokenize
import nltk
import magic
import re

# Download the NLTK sentence-tokenizer data silently if it is missing
nltk.download('punkt', quiet=True)
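# Newer NLTK releases (3.8.2+) look up 'punkt_tab' instead of 'punkt';
# fetching both keeps sent_tokenize working across versions
nltk.download('punkt_tab', quiet=True)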

# Sentence-Based Chunking
def sentence_based_chunking(text, max_sentences_per_chunk, split_newlines=None):
    sentences = []

    if split_newlines is not None and split_newlines > 0:
        # Split on runs of at least `split_newlines` consecutive newlines
        newline_pattern = r'\n{%d,}' % split_newlines
        parts = re.split(newline_pattern, text)

        for part in parts:
            part = part.strip()
            if part:  # skip the empty strings left where the newline runs were
                sentences.extend(sent_tokenize(part))
    else:
        sentences = [sentence.strip() for sentence in sent_tokenize(text) if sentence.strip()]

    # Create chunks of sentences
    chunks = [' '.join(sentences[i:i + max_sentences_per_chunk]) for i in range(0, len(sentences), max_sentences_per_chunk)]
    return chunks
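# Illustrative example: sentence_based_chunking("One. Two. Three.", 2) -> ["One. Two.", "Three."]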

# Sliding Window Chunking
def sliding_window_chunking(text, chunk_size=50, overlap_size=10):
    words = text.split()
    if not words:
        return []
    step = max(chunk_size - overlap_size, 1)  # guard against a zero or negative step
    # Cover every word, including a final window shorter than chunk_size
    return [' '.join(words[i:i + chunk_size]) for i in range(0, max(len(words) - overlap_size, 1), step)]
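# Illustrative example: sliding_window_chunking("one two three four five six", chunk_size=4, overlap_size=2)
# -> ["one two three four", "three four five six"]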

# Paragraph-Based Chunking
def paragraph_based_chunking(text):
    paragraphs = text.split('\n\n')
    chunks = [paragraph.strip() for paragraph in paragraphs if paragraph.strip()]
    return chunks
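# Illustrative example: paragraph_based_chunking("First para.\n\nSecond para.") -> ["First para.", "Second para."]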

# Page-Based Chunking
def page_based_chunking(pages):
    return [page.strip() for page in pages if page.strip()]
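# Illustrative example: page_based_chunking(["Page one", "  ", "Page two"]) -> ["Page one", "Page two"]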

# Main function to choose the chunking method
def chunk_text(text, method, param1=None, param2=None):
    # The sentence, sliding, and paragraph methods operate on a single string,
    # so join list input (e.g. PDF pages) up front instead of in every branch
    if method in ('sentence', 'sliding', 'paragraph') and isinstance(text, list):
        text = "\n".join(text)

    if method == 'sentence':
        max_sentences_per_chunk = param1 if param1 is not None else 1
        split_newlines = param2 if param2 is not None else 0
        return sentence_based_chunking(text, max_sentences_per_chunk, split_newlines)
    elif method == 'sliding':
        chunk_size = param1 if param1 is not None else 50  # Default chunk size
        overlap_size = param2 if param2 is not None else 10  # Default overlap size
        return sliding_window_chunking(text, chunk_size, overlap_size)
    elif method == 'paragraph':
        return paragraph_based_chunking(text)
    elif method == 'page':
        if not isinstance(text, list):
            text = text.split('\n\n')  # Treat blank-line-separated blocks as pages
        return page_based_chunking(text)
    else:
        raise ValueError(f"Unknown method: {method}. Use 'sentence', 'sliding', 'paragraph', or 'page'.")

# Function to extract text from different file types
def extract_text_from_file(file_path):
    mime = magic.Magic(mime=True)
    file_type = mime.from_file(file_path)

    if 'pdf' in file_type:
        try:
            with pdfplumber.open(file_path) as pdf:
                pages = []
                for page in pdf.pages:
                    text = page.extract_text()
                    if text:
                        # Remove page number from the beginning of the text
                        text = re.sub(r'^\d+\.\s*', '', text.strip())
                        pages.append(text)
                return pages
        except Exception as e:
            return json.dumps({"error": f"Error processing PDF: {e}"})
    
    elif 'vnd.openxmlformats-officedocument.wordprocessingml.document' in file_type:
        try:
            doc = Document(file_path)
            return [para.text for para in doc.paragraphs if para.text.strip()]
        except Exception as e:
            return json.dumps({"error": f"Error processing DOCX: {e}"})
    elif 'html' in file_type:
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                soup = BeautifulSoup(file, 'html.parser')
                return soup.get_text(separator='\n')
        except Exception as e:
            return json.dumps({"error": f"Error processing HTML: {e}"})
    elif 'markdown' in file_type or file_path.endswith('.md'):
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                html = markdown.markdown(file.read())
                soup = BeautifulSoup(html, 'html.parser')
                return soup.get_text(separator='\n')
        except Exception as e:
            return json.dumps({"error": f"Error processing Markdown: {e}"})
    elif 'rtf' in file_type:
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                return rtf_to_text(file.read())
        except Exception as e:
            return json.dumps({"error": f"Error processing RTF: {e}"})
    elif 'plain' in file_type or 'text' in file_type:
        # Checked after the other text/* branches so that text/html, text/markdown,
        # and text/rtf are not swallowed by this generic fallback
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                return file.read()
        except Exception as e:
            return json.dumps({"error": f"Error processing TXT: {e}"})
    elif 'vnd.openxmlformats-officedocument.spreadsheetml.sheet' in file_type:
        try:
            wb = load_workbook(file_path)
            sheets_text = []
            for sheet in wb.worksheets:
                for row in sheet.iter_rows(values_only=True):
                    row_text = ' '.join([str(cell) for cell in row if cell is not None])
                    sheets_text.append(row_text)
            return "\n".join(sheets_text)
        except Exception as e:
            return json.dumps({"error": f"Error processing Excel: {e}"})
    elif 'vnd.openxmlformats-officedocument.presentationml.presentation' in file_type or file_path.endswith('.pptx'):
        if not os.path.exists(file_path):
            return json.dumps({"error": f"File not found: {file_path}"})
        try:
            prs = Presentation(file_path)
            slides_text = []
            for slide in prs.slides:
                slide_text = []
                for shape in slide.shapes:
                    if hasattr(shape, "text"):
                        slide_text.append(shape.text)
                slides_text.append("\n".join(slide_text))
            return slides_text
        except Exception as e:
            return json.dumps({"error": f"Error processing PowerPoint: {e}"})
    else:
        return json.dumps({"error": f"Unsupported file type: {file_type}. Please provide a supported file."})

# Main script: extract text from a file and chunk it
if len(sys.argv) < 3:
    print(json.dumps({"error": "Usage: python extract_text.py <path_to_file> <chunking_method> [<param1> <param2>]"}))
    sys.exit(1)

file_path = sys.argv[1]
chunking_method = sys.argv[2]
param1 = int(sys.argv[3]) if len(sys.argv) > 3 else None  # Optional first chunking parameter
param2 = int(sys.argv[4]) if len(sys.argv) > 4 else None  # Optional second chunking parameter

try:
    full_text = extract_text_from_file(file_path)

    if isinstance(full_text, str) and full_text.startswith('{"error"'):
        print(full_text)
        sys.exit(1)
    chunks = chunk_text(full_text, chunking_method, param1, param2)
    print(json.dumps(chunks, indent=2))
except Exception as e:
    print(json.dumps({"error": str(e)}))
    sys.exit(1)