from core.spacy_utils import init_nlp
import spacy
from spacy.language import Language

nlp = init_nlp()

# ------------
# Break words and patterns for sentence splitting.
# Word entries are written with surrounding spaces; punctuation entries are bare.
# ------------
BREAK_WORDS = [
    " and ", ",", " who ", " which ", " that ", " because ", " so ", " but ",
    " or ", " however ", " therefore ", " thus ", " hence ", " although ",
    " though ", " since ", " as ", " if ", " unless ", " until ", " when ",
    " while ", " after ", " before ", " so that ", " in order to ",
    " whereas ", " despite ", " provided that ",
    ";", ":", "—", "-", "(", ")", "[", "]", "{", "}", "\"", "'",
]

# Single-word connectives, stripped and lowercased, for whole-token comparison.
# NOTE: multi-word phrases ("so that", "in order to", "provided that") span
# several spaCy tokens and therefore cannot match a single token; they are
# intentionally excluded here (they could never match in the old substring
# check either).
_WORD_BREAKS = {w.strip().lower() for w in BREAK_WORDS if w.strip().isalpha()}

# Punctuation markers (entries containing no alphabetic characters),
# matched by substring so hyphenated tokens like "pro-business" still trigger.
_PUNCT_BREAKS = {w for w in BREAK_WORDS if not any(c.isalpha() for c in w)}


@Language.component("custom_boundaries")
def set_custom_boundaries(doc):
    """Mark the token after each connective word / clause punctuation mark
    as a sentence start, producing finer-grained segments than the default.

    Registered before the parser so the preset boundaries are respected.
    """
    for token in doc[:-1]:
        # BUGFIX: the original test `" and " in token.text_with_ws` never
        # fired for word connectives — Token.text_with_ws carries only the
        # token's *trailing* whitespace, so a leading-space pattern cannot
        # occur in it. Compare the bare token text instead.
        text = token.text
        if text.lower() in _WORD_BREAKS or any(p in text for p in _PUNCT_BREAKS):
            doc[token.i + 1].is_sent_start = True
    return doc


# Add custom boundary detection to the pipeline (before the parser, which is
# required for manually set sentence starts to be honored).
if "custom_boundaries" not in nlp.pipe_names:
    nlp.add_pipe("custom_boundaries", before="parser")

text = "Environmental organizations and researchers say the wildfires blazing in the rainforest were set by cattle ranchers and loggers who want to clear and utilize the land,emboldened by the country's pro-business president."
doc = nlp(text)

# Extract sentences with custom boundaries, dropping empty fragments.
sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]

# Print all sentences directly.
for i, sentence in enumerate(sentences):
    print(f"{i+1}: {sentence}")