from core.spacy_utils import init_nlp
import spacy
from spacy.language import Language

nlp = init_nlp()

# ------------
# Break words and patterns for sentence splitting.
# Word entries are written with surrounding spaces; punctuation entries are bare.
# ------------
BREAK_WORDS = [
    " and ", ",", " who ", " which ", " that ", " because ", " so ", " but ",
    " or ", " however ", " therefore ", " thus ", " hence ", " although ",
    " though ", " since ", " as ", " if ", " unless ", " until ", " when ",
    " while ", " after ", " before ", " so that ", " in order to ",
    " whereas ", " despite ", " provided that ",
    ";", ":", "—", "-", "(", ")", "[", "]", "{", "}", "\"", "'",
]

# Single-word connectives, stripped and lowercased, for whole-token comparison.
# NOTE: multi-word phrases ("so that", "in order to", "provided that") span
# several spaCy tokens and therefore cannot match a single token; they are
# intentionally excluded here (they could never match in the old substring
# check either).
_WORD_BREAKS = {w.strip().lower() for w in BREAK_WORDS if w.strip().isalpha()}

# Punctuation markers (entries containing no alphabetic characters),
# matched by substring so hyphenated tokens like "pro-business" still trigger.
_PUNCT_BREAKS = {w for w in BREAK_WORDS if not any(c.isalpha() for c in w)}


@Language.component("custom_boundaries")
def set_custom_boundaries(doc):
    """Mark the token after each connective word / clause punctuation mark
    as a sentence start, producing finer-grained segments than the default.

    Registered before the parser so the preset boundaries are respected.
    """
    for token in doc[:-1]:
        # BUGFIX: the original test `" and " in token.text_with_ws` never
        # fired for word connectives — Token.text_with_ws carries only the
        # token's *trailing* whitespace, so a leading-space pattern cannot
        # occur in it. Compare the bare token text instead.
        text = token.text
        if text.lower() in _WORD_BREAKS or any(p in text for p in _PUNCT_BREAKS):
            doc[token.i + 1].is_sent_start = True
    return doc


# Add custom boundary detection to the pipeline (before the parser, which is
# required for manually set sentence starts to be honored).
if "custom_boundaries" not in nlp.pipe_names:
    nlp.add_pipe("custom_boundaries", before="parser")

text = "Environmental organizations and researchers say the wildfires blazing in the rainforest were set by cattle ranchers and loggers who want to clear and utilize the land,emboldened by the country's pro-business president."
doc = nlp(text)

# Extract sentences with custom boundaries, dropping empty fragments.
sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]

# Print all sentences directly.
for i, sentence in enumerate(sentences):
    print(f"{i+1}: {sentence}")