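# NOTE: web file-viewer metadata captured together with the source; kept as
# comments so that the module remains valid Python.
#   40 lines | 1.5 KiB | Python
#   Raw Permalink Normal View History
#   2025-09-05 14:41:59 +08:00
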
from core.spacy_utils import init_nlp
import spacy
from spacy.language import Language
# Build the module-wide pipeline object through the project helper.
# NOTE(review): presumably a spaCy `Language` instance -- further down it is
# queried via `pipe_names`, extended via `add_pipe`, and called on raw text;
# confirm against core.spacy_utils.init_nlp.
nlp = init_nlp()
# ------------
# Define break words and patterns for sentence splitting
# ------------
# Words, phrases and punctuation marks after which a new segment should start.
#
# Every entry must contain at least one non-whitespace character: the empty
# string is a substring of every string, so an empty entry would make a
# substring test succeed for every single token and turn each token into its
# own "sentence".
#
# NOTE(review): an empty-string entry used to sit between ":" and "-". It may
# have been a dash character (e.g. an em dash) that was lost in transit;
# re-add the intended character if that is confirmed.
BREAK_WORDS = [
    # Conjunctions, relative pronouns and subordinators.
    " and ", ",", " who ", " which ", " that ", " because ", " so ",
    " but ", " or ", " however ", " therefore ", " thus ", " hence ",
    " although ", " though ", " since ", " as ", " if ", " unless ",
    " until ", " when ", " while ", " after ", " before ", " so that ",
    " in order to ", " whereas ", " despite ", " provided that ",
    # Punctuation and bracket characters.
    ";", ":", "-", "(", ")", "[", "]", "{", "}", "\"", "'",
]
def _split_break_entries(entries):
    """Normalise break entries into word phrases and punctuation marks.

    Args:
        entries: Iterable of raw break strings; surrounding whitespace is
            ignored and blank entries are dropped.

    Returns:
        A ``(phrases, marks)`` pair. ``phrases`` is a list of tuples of
        lower-cased words, longest phrase first, so that a multi-word phrase
        wins over a single word it starts with. ``marks`` is a list of
        punctuation strings containing no alphanumeric character.
    """
    phrases = []
    marks = []
    for entry in entries:
        cleaned = entry.strip()
        if not cleaned:
            # A blank entry is a substring of everything; never match on it.
            continue
        if any(ch.isalnum() for ch in cleaned):
            phrases.append(tuple(cleaned.lower().split()))
        else:
            marks.append(cleaned)
    phrases.sort(key=len, reverse=True)
    return phrases, marks


@Language.component("custom_boundaries")
def set_custom_boundaries(doc):
    """Mark the token following each break word or mark as a sentence start.

    Word entries of ``BREAK_WORDS`` are compared case-insensitively against
    whole tokens (multi-word entries against consecutive tokens). The padded
    entries such as ``" and "`` could never be found inside
    ``token.text_with_ws``, because that string only carries *trailing*
    whitespace. Punctuation entries keep the substring test, so a token such
    as ``"'s"`` still triggers on ``"'"``.

    Args:
        doc: The document being processed by the pipeline; it is modified in
            place by setting ``is_sent_start`` on tokens.

    Returns:
        The same ``doc`` object, as pipeline components must.
    """
    # Re-derived on every call so that runtime changes to BREAK_WORDS are seen.
    phrases, marks = _split_break_entries(BREAK_WORDS)
    total = len(doc)

    index = 0
    # The final token has no successor to mark, so stop one short of the end.
    while index < total - 1:
        matched = 0
        for phrase in phrases:
            stop = index + len(phrase)
            if stop <= total and all(
                doc[index + offset].lower_ == word
                for offset, word in enumerate(phrase)
            ):
                matched = len(phrase)
                break
        if not matched and any(mark in doc[index].text for mark in marks):
            matched = 1

        if not matched:
            index += 1
            continue

        following = index + matched
        if following < total:
            doc[following].is_sent_start = True
        # Resume after the matched span so a phrase is not re-split inside.
        index = following
    return doc
# Register the boundary component ahead of the parser, but only once, so that
# re-running this module against an already configured pipeline is harmless.
if "custom_boundaries" not in nlp.pipe_names:
    nlp.add_pipe("custom_boundaries", before="parser")

# Sample sentence used to demonstrate the custom segmentation.
text = "Environmental organizations and researchers say the wildfires blazing in the rainforest were set by cattle ranchers and loggers who want to clear and utilize the land,emboldened by the country's pro-business president."
doc = nlp(text)

# Collect the trimmed text of every segment, dropping segments that are empty
# after trimming.
trimmed_segments = (sent.text.strip() for sent in doc.sents)
sentences = [segment for segment in trimmed_segments if segment]

# Emit the segments as a 1-based numbered list.
for number, sentence in enumerate(sentences, start=1):
    print(f"{number}: {sentence}")