# NOTE(review): scraped file-listing header ("40 lines / 1.5 KiB / Python"),
# commented out so the module parses as Python.
# Third-party imports.
import spacy
from spacy.language import Language

# Project-local pipeline factory.
from core.spacy_utils import init_nlp

# Shared spaCy pipeline instance used by the rest of this script.
nlp = init_nlp()
# ------------
# Break words and patterns for sentence splitting: a token matching one of
# these entries forces a sentence boundary after it (see the
# "custom_boundaries" pipeline component).
# ------------

# Connectives and the comma that typically glue clauses together.
_CLAUSE_BREAKS = [
    " and ", ",", " who ", " which ", " that ", " because ", " so ",
    " but ", " or ", " however ", " therefore ", " thus ", " hence ",
    " although ", " though ", " since ", " as ", " if ", " unless ",
    " until ", " when ", " while ", " after ", " before ", " so that ",
    " in order to ", " whereas ", " despite ", " provided that ",
]

# Punctuation and bracketing characters that also act as boundaries.
_PUNCT_BREAKS = [";", ":", "—", "-", "(", ")", "[", "]", "{", "}", "\"", "'"]

# Public constant: same entries, same order as before.
BREAK_WORDS = _CLAUSE_BREAKS + _PUNCT_BREAKS
@Language.component("custom_boundaries")
def set_custom_boundaries(doc):
    """Mark extra sentence starts after tokens matching BREAK_WORDS.

    Scans every token except the last and, when the token matches a
    break entry, flags the following token as a sentence start so the
    downstream parser splits the clause there.

    Args:
        doc: The spaCy Doc being processed.

    Returns:
        The same Doc with additional ``is_sent_start`` flags set.
    """
    # Iterating doc[:-1] guarantees token.i + 1 is a valid index below.
    for token in doc[:-1]:
        for break_word in BREAK_WORDS:
            # BUG FIX: word entries are space-padded (" and "), but
            # token.text_with_ws never begins with a space, so the original
            # `break_word in token.text_with_ws` test could never match any
            # word entry — only the bare punctuation ones fired. Compare the
            # stripped entry against the token text instead, case-insensitively.
            # NOTE(review): multi-token entries like " so that " still cannot
            # match a single token; they would need phrase-level matching.
            if token.text.lower() == break_word.strip().lower():
                doc[token.i + 1].is_sent_start = True
                break
    return doc
|
||
|
|
# Add custom boundary detection to pipeline
|
||
|
|
if "custom_boundaries" not in nlp.pipe_names:
|
||
|
|
nlp.add_pipe("custom_boundaries", before="parser")
|
||
|
|
|
||
|
|
# Sample input: note the missing space after the comma ("land,emboldened"),
# kept verbatim to exercise the comma-based boundary.
text = "Environmental organizations and researchers say the wildfires blazing in the rainforest were set by cattle ranchers and loggers who want to clear and utilize the land,emboldened by the country's pro-business president."

# Run the pipeline (including the custom boundary component).
doc = nlp(text)

# Collect the sentence spans, dropping any that are pure whitespace.
sentences = []
for sent in doc.sents:
    cleaned = sent.text.strip()
    if cleaned:
        sentences.append(cleaned)

# Print each sentence on its own 1-based numbered line.
for idx, sentence in enumerate(sentences, start=1):
    print(f"{idx}: {sentence}")