# NOTE(review): scraped file-listing header ("40 lines / 1.5 KiB / Python"),
# commented out so the module parses as Python.
# Third-party imports.
import spacy
from spacy.language import Language

# Project-local pipeline factory.
from core.spacy_utils import init_nlp

# Shared spaCy pipeline instance used by the rest of this script.
nlp = init_nlp()
# ------------
# Break words and patterns for sentence splitting: a token matching one of
# these entries forces a sentence boundary after it (see the
# "custom_boundaries" pipeline component).
# ------------

# Connectives and the comma that typically glue clauses together.
_CLAUSE_BREAKS = [
    " and ", ",", " who ", " which ", " that ", " because ", " so ",
    " but ", " or ", " however ", " therefore ", " thus ", " hence ",
    " although ", " though ", " since ", " as ", " if ", " unless ",
    " until ", " when ", " while ", " after ", " before ", " so that ",
    " in order to ", " whereas ", " despite ", " provided that ",
]

# Punctuation and bracketing characters that also act as boundaries.
_PUNCT_BREAKS = [";", ":", "—", "-", "(", ")", "[", "]", "{", "}", "\"", "'"]

# Public constant: same entries, same order as before.
BREAK_WORDS = _CLAUSE_BREAKS + _PUNCT_BREAKS
@Language.component("custom_boundaries")
def set_custom_boundaries(doc):
    """Mark extra sentence starts after tokens matching BREAK_WORDS.

    Scans every token except the last and, when the token matches a
    break entry, flags the following token as a sentence start so the
    downstream parser splits the clause there.

    Args:
        doc: The spaCy Doc being processed.

    Returns:
        The same Doc with additional ``is_sent_start`` flags set.
    """
    # Iterating doc[:-1] guarantees token.i + 1 is a valid index below.
    for token in doc[:-1]:
        for break_word in BREAK_WORDS:
            # BUG FIX: word entries are space-padded (" and "), but
            # token.text_with_ws never begins with a space, so the original
            # `break_word in token.text_with_ws` test could never match any
            # word entry — only the bare punctuation ones fired. Compare the
            # stripped entry against the token text instead, case-insensitively.
            # NOTE(review): multi-token entries like " so that " still cannot
            # match a single token; they would need phrase-level matching.
            if token.text.lower() == break_word.strip().lower():
                doc[token.i + 1].is_sent_start = True
                break
    return doc
|
||
|
|
# Add custom boundary detection to pipeline
|
||
|
|
if "custom_boundaries" not in nlp.pipe_names:
|
||
|
|
nlp.add_pipe("custom_boundaries", before="parser")
|
||
|
|
|
||
|
|
# Sample input: note the missing space after the comma ("land,emboldened"),
# kept verbatim to exercise the comma-based boundary.
text = "Environmental organizations and researchers say the wildfires blazing in the rainforest were set by cattle ranchers and loggers who want to clear and utilize the land,emboldened by the country's pro-business president."

# Run the pipeline (including the custom boundary component).
doc = nlp(text)

# Collect the sentence spans, dropping any that are pure whitespace.
sentences = []
for sent in doc.sents:
    cleaned = sent.text.strip()
    if cleaned:
        sentences.append(cleaned)

# Print each sentence on its own 1-based numbered line.
for idx, sentence in enumerate(sentences, start=1):
    print(f"{idx}: {sentence}")