Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
75 commits
Select commit Hold shift + click to select a range
53dd3bb
Merge branch 'master' into integrateCPPJieba
CrazySteve0605 Aug 9, 2025
eba63ab
Merge branch 'master' into integrateCPPJieba
CrazySteve0605 Aug 15, 2025
b0ac081
add `WordSegment` module
CrazySteve0605 Aug 18, 2025
9f62f04
update `textUtils/__init__.py`
CrazySteve0605 Aug 18, 2025
81f2040
update `textInfos/offsets.py`
CrazySteve0605 Aug 18, 2025
da64cd8
update `displayModel.py`
CrazySteve0605 Aug 18, 2025
557f404
Pre-commit auto-fix
pre-commit-ci[bot] Aug 18, 2025
f72d348
update type annotations
CrazySteve0605 Aug 18, 2025
19cad8a
Merge branch 'master' into integrateCPPJieba
CrazySteve0605 Aug 20, 2025
adc22fb
add wrapper for word manager
CrazySteve0605 Aug 20, 2025
4adac07
update the word segmentation structure
CrazySteve0605 Aug 20, 2025
407d4b2
Merge branch 'integrateCPPJieba' into wordNavigationForChineseText
CrazySteve0605 Aug 20, 2025
0d40f0a
Pre-commit auto-fix
pre-commit-ci[bot] Aug 20, 2025
676fc42
add copyright header
CrazySteve0605 Aug 21, 2025
ddd48e8
add type annotations
CrazySteve0605 Aug 21, 2025
3c65868
update log
CrazySteve0605 Aug 21, 2025
d69e8b7
add trailing commas in multi-line constructs
CrazySteve0605 Aug 21, 2025
8244a76
make wordSegment module to make file structure clearer
CrazySteve0605 Aug 21, 2025
3f54d62
add initialization logic to wordSeg module
CrazySteve0605 Aug 21, 2025
38b4bea
Pre-commit auto-fix
pre-commit-ci[bot] Aug 21, 2025
eeb96aa
use multithreading for cppjieba's initialization
CrazySteve0605 Aug 23, 2025
3ba56f0
add configuration for word navigation
CrazySteve0605 Aug 23, 2025
356c11c
Pre-commit auto-fix
pre-commit-ci[bot] Aug 23, 2025
4a680ea
make "Auto" the default option for word navigation
CrazySteve0605 Aug 24, 2025
97b6db7
update for pyright checks
CrazySteve0605 Aug 24, 2025
a4edc9e
Merge branch 'integrateCPPJieba' into wordNavigationForChineseText
CrazySteve0605 Aug 28, 2025
3b2d835
resolve deprecation
CrazySteve0605 Aug 28, 2025
9e6a2e1
Pre-commit auto-fix
pre-commit-ci[bot] Aug 28, 2025
c1fb4b8
Merge branch 'integrateCPPJieba' into wordNavigationForChineseText
CrazySteve0605 Sep 4, 2025
a1113d8
add `segmentedText` method
CrazySteve0605 Sep 4, 2025
abeb147
Merge branch 'integrateCPPJieba' into wordNavigationForChineseText
CrazySteve0605 Sep 7, 2025
b848e1b
update `wordSegStrategy.py`
CrazySteve0605 Sep 7, 2025
3bfbe59
update module importing order and type annotations
CrazySteve0605 Sep 7, 2025
f5087cc
Merge branch 'integrateCPPJieba' into wordNavigationForChineseText
CrazySteve0605 Sep 9, 2025
3a0badc
update `wordSegStrategy.py`
CrazySteve0605 Sep 9, 2025
cf3e115
Pre-commit auto-fix
pre-commit-ci[bot] Sep 9, 2025
984b6eb
Merge branch 'master' into integrateCPPJieba
CrazySteve0605 Sep 12, 2025
2b1d4b3
Merge branch 'integrateCPPJieba' into wordNavigationForChineseText
CrazySteve0605 Sep 12, 2025
97eb6dd
handle punctuation spacing
CrazySteve0605 Sep 13, 2025
bac3210
Pre-commit auto-fix
pre-commit-ci[bot] Sep 13, 2025
a8955a3
Revert "update module importing order and type annotations"
CrazySteve0605 Sep 21, 2025
7ee08d0
Merge branch 'integrateCPPJieba' into wordNavigationForChineseText
CrazySteve0605 Sep 21, 2025
90660ba
update `wordSegStrategy.py`
CrazySteve0605 Sep 21, 2025
9537999
revert copyright header of `configSpec.py`
CrazySteve0605 Sep 21, 2025
dc23346
Update source/core.py
CrazySteve0605 Sep 21, 2025
5562e70
Merge branch 'master' into integrateCPPJieba
CrazySteve0605 Sep 25, 2025
38ec7ff
Merge branch 'integrateCPPJieba' into wordNavigationForChineseText
CrazySteve0605 Sep 25, 2025
ccf07f9
correct method naming
CrazySteve0605 Sep 25, 2025
250e700
update UI text for Uniscribe
CrazySteve0605 Sep 25, 2025
53b3870
make `cppjieba` only available when NVDA's language is set to Chinese
CrazySteve0605 Sep 25, 2025
fec70a9
Merge branch 'master' into integrateCPPJieba
CrazySteve0605 Sep 26, 2025
69617c4
Merge branch 'integrateCPPJieba' into wordNavigationForChineseText
CrazySteve0605 Sep 26, 2025
111a24d
update `wordSegSegmenter.py` to handle offsets at the end of the string
CrazySteve0605 Sep 26, 2025
43bfe03
make initialization of word segmenters conditional on language
CrazySteve0605 Sep 27, 2025
2eec029
add unittest cases for `WordSegmenter`
CrazySteve0605 Sep 27, 2025
f769457
Pre-commit auto-fix
pre-commit-ci[bot] Sep 27, 2025
9479029
fixup
CrazySteve0605 Sep 27, 2025
9834b68
extract punctuation from `wordSegStrategy.py` to `wordSegUtils.py`
CrazySteve0605 Sep 27, 2025
b69d466
fix up
CrazySteve0605 Sep 27, 2025
6f586fd
update changelog
CrazySteve0605 Sep 27, 2025
face4bd
Merge branch 'try-chineseWordSegmentation-staging' into wordNavigatio…
michaelDCurran Sep 29, 2025
b40d709
revert `Initialize Word Segmenters for Unused Languages:` checkbox an…
CrazySteve0605 Sep 29, 2025
653e808
Pre-commit auto-fix
pre-commit-ci[bot] Sep 29, 2025
552b42b
fixup unittests
CrazySteve0605 Sep 30, 2025
5e0e3fd
simplify the logic for 'Auto' option in Word Segmentation Standard se…
CrazySteve0605 Sep 30, 2025
c3a8562
Pre-commit auto-fix
pre-commit-ci[bot] Sep 30, 2025
0940a73
fixup
CrazySteve0605 Sep 30, 2025
80b0472
Pre-commit auto-fix
pre-commit-ci[bot] Sep 30, 2025
085ba2f
Merge branch 'try-chineseWordSegmentation-staging' into wordNavigatio…
michaelDCurran Oct 9, 2025
d55d077
Pre-commit auto-fix
pre-commit-ci[bot] Oct 9, 2025
2083095
fixup
CrazySteve0605 Oct 25, 2025
d32549f
make word segmentation module reinitialized after settings are saved
CrazySteve0605 Oct 25, 2025
b8ace76
Pre-commit auto-fix
pre-commit-ci[bot] Oct 25, 2025
042b778
Merge branch 'try-chineseWordSegmentation-staging' into wordNavigatio…
michaelDCurran Oct 27, 2025
db90fff
remove duplicate importing lines
CrazySteve0605 Oct 28, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions nvdaHelper/cppjieba/sconscript
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,11 @@ sourceFiles = [
"cppjieba.def",
]

env.AppendUnique(
CCFLAGS=['/wd4819'],
CXXFLAGS=['/wd4819'],
)

cppjiebaLib = env.SharedLibrary(target="cppjieba", source=sourceFiles)

if not os.path.exists(outDir.Dir("dicts").get_abspath()) or not os.listdir(outDir.Dir("dicts").get_abspath()): # ensure dicts installation happens only once and avoid an SCons warning
Expand Down
14 changes: 11 additions & 3 deletions source/NVDAObjects/window/edit.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# A part of NonVisual Desktop Access (NVDA)
# Copyright (C) 2006-2025 NV Access Limited, Babbage B.V., Cyrille Bougot, Leonard de Ruijter
# This file is covered by the GNU General Public License.
# See the file COPYING for more details.
# Copyright (C) 2006-2025 NV Access Limited, Babbage B.V., Cyrille Bougot, Leonard de Ruijter, Wang Chong
# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt

from typing import (
Dict,
Expand Down Expand Up @@ -32,6 +32,7 @@
import watchdog
import locationHelper
import textUtils
from textUtils.segFlag import CharSegFlag, WordSegFlag
import NVDAHelper.localLib


Expand Down Expand Up @@ -169,6 +170,13 @@ class getTextLengthExStruct(ctypes.Structure):


class EditTextInfo(textInfos.offsets.OffsetsTextInfo):
# Override segFlags to enforce use of Uniscribe
charSegFlag = CharSegFlag.UNISCRIBE

@property
def wordSegFlag(self) -> WordSegFlag:
    """Word segmentation method for this text info class.

    Always returns ``WordSegFlag.UNISCRIBE``; no configuration is consulted here.
    """
    return WordSegFlag.UNISCRIBE

def _getPointFromOffset(self, offset):
if self.obj.editAPIVersion == 1 or self.obj.editAPIVersion >= 3:
processHandle = self.obj.processHandle
Expand Down
5 changes: 4 additions & 1 deletion source/config/configSpec.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# A part of NonVisual Desktop Access (NVDA)
# Copyright (C) 2006-2025 NV Access Limited, Babbage B.V., Davy Kager, Bill Dengler, Julien Cochuyt,
# Joseph Lee, Dawid Pieper, mltony, Bram Duvigneau, Cyrille Bougot, Rob Meredith,
# Burman's Computer and Education Ltd., Leonard de Ruijter, Łukasz Golonka, Cary-rowen
# Burman's Computer and Education Ltd., Leonard de Ruijter, Łukasz Golonka, Cary-rowen,
# Wang Chong
# This file is covered by the GNU General Public License.
# See the file COPYING for more details.

Expand Down Expand Up @@ -261,6 +262,8 @@
reportClickable = boolean(default=true)

[documentNavigation]
initWordSegForUnusedLang = boolean(default=false)
wordSegmentationStandard = featureFlag(optionsEnum="WordNavigationUnitFlag", behaviorOfDefault="Auto")
paragraphStyle = featureFlag(optionsEnum="ParagraphNavigationFlag", behaviorOfDefault="application")

[reviewCursor]
Expand Down
26 changes: 23 additions & 3 deletions source/config/featureFlagEnums.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# A part of NonVisual Desktop Access (NVDA)
# Copyright (C) 2022 NV Access Limited, Bill Dengler, Rob Meredith
# This file is covered by the GNU General Public License.
# See the file COPYING for more details.
# Copyright (C) 2022-2025 NV Access Limited, Bill Dengler, Rob Meredith, Wang Chong
# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt

"""
Feature flag value enumerations.
Expand Down Expand Up @@ -139,6 +139,26 @@ def _displayStringLabels(self) -> dict["FontFormattingBrailleModeFlag", str]:
}


class WordNavigationUnitFlag(DisplayStringEnum):
    """Feature flag values for the word segmentation standard used by word navigation.

    Display labels are provided for ``AUTO``, ``UNISCRIBE`` and ``CHINESE``;
    ``DEFAULT`` has no label of its own in this class.
    """

    @property
    def _displayStringLabels(self) -> dict["WordNavigationUnitFlag", str]:
        """Map each labelled member to its translated display string."""
        return {
            # Translators: Label for a method of word segmentation.
            self.AUTO: _("Auto"),
            # Translators: Label for a method of word segmentation.
            self.UNISCRIBE: _("Standard"),
            # Translators: Label for a method of word segmentation.
            self.CHINESE: _("Chinese"),
        }

    # Member order determines the values assigned by enum.auto(); do not reorder.
    DEFAULT = enum.auto()
    AUTO = enum.auto()
    UNISCRIBE = enum.auto()
    CHINESE = enum.auto()


def getAvailableEnums() -> typing.Generator[typing.Tuple[str, FlagValueEnum], None, None]:
for name, value in globals().items():
if (
Expand Down
12 changes: 11 additions & 1 deletion source/core.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# A part of NonVisual Desktop Access (NVDA)
# Copyright (C) 2006-2025 NV Access Limited, Aleksey Sadovoy, Christopher Toth, Joseph Lee, Peter Vágner,
# Derek Riemer, Babbage B.V., Zahari Yurukov, Łukasz Golonka, Cyrille Bougot, Julien Cochuyt
# Derek Riemer, Babbage B.V., Zahari Yurukov, Łukasz Golonka, Cyrille Bougot, Julien Cochuyt, Wang Chong
# This file is covered by the GNU General Public License.
# See the file COPYING for more details.

Expand Down Expand Up @@ -909,6 +909,16 @@ def main():

_remoteClient.initialize()

from textUtils import wordSeg

log.debug("Initializing word segmentation module")

try:
wordSeg.initialize()
except RuntimeError:
log.warning("Word segmentation module disabled in configuration")
except Exception:
log.error("Error initializing word segmentation module", exc_info=True)
import _localCaptioner

_localCaptioner.initialize()
Expand Down
14 changes: 10 additions & 4 deletions source/displayModel.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# A part of NonVisual Desktop Access (NVDA)
# This file is covered by the GNU General Public License.
# See the file COPYING for more details.
# Copyright (C) 2006-2025 NV Access Limited, Babbage B.V., Joseph Lee, Cyrille Bougot
# Copyright (C) 2006-2025 NV Access Limited, Babbage B.V., Joseph Lee, Cyrille Bougot, Wang Chong
# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt

from ctypes import byref, c_short, c_long
import unicodedata
Expand All @@ -22,6 +22,7 @@
import windowUtils
from locationHelper import RectLTRB, RectLTWH
import textUtils
from textUtils.segFlag import CharSegFlag, WordSegFlag
from typing import (
List,
Tuple,
Expand Down Expand Up @@ -525,7 +526,12 @@ def _getStoryLength(self):
return lineEndOffsets[-1]
return 0

useUniscribe = False
# Override segFlags to always use the older fallback offset detection
charSegFlag = CharSegFlag.NONE

@property
def wordSegFlag(self) -> WordSegFlag:
    """Word segmentation method for this text info class.

    Always returns ``WordSegFlag.NONE``; no configuration is consulted here.
    """
    return WordSegFlag.NONE

def _getTextRange(self, start, end):
return "".join(x for x in self._getFieldsInRange(start, end) if isinstance(x, str))
Expand Down
31 changes: 28 additions & 3 deletions source/gui/settingsDialogs.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,10 @@
# Thomas Stivers, Julien Cochuyt, Peter Vágner, Cyrille Bougot, Mesar Hameed,
# Łukasz Golonka, Aaron Cannon, Adriani90, André-Abush Clause, Dawid Pieper,
# Takuya Nishimoto, jakubl7545, Tony Malykh, Rob Meredith,
# Burman's Computer and Education Ltd, hwf1324, Cary-rowen, Christopher Proß., Tianze
# This file is covered by the GNU General Public License.
# See the file COPYING for more details.
# Burman's Computer and Education Ltd, hwf1324, Cary-rowen, Christopher Proß,
# Wang Chong, Tianze.
# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt

from collections.abc import Container
import logging
Expand Down Expand Up @@ -3090,6 +3091,17 @@ class DocumentNavigationPanel(SettingsPanel):

def makeSettings(self, settingsSizer: wx.BoxSizer) -> None:
sHelper = guiHelper.BoxSizerHelper(self, sizer=settingsSizer)

# Translators: This is a label for the word segmentation standard in the document navigation dialog
WordNavigationUnitLabel = _("&Word Segmentation Standard:")
self.wordSegCombo: nvdaControls.FeatureFlagCombo = sHelper.addLabeledControl(
labelText=WordNavigationUnitLabel,
wxCtrlClass=nvdaControls.FeatureFlagCombo,
keyPath=["documentNavigation", "wordSegmentationStandard"],
conf=config.conf,
)
self.bindHelpEvent("wordSegmentationStandard", self.wordSegCombo)

# Translators: This is a label for the paragraph navigation style in the document navigation dialog
paragraphStyleLabel = _("&Paragraph style:")
self.paragraphStyleCombo: nvdaControls.FeatureFlagCombo = sHelper.addLabeledControl(
Expand All @@ -3101,8 +3113,21 @@ def makeSettings(self, settingsSizer: wx.BoxSizer) -> None:
self.bindHelpEvent("ParagraphStyle", self.paragraphStyleCombo)

def onSave(self) -> None:
    """Ask the word segmentation and paragraph style combo boxes to save their current values to the configuration."""
    self.wordSegCombo.saveCurrentValueToConf()
    self.paragraphStyleCombo.saveCurrentValueToConf()

def postSave(self) -> None:
    """Re-run word segmentation initialization.

    Failures never propagate out of this method: a ``RuntimeError`` is logged as a warning,
    and any other ``Exception`` is logged as an error with its traceback.
    """
    # NOTE(review): imported locally rather than at module level; presumably to avoid
    # an import cycle or import-time cost. Confirm before moving it.
    from textUtils import wordSeg

    log.debug("Reinitializing word segmentation module")

    try:
        wordSeg.initialize()
    except RuntimeError:
        # NOTE(review): the message assumes RuntimeError means "disabled in configuration";
        # verify that wordSeg.initialize() raises RuntimeError only for that reason.
        log.warning("Word segmentation module disabled in configuration")
    except Exception:
        log.error("Error reinitializing word segmentation module", exc_info=True)


def _synthWarningDialog(newSynth: str):
gui.messageBox(
Expand Down
39 changes: 29 additions & 10 deletions source/textInfos/offsets.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,19 @@
# textInfos/offsets.py
# A part of NonVisual Desktop Access (NVDA)
# This file is covered by the GNU General Public License.
# See the file COPYING for more details.
# Copyright (C) 2006-2024 NV Access Limited, Babbage B.V., Leonard de Ruijter
# Copyright (C) 2006-2025 NV Access Limited, Babbage B.V., Leonard de Ruijter, Wang Chong
# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt

from abc import abstractmethod
import re
import ctypes
import unicodedata
import NVDAHelper
import config
import config.featureFlagEnums
import textInfos
import locationHelper
from treeInterceptorHandler import TreeInterceptor
import textUtils
from textUtils.segFlag import CharSegFlag, WordSegFlag
from dataclasses import dataclass
from typing import (
Optional,
Expand Down Expand Up @@ -156,8 +156,21 @@ class OffsetsTextInfo(textInfos.TextInfo):

#: Honours documentFormatting config option if true - set to false if this is not at all slow.
detectFormattingAfterCursorMaybeSlow: bool = True
#: Use uniscribe to calculate word offsets etc.
useUniscribe: bool = True
#: Method to calculate character and word offsets.
charSegFlag: CharSegFlag = CharSegFlag.UNISCRIBE

@property
def wordSegFlag(self) -> WordSegFlag | None:
match self.wordSegConf.calculated():
case config.featureFlagEnums.WordNavigationUnitFlag.UNISCRIBE:
return WordSegFlag.UNISCRIBE
case config.featureFlagEnums.WordNavigationUnitFlag.AUTO:
return WordSegFlag.AUTO
case config.featureFlagEnums.WordNavigationUnitFlag.CHINESE:
return WordSegFlag.CHINESE
case _:
log.error(f"Unknown word segmentation standard, {self.__wordSegConf.calculated()!r}")

#: The encoding internal to the underlying text info implementation.
encoding: Optional[str] = textUtils.WCHAR_ENCODING

Expand Down Expand Up @@ -377,7 +390,7 @@ def _getCharacterOffsets(self, offset):
lineStart, lineEnd = self._getLineOffsets(offset)
lineText = self._getTextRange(lineStart, lineEnd)
relOffset = offset - lineStart
if self.useUniscribe:
if self.charSegFlag == CharSegFlag.UNISCRIBE:
offsets = self._calculateUniscribeOffsets(lineText, textInfos.UNIT_CHARACTER, relOffset)
if offsets is not None:
return (offsets[0] + lineStart, offsets[1] + lineStart)
Expand All @@ -401,8 +414,10 @@ def _getWordOffsets(self, offset):
# Convert NULL and non-breaking space to space to make sure that words will break on them
lineText = lineText.translate({0: " ", 0xA0: " "})
relOffset = offset - lineStart
if self.useUniscribe:
offsets = self._calculateUniscribeOffsets(lineText, textInfos.UNIT_WORD, relOffset)
if self.wordSegFlag:
offsets = textUtils.WordSegmenter(lineText, self.encoding, self.wordSegFlag).getSegmentForOffset(
relOffset,
)
if offsets is not None:
return (offsets[0] + lineStart, offsets[1] + lineStart)
# Fall back to the older word offsets detection that only breaks on non alphanumeric
Expand Down Expand Up @@ -476,6 +491,10 @@ def __init__(self, obj, position):
Subclasses may extend this to perform implementation specific initialisation, calling their superclass method afterwards.
"""
super(OffsetsTextInfo, self).__init__(obj, position)
self.wordSegConf: config.featureFlag.FeatureFlag = config.conf["documentNavigation"][
"wordSegmentationStandard"
]

from NVDAObjects import NVDAObject

if isinstance(position, locationHelper.Point):
Expand Down
69 changes: 66 additions & 3 deletions source/textUtils/__init__.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,27 @@
# A part of NonVisual Desktop Access (NVDA)
# This file is covered by the GNU General Public License.
# See the file COPYING for more details.
# Copyright (C) 2018-2024 NV Access Limited, Babbage B.V., Łukasz Golonka
# Copyright (C) 2018-2025 NV Access Limited, Babbage B.V., Łukasz Golonka, Wang Chong
# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt

"""
Classes and utilities to deal with offsets variable width encodings, particularly utf_16.
"""

import ctypes
import re
import encodings
import locale
import unicodedata

from abc import ABCMeta, abstractmethod, abstractproperty
from functools import cached_property
from typing import Generator, Optional, Tuple, Type

from logHandler import log

from .uniscribe import splitAtCharacterBoundaries
from .wordSeg import wordSegStrategy
from .segFlag import WordSegFlag

WCHAR_ENCODING = "utf_16_le"
UTF8_ENCODING = "utf-8"
Expand Down Expand Up @@ -540,3 +544,62 @@ def getOffsetConverter(encoding: str) -> Type[OffsetConverter]:
return ENCODINGS_TO_CONVERTERS[encoding]
except IndexError as e:
raise LookupError(f"Don't know how to deal with encoding '{encoding}'", e)


class WordSegmenter:
    """Selects appropriate segmentation strategy and segments text.

    The strategy is chosen once, at construction time, from ``wordSegFlag`` and,
    for ``WordSegFlag.AUTO``, from the characters found in ``text``.
    All segmentation work is then delegated to that strategy.
    """

    # Precompiled patterns
    # Chinese characters and Japanese kanji (CJK Unified Ideographs U+4E00 - U+9FFF)
    _CHINESE_CHARACTER_AND_JAPANESE_KANJI: re.Pattern = re.compile(r"[\u4E00-\u9FFF]")
    # Japanese kana (Hiragana U+3040 - U+309F, Katakana U+30A0 - U+30FF)
    _KANA: re.Pattern = re.compile(r"[\u3040-\u309F\u30A0-\u30FF]")

    def __init__(
        self,
        text: str,
        encoding: str | None = "UTF-8",
        wordSegFlag: WordSegFlag = WordSegFlag.AUTO,
    ):
        """Store the text and settings, then pick the segmentation strategy.

        :param text: The text to segment.
        :param encoding: Encoding passed through to the strategy; ``None`` is accepted
            and forwarded unchanged.
        :param wordSegFlag: Which segmentation method to use.
        """
        self.text: str = text
        self.encoding: str | None = encoding
        self.wordSegFlag: WordSegFlag = wordSegFlag
        self.strategy: wordSegStrategy.WordSegmentationStrategy = self._chooseStrategy()

    def _chooseStrategy(self) -> wordSegStrategy.WordSegmentationStrategy:  # TODO: optimize
        """Choose the appropriate segmentation strategy based on the flag and the text content.

        - ``AUTO``: Chinese when the Chinese segmenter library is loaded and the text contains
          CJK ideographs but no kana; otherwise Uniscribe.
        - ``CHINESE``: Chinese when the library is loaded; otherwise Uniscribe, with a debug warning.
        - Anything else (including ``UNISCRIBE``): Uniscribe.
        """
        chineseStrategy = wordSegStrategy.ChineseWordSegmentationStrategy
        useChinese = False
        if self.wordSegFlag == WordSegFlag.AUTO:
            # Kana means the ideographs are most likely Japanese kanji, so the
            # Chinese segmenter is not used for such text.
            useChinese = bool(
                chineseStrategy._lib
                and WordSegmenter._CHINESE_CHARACTER_AND_JAPANESE_KANJI.search(self.text)
                and not WordSegmenter._KANA.search(self.text),
            )
        elif self.wordSegFlag == WordSegFlag.CHINESE:
            useChinese = bool(chineseStrategy._lib)
            if not useChinese:
                log.debugWarning("Chinese word segmenter is loading. Falling back to Uniscribe.")
        strategyClass = chineseStrategy if useChinese else wordSegStrategy.UniscribeWordSegmentationStrategy
        return strategyClass(self.text, self.encoding)

    def getSegmentForOffset(self, offset: int) -> tuple[int, int] | None:
        """Get the segment containing the given offset.

        :param offset: Offset to look up; passed unchanged to the strategy.
        :return: The strategy's result (a pair of offsets), or ``None`` if the strategy
            raised. Failures are logged at debug-warning level rather than propagated,
            so callers can fall back to another method.
        """
        try:
            return self.strategy.getSegmentForOffset(offset)
        except Exception as e:
            log.debugWarning(
                "WordSegmenter.getSegmentForOffset failed: %s text: '%s' offset: %s segmentation strategy: %s",
                e,
                self.text,
                offset,
                self.strategy,
            )
            return None

    def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) -> str:
        """Return the strategy's segmented form of the text.

        Both arguments are forwarded unchanged to the strategy's ``segmentedText``.
        NOTE(review): ``sep`` is presumably the separator inserted between segments and
        ``newSepIndex`` an output list of separator positions; confirm against the strategy.
        """
        return self.strategy.segmentedText(sep, newSepIndex)
Loading