nvaccess · michaelDCurran · Oct 28, 2025 · Aug 9, 2025 · Aug 15, 2025 · Aug 18, 2025
@@ -35,6 +35,11 @@ sourceFiles = [
 	"cppjieba.def",
 ]
 
+# Suppress MSVC warning C4819 (a source file contains a character that cannot be
+# represented in the current code page) for targets built from this environment.
+# CCFLAGS is passed to both the C and the C++ compiler, so the option is not
+# repeated in CXXFLAGS.
+env.AppendUnique(
+	CCFLAGS=["/wd4819"],
+)
+
 cppjiebaLib = env.SharedLibrary(target="cppjieba", source=sourceFiles)
 
 if not os.path.exists(outDir.Dir("dicts").get_abspath()) or not os.listdir(outDir.Dir("dicts").get_abspath()): # insure dicts installation happens only once and avoid a scons' warning

@@ -1,7 +1,7 @@
 # A part of NonVisual Desktop Access (NVDA)
-# Copyright (C) 2006-2025 NV Access Limited, Babbage B.V., Cyrille Bougot, Leonard de Ruijter
-# This file is covered by the GNU General Public License.
-# See the file COPYING for more details.
+# Copyright (C) 2006-2025 NV Access Limited, Babbage B.V., Cyrille Bougot, Leonard de Ruijter, Wang Chong
+# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
+# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt
 
 from typing import (
 	Dict,
@@ -32,6 +32,7 @@
 import watchdog
 import locationHelper
 import textUtils
+from textUtils.segFlag import CharSegFlag, WordSegFlag
 import NVDAHelper.localLib
 
 
@@ -169,6 +170,13 @@ class getTextLengthExStruct(ctypes.Structure):
 
 
 class EditTextInfo(textInfos.offsets.OffsetsTextInfo):
+	# Override both segmentation flags so that character and word offsets
+	# for this text info class are always calculated with Uniscribe.
+	charSegFlag = CharSegFlag.UNISCRIBE
+
+	@property
+	def wordSegFlag(self):
+		"""Always select Uniscribe word segmentation.
+
+		Overrides OffsetsTextInfo.wordSegFlag, which otherwise derives the flag
+		from the wordSegmentationStandard configuration value.
+		"""
+		return WordSegFlag.UNISCRIBE
+
 	def _getPointFromOffset(self, offset):
 		if self.obj.editAPIVersion == 1 or self.obj.editAPIVersion >= 3:
 			processHandle = self.obj.processHandle

@@ -1,7 +1,8 @@
 # A part of NonVisual Desktop Access (NVDA)
 # Copyright (C) 2006-2025 NV Access Limited, Babbage B.V., Davy Kager, Bill Dengler, Julien Cochuyt,
 # Joseph Lee, Dawid Pieper, mltony, Bram Duvigneau, Cyrille Bougot, Rob Meredith,
-# Burman's Computer and Education Ltd., Leonard de Ruijter, Łukasz Golonka, Cary-rowen
+# Burman's Computer and Education Ltd., Leonard de Ruijter, Łukasz Golonka, Cary-rowen,
+# Wang Chong
 # This file is covered by the GNU General Public License.
 # See the file COPYING for more details.
 
@@ -261,6 +262,8 @@
 	reportClickable = boolean(default=true)
 
 [documentNavigation]
+	initWordSegForUnusedLang = boolean(default=false)
+	wordSegmentationStandard = featureFlag(optionsEnum="WordNavigationUnitFlag", behaviorOfDefault="Auto")
 	paragraphStyle = featureFlag(optionsEnum="ParagraphNavigationFlag", behaviorOfDefault="application")
 
 [reviewCursor]

@@ -1,7 +1,7 @@
 # A part of NonVisual Desktop Access (NVDA)
-# Copyright (C) 2022 NV Access Limited, Bill Dengler, Rob Meredith
-# This file is covered by the GNU General Public License.
-# See the file COPYING for more details.
+# Copyright (C) 2022-2025 NV Access Limited, Bill Dengler, Rob Meredith, Wang Chong
+# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
+# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt
 
 """
 Feature flag value enumerations.
@@ -139,6 +139,26 @@ def _displayStringLabels(self) -> dict["FontFormattingBrailleModeFlag", str]:
 		}
 
 
+class WordNavigationUnitFlag(DisplayStringEnum):
+	"""Feature flag values selecting how text is split into words for word navigation.
+
+	Referenced by name as the ``optionsEnum`` of the ``wordSegmentationStandard``
+	setting in the ``documentNavigation`` section of the configuration spec.
+	"""
+
+	@property
+	def _displayStringLabels(self) -> dict["WordNavigationUnitFlag", str]:
+		"""Map each explicitly selectable member to its translated label.
+
+		``DEFAULT`` has no entry here.
+		NOTE(review): presumably the label for the default choice is generated elsewhere
+		from the flag's behaviorOfDefault - confirm against DisplayStringEnum / FeatureFlagCombo.
+		"""
+		return {
+			# Translators: Label for a method of word segmentation.
+			self.AUTO: _("Auto"),
+			# Translators: Label for a method of word segmentation.
+			self.UNISCRIBE: _("Standard"),
+			# Translators: Label for a method of word segmentation.
+			self.CHINESE: _("Chinese"),
+		}
+
+	# Member order determines the values assigned by enum.auto(); do not reorder.
+	DEFAULT = enum.auto()
+	AUTO = enum.auto()
+	UNISCRIBE = enum.auto()
+	CHINESE = enum.auto()
+
+
 def getAvailableEnums() -> typing.Generator[typing.Tuple[str, FlagValueEnum], None, None]:
 	for name, value in globals().items():
 		if (

@@ -1,6 +1,6 @@
 # A part of NonVisual Desktop Access (NVDA)
 # Copyright (C) 2006-2025 NV Access Limited, Aleksey Sadovoy, Christopher Toth, Joseph Lee, Peter Vágner,
-# Derek Riemer, Babbage B.V., Zahari Yurukov, Łukasz Golonka, Cyrille Bougot, Julien Cochuyt
+# Derek Riemer, Babbage B.V., Zahari Yurukov, Łukasz Golonka, Cyrille Bougot, Julien Cochuyt, Wang Chong
 # This file is covered by the GNU General Public License.
 # See the file COPYING for more details.
 
@@ -909,6 +909,16 @@ def main():
 
 	_remoteClient.initialize()
 
+	from textUtils import wordSeg
+
+	log.debug("Initializing word segmentation module")
+
+	try:
+		wordSeg.initialize()
+	except RuntimeError:
+		log.warning("Word segmentation module disabled in configuration")
+	except Exception:
+		log.error("Error initializing word segmentation module", exc_info=True)
 	import _localCaptioner
 
 	_localCaptioner.initialize()

@@ -1,7 +1,7 @@
 # A part of NonVisual Desktop Access (NVDA)
-# This file is covered by the GNU General Public License.
-# See the file COPYING for more details.
-# Copyright (C) 2006-2025 NV Access Limited, Babbage B.V., Joseph Lee, Cyrille Bougot
+# Copyright (C) 2006-2025 NV Access Limited, Babbage B.V., Joseph Lee, Cyrille Bougot, Wang Chong
+# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
+# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt
 
 from ctypes import byref, c_short, c_long
 import unicodedata
@@ -22,6 +22,7 @@
 import windowUtils
 from locationHelper import RectLTRB, RectLTWH
 import textUtils
+from textUtils.segFlag import CharSegFlag, WordSegFlag
 from typing import (
 	List,
 	Tuple,
@@ -525,7 +526,12 @@ def _getStoryLength(self):
 			return lineEndOffsets[-1]
 		return 0
 
-	useUniscribe = False
+	# Override both segmentation flags so that neither Uniscribe nor the word segmenter is used,
+	# leaving only the older fallback offset calculation (this replaces useUniscribe = False).
+	charSegFlag = CharSegFlag.NONE
+
+	@property
+	def wordSegFlag(self):
+		"""Always disable segmenter-based word offsets for this text info class.
+
+		NOTE(review): OffsetsTextInfo._getWordOffsets tests this value for truthiness, so this
+		assumes WordSegFlag.NONE is falsy - confirm against textUtils.segFlag.
+		"""
+		return WordSegFlag.NONE
 
 	def _getTextRange(self, start, end):
 		return "".join(x for x in self._getFieldsInRange(start, end) if isinstance(x, str))

@@ -5,9 +5,10 @@
 # Thomas Stivers, Julien Cochuyt, Peter Vágner, Cyrille Bougot, Mesar Hameed,
 # Łukasz Golonka, Aaron Cannon, Adriani90, André-Abush Clause, Dawid Pieper,
 # Takuya Nishimoto, jakubl7545, Tony Malykh, Rob Meredith,
-# Burman's Computer and Education Ltd, hwf1324, Cary-rowen, Christopher Proß., Tianze
-# This file is covered by the GNU General Public License.
-# See the file COPYING for more details.
+# Burman's Computer and Education Ltd, hwf1324, Cary-rowen, Christopher Proß,
+# Wang Chong, Tianze.
+# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
+# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt
 
 from collections.abc import Container
 import logging
@@ -3090,6 +3091,17 @@ class DocumentNavigationPanel(SettingsPanel):
 
 	def makeSettings(self, settingsSizer: wx.BoxSizer) -> None:
 		sHelper = guiHelper.BoxSizerHelper(self, sizer=settingsSizer)
+
+		# Translators: This is a label for the word segmentation standard in the document navigation dialog
+		WordNavigationUnitLabel = _("&Word Segmentation Standard:")
+		self.wordSegCombo: nvdaControls.FeatureFlagCombo = sHelper.addLabeledControl(
+			labelText=WordNavigationUnitLabel,
+			wxCtrlClass=nvdaControls.FeatureFlagCombo,
+			keyPath=["documentNavigation", "wordSegmentationStandard"],
+			conf=config.conf,
+		)
+		self.bindHelpEvent("wordSegmentationStandard", self.wordSegCombo)
+
 		# Translators: This is a label for the paragraph navigation style in the document navigation dialog
 		paragraphStyleLabel = _("&Paragraph style:")
 		self.paragraphStyleCombo: nvdaControls.FeatureFlagCombo = sHelper.addLabeledControl(
@@ -3101,8 +3113,21 @@ def makeSettings(self, settingsSizer: wx.BoxSizer) -> None:
 		self.bindHelpEvent("ParagraphStyle", self.paragraphStyleCombo)
 
 	def onSave(self):
+		# Persist the word segmentation and paragraph style selections by calling
+		# saveCurrentValueToConf() on each feature flag combo box.
+		self.wordSegCombo.saveCurrentValueToConf()
 		self.paragraphStyleCombo.saveCurrentValueToConf()
 
+	def postSave(self):
+		"""Reinitialize the word segmentation module.
+
+		Failures never propagate to the caller: a RuntimeError raised by
+		wordSeg.initialize() is logged as a warning, and any other exception is
+		logged as an error together with its exception information.
+		NOTE(review): presumably invoked by the settings dialog once saving has
+		succeeded - confirm against SettingsPanel.
+		"""
+		# Imported inside the method rather than at module level.
+		from textUtils import wordSeg
+
+		log.debug("Reinitializing word segmentation module")
+
+		try:
+			wordSeg.initialize()
+		except RuntimeError:
+			# NOTE(review): every RuntimeError is reported as "disabled in configuration";
+			# confirm that wordSeg.initialize() raises RuntimeError only for that reason.
+			log.warning("Word segmentation module disabled in configuration")
+		except Exception:
+			log.error("Error reinitializing word segmentation module", exc_info=True)
+
 
 def _synthWarningDialog(newSynth: str):
 	gui.messageBox(

@@ -1,19 +1,19 @@
-# textInfos/offsets.py
 # A part of NonVisual Desktop Access (NVDA)
-# This file is covered by the GNU General Public License.
-# See the file COPYING for more details.
-# Copyright (C) 2006-2024 NV Access Limited, Babbage B.V., Leonard de Ruijter
+# Copyright (C) 2006-2025 NV Access Limited, Babbage B.V., Leonard de Ruijter, Wang Chong
+# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
+# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt
 
 from abc import abstractmethod
 import re
 import ctypes
 import unicodedata
 import NVDAHelper
-import config
+import config.featureFlagEnums
 import textInfos
 import locationHelper
 from treeInterceptorHandler import TreeInterceptor
 import textUtils
+from textUtils.segFlag import CharSegFlag, WordSegFlag
 from dataclasses import dataclass
 from typing import (
 	Optional,
@@ -156,8 +156,21 @@ class OffsetsTextInfo(textInfos.TextInfo):
 
 	#: Honours documentFormatting config option if true - set to false if this is not at all slow.
 	detectFormattingAfterCursorMaybeSlow: bool = True
-	#: Use uniscribe to calculate word offsets etc.
-	useUniscribe: bool = True
+	#: Method to calculate character and word offsets.
+	charSegFlag: CharSegFlag = CharSegFlag.UNISCRIBE
+
+	@property
+	def wordSegFlag(self) -> WordSegFlag | None:
+		"""The word segmentation method selected by the configuration.
+
+		Translates the calculated value of the ``wordSegmentationStandard`` feature flag
+		(held in ``self.wordSegConf``, assigned in ``__init__``) into the matching WordSegFlag.
+
+		:return: The matching flag, or None after logging an error when the calculated
+			value is not one of the known WordNavigationUnitFlag members.
+		"""
+		standard = self.wordSegConf.calculated()
+		match standard:
+			case config.featureFlagEnums.WordNavigationUnitFlag.UNISCRIBE:
+				return WordSegFlag.UNISCRIBE
+			case config.featureFlagEnums.WordNavigationUnitFlag.AUTO:
+				return WordSegFlag.AUTO
+			case config.featureFlagEnums.WordNavigationUnitFlag.CHINESE:
+				return WordSegFlag.CHINESE
+			case _:
+				# Report the value through the local, not through a double-underscore attribute:
+				# such a name would be mangled to _OffsetsTextInfo__wordSegConf, which is never
+				# assigned, so the lookup itself would raise AttributeError.
+				log.error(f"Unknown word segmentation standard, {standard!r}")
+				return None
+
 	#: The encoding internal to the underlying text info implementation.
 	encoding: Optional[str] = textUtils.WCHAR_ENCODING
 
@@ -377,7 +390,7 @@ def _getCharacterOffsets(self, offset):
 		lineStart, lineEnd = self._getLineOffsets(offset)
 		lineText = self._getTextRange(lineStart, lineEnd)
 		relOffset = offset - lineStart
-		if self.useUniscribe:
+		if self.charSegFlag == CharSegFlag.UNISCRIBE:
 			offsets = self._calculateUniscribeOffsets(lineText, textInfos.UNIT_CHARACTER, relOffset)
 			if offsets is not None:
 				return (offsets[0] + lineStart, offsets[1] + lineStart)
@@ -401,8 +414,10 @@ def _getWordOffsets(self, offset):
 		# Convert NULL and non-breaking space to space to make sure that words will break on them
 		lineText = lineText.translate({0: " ", 0xA0: " "})
 		relOffset = offset - lineStart
-		if self.useUniscribe:
-			offsets = self._calculateUniscribeOffsets(lineText, textInfos.UNIT_WORD, relOffset)
+		if self.wordSegFlag:
+			offsets = textUtils.WordSegmenter(lineText, self.encoding, self.wordSegFlag).getSegmentForOffset(
+				relOffset,
+			)
 			if offsets is not None:
 				return (offsets[0] + lineStart, offsets[1] + lineStart)
 		# Fall back to the older word offsets detection that only breaks on non alphanumeric
@@ -476,6 +491,10 @@ def __init__(self, obj, position):
 		Subclasses may extend this to perform implementation specific initialisation, calling their superclass method afterwards.
 		"""
 		super(OffsetsTextInfo, self).__init__(obj, position)
+		self.wordSegConf: config.featureFlag.FeatureFlag = config.conf["documentNavigation"][
+			"wordSegmentationStandard"
+		]
+
 		from NVDAObjects import NVDAObject
 
 		if isinstance(position, locationHelper.Point):

@@ -1,23 +1,27 @@
 # A part of NonVisual Desktop Access (NVDA)
-# This file is covered by the GNU General Public License.
-# See the file COPYING for more details.
-# Copyright (C) 2018-2024 NV Access Limited, Babbage B.V., Łukasz Golonka
+# Copyright (C) 2018-2025 NV Access Limited, Babbage B.V., Łukasz Golonka, Wang Chong
+# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
+# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt
 
 """
 Classes and utilities to deal with offsets variable width encodings, particularly utf_16.
 """
 
 import ctypes
+import re
 import encodings
 import locale
 import unicodedata
+
 from abc import ABCMeta, abstractmethod, abstractproperty
 from functools import cached_property
 from typing import Generator, Optional, Tuple, Type
 
 from logHandler import log
 
 from .uniscribe import splitAtCharacterBoundaries
+from .wordSeg import wordSegStrategy
+from .segFlag import WordSegFlag
 
 WCHAR_ENCODING = "utf_16_le"
 UTF8_ENCODING = "utf-8"
@@ -540,3 +544,62 @@ def getOffsetConverter(encoding: str) -> Type[OffsetConverter]:
 		return ENCODINGS_TO_CONVERTERS[encoding]
 	except IndexError as e:
 		raise LookupError(f"Don't know how to deal with encoding '{encoding}'", e)
+
+
+class WordSegmenter:
+	"""Pick a word segmentation strategy for a piece of text and delegate to it.
+
+	The strategy is chosen once, at construction time, from the requested WordSegFlag
+	and, for WordSegFlag.AUTO, from the characters found in the text.
+	"""
+
+	# Precompiled patterns
+	# Chinese characters and Japanese kanji (CJK Unified Ideographs U+4E00 - U+9FFF)
+	_CHINESE_CHARACTER_AND_JAPANESE_KANJI: re.Pattern = re.compile(r"[\u4E00-\u9FFF]")
+	# Japanese kana (Hiragana U+3040 - U+309F, Katakana U+30A0 - U+30FF)
+	_KANA: re.Pattern = re.compile(r"[\u3040-\u309F\u30A0-\u30FF]")
+
+	def __init__(self, text: str, encoding: str = "UTF-8", wordSegFlag: WordSegFlag = WordSegFlag.AUTO):
+		"""
+		:param text: The text to segment.
+		:param encoding: Handed unchanged to the selected strategy.
+		:param wordSegFlag: The requested segmentation method.
+		"""
+		self.text: str = text
+		self.encoding: str | None = encoding
+		self.wordSegFlag: WordSegFlag = wordSegFlag
+		self.strategy: wordSegStrategy.WordSegmentationStrategy = self._chooseStrategy()
+
+	def _chooseStrategy(self) -> wordSegStrategy.WordSegmentationStrategy:  # TODO: optimize
+		"""Build the strategy object matching the requested flag and the text content.
+
+		Uniscribe is used unless the Chinese strategy is both wanted and has its library available.
+		"""
+		chineseStrategy = wordSegStrategy.ChineseWordSegmentationStrategy
+		# Uniscribe is the answer for UNISCRIBE, for unknown flags and for every fallback below.
+		strategyClass = wordSegStrategy.UniscribeWordSegmentationStrategy
+		requested = self.wordSegFlag
+		if requested == WordSegFlag.AUTO:
+			# Text with ideographs but no kana is treated as Chinese; any kana keeps Uniscribe.
+			if (
+				chineseStrategy._lib
+				and WordSegmenter._CHINESE_CHARACTER_AND_JAPANESE_KANJI.search(self.text)
+				and not WordSegmenter._KANA.search(self.text)
+			):
+				strategyClass = chineseStrategy
+		elif requested == WordSegFlag.CHINESE:
+			if chineseStrategy._lib:
+				strategyClass = chineseStrategy
+			else:
+				log.debugWarning("Chinese word segmenter is loading. Falling back to Uniscribe.")
+		return strategyClass(self.text, self.encoding)
+
+	def getSegmentForOffset(self, offset: int) -> tuple[int, int] | None:
+		"""Return the segment containing ``offset`` as reported by the chosen strategy.
+
+		Any exception raised by the strategy is logged and converted into a None result.
+		"""
+		try:
+			segment = self.strategy.getSegmentForOffset(offset)
+		except Exception as error:
+			log.debugWarning(
+				"WordSegmenter.getSegmentForOffset failed: %s  text: '%s' offset: %s  segmentation strategy: %s",
+				error,
+				self.text,
+				offset,
+				self.strategy,
+			)
+			return None
+		return segment
+
+	def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) -> str:
+		"""Delegate to the chosen strategy's segmentedText, forwarding both arguments unchanged."""
+		strategy = self.strategy
+		return strategy.segmentedText(sep, newSepIndex)