diff --git a/nvdaHelper/cppjieba/sconscript b/nvdaHelper/cppjieba/sconscript index a6d61dd39c5..d59fd0e3431 100644 --- a/nvdaHelper/cppjieba/sconscript +++ b/nvdaHelper/cppjieba/sconscript @@ -35,6 +35,11 @@ sourceFiles = [ "cppjieba.def", ] +env.AppendUnique( + CCFLAGS=['/wd4819'], + CXXFLAGS=['/wd4819'], +) + cppjiebaLib = env.SharedLibrary(target="cppjieba", source=sourceFiles) if not os.path.exists(outDir.Dir("dicts").get_abspath()) or not os.listdir(outDir.Dir("dicts").get_abspath()): # insure dicts installation happens only once and avoid a scons' warning diff --git a/source/NVDAObjects/window/edit.py b/source/NVDAObjects/window/edit.py index a3345971330..1961ccdd258 100644 --- a/source/NVDAObjects/window/edit.py +++ b/source/NVDAObjects/window/edit.py @@ -1,7 +1,7 @@ # A part of NonVisual Desktop Access (NVDA) -# Copyright (C) 2006-2025 NV Access Limited, Babbage B.V., Cyrille Bougot, Leonard de Ruijter -# This file is covered by the GNU General Public License. -# See the file COPYING for more details. +# Copyright (C) 2006-2025 NV Access Limited, Babbage B.V., Cyrille Bougot, Leonard de Ruijter, Wang Chong +# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license. 
+# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt from typing import ( Dict, @@ -32,6 +32,7 @@ import watchdog import locationHelper import textUtils +from textUtils.segFlag import CharSegFlag, WordSegFlag import NVDAHelper.localLib @@ -169,6 +170,13 @@ class getTextLengthExStruct(ctypes.Structure): class EditTextInfo(textInfos.offsets.OffsetsTextInfo): + # Override segFlags to enforce use of Uniscribe + charSegFlag = CharSegFlag.UNISCRIBE + + @property + def wordSegFlag(self): + return WordSegFlag.UNISCRIBE + def _getPointFromOffset(self, offset): if self.obj.editAPIVersion == 1 or self.obj.editAPIVersion >= 3: processHandle = self.obj.processHandle diff --git a/source/config/configSpec.py b/source/config/configSpec.py index 9f83d3a3ea3..e38bd142a1a 100644 --- a/source/config/configSpec.py +++ b/source/config/configSpec.py @@ -1,7 +1,8 @@ # A part of NonVisual Desktop Access (NVDA) # Copyright (C) 2006-2025 NV Access Limited, Babbage B.V., Davy Kager, Bill Dengler, Julien Cochuyt, # Joseph Lee, Dawid Pieper, mltony, Bram Duvigneau, Cyrille Bougot, Rob Meredith, -# Burman's Computer and Education Ltd., Leonard de Ruijter, Łukasz Golonka, Cary-rowen +# Burman's Computer and Education Ltd., Leonard de Ruijter, Łukasz Golonka, Cary-rowen, +# Wang Chong # This file is covered by the GNU General Public License. # See the file COPYING for more details. 
@@ -261,6 +262,8 @@ reportClickable = boolean(default=true) [documentNavigation] + initWordSegForUnusedLang = boolean(default=false) + wordSegmentationStandard = featureFlag(optionsEnum="WordNavigationUnitFlag", behaviorOfDefault="Auto") paragraphStyle = featureFlag(optionsEnum="ParagraphNavigationFlag", behaviorOfDefault="application") [reviewCursor] diff --git a/source/config/featureFlagEnums.py b/source/config/featureFlagEnums.py index 5bcb1db1fdb..59c78bef409 100644 --- a/source/config/featureFlagEnums.py +++ b/source/config/featureFlagEnums.py @@ -1,7 +1,7 @@ # A part of NonVisual Desktop Access (NVDA) -# Copyright (C) 2022 NV Access Limited, Bill Dengler, Rob Meredith -# This file is covered by the GNU General Public License. -# See the file COPYING for more details. +# Copyright (C) 2022-2025 NV Access Limited, Bill Dengler, Rob Meredith, Wang Chong +# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license. +# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt """ Feature flag value enumerations. @@ -139,6 +139,26 @@ def _displayStringLabels(self) -> dict["FontFormattingBrailleModeFlag", str]: } +class WordNavigationUnitFlag(DisplayStringEnum): + """Enumeration for word navigation.""" + + @property + def _displayStringLabels(self): + return { + # Translators: Label for a method of word segmentation. + self.AUTO: _("Auto"), + # Translators: Label for a method of word segmentation. + self.UNISCRIBE: _("Standard"), + # Translators: Label for a method of word segmentation. 
+ self.CHINESE: _("Chinese"), + } + + DEFAULT = enum.auto() + AUTO = enum.auto() + UNISCRIBE = enum.auto() + CHINESE = enum.auto() + + def getAvailableEnums() -> typing.Generator[typing.Tuple[str, FlagValueEnum], None, None]: for name, value in globals().items(): if ( diff --git a/source/core.py b/source/core.py index 40c20289e7b..9308166c6b4 100644 --- a/source/core.py +++ b/source/core.py @@ -1,6 +1,6 @@ # A part of NonVisual Desktop Access (NVDA) # Copyright (C) 2006-2025 NV Access Limited, Aleksey Sadovoy, Christopher Toth, Joseph Lee, Peter Vágner, -# Derek Riemer, Babbage B.V., Zahari Yurukov, Łukasz Golonka, Cyrille Bougot, Julien Cochuyt +# Derek Riemer, Babbage B.V., Zahari Yurukov, Łukasz Golonka, Cyrille Bougot, Julien Cochuyt, Wang Chong # This file is covered by the GNU General Public License. # See the file COPYING for more details. @@ -909,6 +909,16 @@ def main(): _remoteClient.initialize() + from textUtils import wordSeg + + log.debug("Initializing word segmentation module") + + try: + wordSeg.initialize() + except RuntimeError: + log.warning("Word segmentation module disabled in configuration") + except Exception: + log.error("Error initializing word segmentation module", exc_info=True) import _localCaptioner _localCaptioner.initialize() diff --git a/source/displayModel.py b/source/displayModel.py index fde2cb0110e..941f11a6234 100644 --- a/source/displayModel.py +++ b/source/displayModel.py @@ -1,7 +1,7 @@ # A part of NonVisual Desktop Access (NVDA) -# This file is covered by the GNU General Public License. -# See the file COPYING for more details. -# Copyright (C) 2006-2025 NV Access Limited, Babbage B.V., Joseph Lee, Cyrille Bougot +# Copyright (C) 2006-2025 NV Access Limited, Babbage B.V., Joseph Lee, Cyrille Bougot, Wang Chong +# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license. 
+# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt from ctypes import byref, c_short, c_long import unicodedata @@ -22,6 +22,7 @@ import windowUtils from locationHelper import RectLTRB, RectLTWH import textUtils +from textUtils.segFlag import CharSegFlag, WordSegFlag from typing import ( List, Tuple, @@ -525,7 +526,12 @@ def _getStoryLength(self): return lineEndOffsets[-1] return 0 - useUniscribe = False + # Override segFlags to strictly use the old fallen-back method + charSegFlag = CharSegFlag.NONE + + @property + def wordSegFlag(self): + return WordSegFlag.NONE def _getTextRange(self, start, end): return "".join(x for x in self._getFieldsInRange(start, end) if isinstance(x, str)) diff --git a/source/gui/settingsDialogs.py b/source/gui/settingsDialogs.py index b58ddb7da5c..ca549a623a5 100644 --- a/source/gui/settingsDialogs.py +++ b/source/gui/settingsDialogs.py @@ -5,9 +5,10 @@ # Thomas Stivers, Julien Cochuyt, Peter Vágner, Cyrille Bougot, Mesar Hameed, # Łukasz Golonka, Aaron Cannon, Adriani90, André-Abush Clause, Dawid Pieper, # Takuya Nishimoto, jakubl7545, Tony Malykh, Rob Meredith, -# Burman's Computer and Education Ltd, hwf1324, Cary-rowen, Christopher Proß., Tianze -# This file is covered by the GNU General Public License. -# See the file COPYING for more details. +# Burman's Computer and Education Ltd, hwf1324, Cary-rowen, Christopher Proß, +# Wang Chong, Tianze. +# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license. 
+# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt from collections.abc import Container import logging @@ -3090,6 +3091,17 @@ class DocumentNavigationPanel(SettingsPanel): def makeSettings(self, settingsSizer: wx.BoxSizer) -> None: sHelper = guiHelper.BoxSizerHelper(self, sizer=settingsSizer) + + # Translators: This is a label for the word segmentation standard in the document navigation dialog + WordNavigationUnitLabel = _("&Word Segmentation Standard:") + self.wordSegCombo: nvdaControls.FeatureFlagCombo = sHelper.addLabeledControl( + labelText=WordNavigationUnitLabel, + wxCtrlClass=nvdaControls.FeatureFlagCombo, + keyPath=["documentNavigation", "wordSegmentationStandard"], + conf=config.conf, + ) + self.bindHelpEvent("wordSegmentationStandard", self.wordSegCombo) + # Translators: This is a label for the paragraph navigation style in the document navigation dialog paragraphStyleLabel = _("&Paragraph style:") self.paragraphStyleCombo: nvdaControls.FeatureFlagCombo = sHelper.addLabeledControl( @@ -3101,8 +3113,21 @@ def makeSettings(self, settingsSizer: wx.BoxSizer) -> None: self.bindHelpEvent("ParagraphStyle", self.paragraphStyleCombo) def onSave(self): + self.wordSegCombo.saveCurrentValueToConf() self.paragraphStyleCombo.saveCurrentValueToConf() + def postSave(self): + from textUtils import wordSeg + + log.debug("Reinitializing word segmentation module") + + try: + wordSeg.initialize() + except RuntimeError: + log.warning("Word segmentation module disabled in configuration") + except Exception: + log.error("Error reinitializing word segmentation module", exc_info=True) + def _synthWarningDialog(newSynth: str): gui.messageBox( diff --git a/source/textInfos/offsets.py b/source/textInfos/offsets.py index f9a6973bd7e..32e18fc41bf 100755 --- a/source/textInfos/offsets.py +++ b/source/textInfos/offsets.py @@ -1,19 +1,19 @@ -# textInfos/offsets.py # A part of NonVisual Desktop Access 
(NVDA) -# This file is covered by the GNU General Public License. -# See the file COPYING for more details. -# Copyright (C) 2006-2024 NV Access Limited, Babbage B.V., Leonard de Ruijter +# Copyright (C) 2006-2025 NV Access Limited, Babbage B.V., Leonard de Ruijter, Wang Chong +# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license. +# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt from abc import abstractmethod import re import ctypes import unicodedata import NVDAHelper -import config +import config.featureFlagEnums import textInfos import locationHelper from treeInterceptorHandler import TreeInterceptor import textUtils +from textUtils.segFlag import CharSegFlag, WordSegFlag from dataclasses import dataclass from typing import ( Optional, @@ -156,8 +156,21 @@ class OffsetsTextInfo(textInfos.TextInfo): #: Honours documentFormatting config option if true - set to false if this is not at all slow. detectFormattingAfterCursorMaybeSlow: bool = True - #: Use uniscribe to calculate word offsets etc. - useUniscribe: bool = True + #: Method to calculate character and word offsets. + charSegFlag: CharSegFlag = CharSegFlag.UNISCRIBE + + @property + def wordSegFlag(self) -> WordSegFlag | None: + match self.wordSegConf.calculated(): + case config.featureFlagEnums.WordNavigationUnitFlag.UNISCRIBE: + return WordSegFlag.UNISCRIBE + case config.featureFlagEnums.WordNavigationUnitFlag.AUTO: + return WordSegFlag.AUTO + case config.featureFlagEnums.WordNavigationUnitFlag.CHINESE: + return WordSegFlag.CHINESE + case _: + log.error(f"Unknown word segmentation standard, {self.wordSegConf.calculated()!r}") + #: The encoding internal to the underlying text info implementation. 
encoding: Optional[str] = textUtils.WCHAR_ENCODING @@ -377,7 +390,7 @@ def _getCharacterOffsets(self, offset): lineStart, lineEnd = self._getLineOffsets(offset) lineText = self._getTextRange(lineStart, lineEnd) relOffset = offset - lineStart - if self.useUniscribe: + if self.charSegFlag == CharSegFlag.UNISCRIBE: offsets = self._calculateUniscribeOffsets(lineText, textInfos.UNIT_CHARACTER, relOffset) if offsets is not None: return (offsets[0] + lineStart, offsets[1] + lineStart) @@ -401,8 +414,10 @@ def _getWordOffsets(self, offset): # Convert NULL and non-breaking space to space to make sure that words will break on them lineText = lineText.translate({0: " ", 0xA0: " "}) relOffset = offset - lineStart - if self.useUniscribe: - offsets = self._calculateUniscribeOffsets(lineText, textInfos.UNIT_WORD, relOffset) + if self.wordSegFlag: + offsets = textUtils.WordSegmenter(lineText, self.encoding, self.wordSegFlag).getSegmentForOffset( + relOffset, + ) if offsets is not None: return (offsets[0] + lineStart, offsets[1] + lineStart) # Fall back to the older word offsets detection that only breaks on non alphanumeric @@ -476,6 +491,10 @@ def __init__(self, obj, position): Subclasses may extend this to perform implementation specific initialisation, calling their superclass method afterwards. """ super(OffsetsTextInfo, self).__init__(obj, position) + self.wordSegConf: config.featureFlag.FeatureFlag = config.conf["documentNavigation"][ + "wordSegmentationStandard" + ] + from NVDAObjects import NVDAObject if isinstance(position, locationHelper.Point): diff --git a/source/textUtils/__init__.py b/source/textUtils/__init__.py index d88ef055572..83e8f739732 100644 --- a/source/textUtils/__init__.py +++ b/source/textUtils/__init__.py @@ -1,16 +1,18 @@ # A part of NonVisual Desktop Access (NVDA) -# This file is covered by the GNU General Public License. -# See the file COPYING for more details. 
-# Copyright (C) 2018-2024 NV Access Limited, Babbage B.V., Łukasz Golonka +# Copyright (C) 2018-2025 NV Access Limited, Babbage B.V., Łukasz Golonka, Wang Chong +# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license. +# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt """ Classes and utilities to deal with offsets variable width encodings, particularly utf_16. """ import ctypes +import re import encodings import locale import unicodedata + from abc import ABCMeta, abstractmethod, abstractproperty from functools import cached_property from typing import Generator, Optional, Tuple, Type @@ -18,6 +20,8 @@ from logHandler import log from .uniscribe import splitAtCharacterBoundaries +from .wordSeg import wordSegStrategy +from .segFlag import WordSegFlag WCHAR_ENCODING = "utf_16_le" UTF8_ENCODING = "utf-8" @@ -540,3 +544,62 @@ def getOffsetConverter(encoding: str) -> Type[OffsetConverter]: return ENCODINGS_TO_CONVERTERS[encoding] except IndexError as e: raise LookupError(f"Don't know how to deal with encoding '{encoding}'", e) + + +class WordSegmenter: + """Selects appropriate segmentation strategy and segments text.""" + + # Precompiled patterns + # Chinese characters and Japanese kanji (CJK Unified Ideographs U+4E00 - U+9FFF) + _CHINESE_CHARACTER_AND_JAPANESE_KANJI: re.Pattern = re.compile(r"[\u4E00-\u9FFF]") + # Japanese kana (Hiragana U+3040 - U+309F, Katakana U+30A0 - U+30FF) + _KANA: re.Pattern = re.compile(r"[\u3040-\u309F\u30A0-\u30FF]") + + def __init__(self, text: str, encoding: str = "UTF-8", wordSegFlag: WordSegFlag = WordSegFlag.AUTO): + self.text: str = text + self.encoding: str | None = encoding + self.wordSegFlag: WordSegFlag = wordSegFlag + self.strategy: wordSegStrategy.WordSegmentationStrategy = self._chooseStrategy() + + def _chooseStrategy(self) -> wordSegStrategy.WordSegmentationStrategy: # TODO: 
optimize + """Choose the appropriate segmentation strategy based on the text content.""" + if self.wordSegFlag == WordSegFlag.AUTO: + if ( + wordSegStrategy.ChineseWordSegmentationStrategy._lib + and WordSegmenter._CHINESE_CHARACTER_AND_JAPANESE_KANJI.search( + self.text, + ) + and not WordSegmenter._KANA.search(self.text) + ): + return wordSegStrategy.ChineseWordSegmentationStrategy(self.text, self.encoding) + else: + return wordSegStrategy.UniscribeWordSegmentationStrategy(self.text, self.encoding) + else: + match self.wordSegFlag: + case WordSegFlag.UNISCRIBE: + return wordSegStrategy.UniscribeWordSegmentationStrategy(self.text, self.encoding) + case WordSegFlag.CHINESE: + if wordSegStrategy.ChineseWordSegmentationStrategy._lib: + return wordSegStrategy.ChineseWordSegmentationStrategy(self.text, self.encoding) + else: + log.debugWarning("Chinese word segmenter is loading. Falling back to Uniscribe.") + return wordSegStrategy.UniscribeWordSegmentationStrategy(self.text, self.encoding) + case _: + return wordSegStrategy.UniscribeWordSegmentationStrategy(self.text, self.encoding) + + def getSegmentForOffset(self, offset: int) -> tuple[int, int] | None: + """Get the segment containing the given offset.""" + try: + return self.strategy.getSegmentForOffset(offset) + except Exception as e: + log.debugWarning( + "WordSegmenter.getSegmentForOffset failed: %s text: '%s' offset: %s segmentation strategy: %s", + e, + self.text, + offset, + self.strategy, + ) + return None + + def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) -> str: + return self.strategy.segmentedText(sep, newSepIndex) diff --git a/source/textUtils/segFlag.py b/source/textUtils/segFlag.py new file mode 100644 index 00000000000..72153c80e18 --- /dev/null +++ b/source/textUtils/segFlag.py @@ -0,0 +1,28 @@ +# A part of NonVisual Desktop Access (NVDA) +# Copyright (C) 2025 NV Access Limited, Wang Chong +# This file may be used under the terms of the GNU General Public License, 
version 2 or later, as modified by the NVDA license. +# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt + +from enum import IntFlag + +# shared bit masks (explicit powers of two) +_AUTO: int = 1 << 0 +_UNISCRIBE: int = 1 << 1 +_CHINESE: int = 1 << 2 + + +class CharSegFlag(IntFlag): + """Character-level segmentation flags.""" + + NONE: int = 0 + AUTO: int = _AUTO + UNISCRIBE: int = _UNISCRIBE + + +class WordSegFlag(IntFlag): + """Word-level segmentation flags.""" + + NONE: int = 0 + AUTO: int = _AUTO + UNISCRIBE: int = _UNISCRIBE + CHINESE: int = _CHINESE diff --git a/source/textUtils/wordSeg/__init__.py b/source/textUtils/wordSeg/__init__.py new file mode 100644 index 00000000000..77231b58fa3 --- /dev/null +++ b/source/textUtils/wordSeg/__init__.py @@ -0,0 +1,46 @@ +# A part of NonVisual Desktop Access (NVDA) +# Copyright (C) 2025 NV Access Limited, Wang Chong +# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license. +# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt + +import importlib +from logHandler import log + + +def initialize(): + """ + Call all registered initializer functions recorded in wordSegStrategy.initializerList. + + Each entry is a tuple: (module_name, qualname, func_obj, args, kwargs). + We try to resolve the callable from the module and qualname at runtime + (this handles classmethod/staticmethod wrapping order). If resolution fails, + we fall back to the stored func_obj. + + Exceptions from individual initializers are caught and logged so that one + failing initializer doesn't stop the rest. + """ + + from . 
import wordSegStrategy + from threading import Thread + + for module_name, qualname, func_obj, args, kwargs in wordSegStrategy.initializerList: + callable_to_call = None + # try to resolve module + qualname to a current attribute (handles classmethod/staticmethod) + try: + mod = importlib.import_module(module_name) + obj = mod + for part in qualname.split("."): + obj = getattr(obj, part) + callable_to_call = obj + except Exception: + # fallback to original function object captured during decoration + callable_to_call = func_obj + + # Final call with its args/kwargs and exception handling + try: + if not callable(callable_to_call): + raise TypeError(f"Resolved initializer is not callable: {module_name}.{qualname}") + Thread(target=callable_to_call, args=args, kwargs=kwargs, daemon=True).start() + except Exception as e: + log.debug("Initializer %s.%s failed: %s", module_name, qualname, e) + return diff --git a/source/textUtils/wordSeg/wordSegStrategy.py b/source/textUtils/wordSeg/wordSegStrategy.py new file mode 100644 index 00000000000..f59f57aca0c --- /dev/null +++ b/source/textUtils/wordSeg/wordSegStrategy.py @@ -0,0 +1,332 @@ +# A part of NonVisual Desktop Access (NVDA) +# Copyright (C) 2025 NV Access Limited, Wang Chong +# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license. 
+# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt + +import os +import ctypes +from ctypes import ( + c_bool, + c_char_p, + c_int, + create_string_buffer, + POINTER, + byref, +) +from abc import ABC, abstractmethod +from functools import lru_cache +from collections.abc import Callable +from typing import Any +import re + +import textUtils +from logHandler import log + + +# Initializer registry (robust: saves module + qualname + original function + args/kwargs) +# Each entry: (module_name: str, qualname: str, func_obj: Callable, args: tuple, kwargs: dict) +initializerList: list[tuple[str, str, Callable[..., Any], tuple[Any, ...], dict[str, Any]]] = [] + + +def initializerRegistry(*decorator_args, **decorator_kwargs): + """ + A decorator to register an initializer function. + Usage: + @initializerRegistry + def f(): ... + or with arguments: + @initializerRegistry(arg1, arg2, kw=val) + def f(...): ... + We save (func.__module__, func.__qualname__, func, args, kwargs) so that during + package initialize() we can dynamically resolve the callable from the module + (this handles classmethod/staticmethod ordering issues). 
+ """ + if decorator_args and callable(decorator_args[0]) and len(decorator_args) == 1 and not decorator_kwargs: + func = decorator_args[0] + initializerList.append((func.__module__, func.__qualname__, func, (), {})) + return func + + def _decorator(func: Callable[..., Any]): + initializerList.append((func.__module__, func.__qualname__, func, decorator_args, decorator_kwargs)) + return func + + return _decorator + + +class WordSegmentationStrategy(ABC): + """Abstract base class for word segmentation strategies.""" + + def __init__(self, text: str, encoding: str | None = None): + self.text: str = text + self.encoding: str | None = encoding + self.wordEnds: list[int] = [] + + @abstractmethod + def getSegmentForOffset(self, offset: int) -> tuple[int, int] | None: # TODO: optimize + """Return (start inclusive, end exclusive) or None. Offsets are str offsets relative to self.text.""" + pass + + @abstractmethod + def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) -> str: + """Segmented result with separators.""" + pass + + def getWordOffsetRange( + self, + offset: int, + ) -> tuple[int, int] | None: + """Helper to get word offset range from a list of word end offsets.""" + if not self.wordEnds: + return None + index = next((i for i, end in enumerate(self.wordEnds) if end > offset), len(self.wordEnds) - 1) + start = 0 if index == 0 else self.wordEnds[index - 1] + end = self.wordEnds[index] + return (start, end) + + @classmethod + def isUsingRelatedLanguage(cls) -> bool: + """Returns True if this strategy is for the current language.""" + + if not hasattr(cls, "_LANGUAGE_PATTERN"): + return False + + import languageHandler + import braille + + return ( + re.match(cls._LANGUAGE_PATTERN, languageHandler.getWindowsLanguage()) + or re.match(cls._LANGUAGE_PATTERN, languageHandler.getLanguage()) + or re.match(cls._LANGUAGE_PATTERN, braille.handler.table.fileName) + ) + + +class UniscribeWordSegmentationStrategy(WordSegmentationStrategy): + """Windows 
Uniscribe-based segmentation (calls NVDAHelper.localLib.calculateWordOffsets).""" + + # Copied from OffsetTextInfos. TODO: optimize + def _calculateUniscribeOffsets( + self, + lineText: str, + relOffset: int, + ) -> tuple[int, int] | None: + """ + Calculates the bounds of a unit at an offset within a given string of text + using the Windows uniscribe library, also used in Notepad, for example. + Units supported are character and word. + @param lineText: the text string to analyze + @param relOffset: the character offset within the text string at which to calculate the bounds. + """ + + import NVDAHelper + + helperFunc = NVDAHelper.localLib.calculateWordOffsets + + relStart = ctypes.c_int() + relEnd = ctypes.c_int() + # uniscribe does some strange things + # when you give it a string with not more than two alphanumeric chars in a row. + # Inject two alphanumeric characters at the end to fix this + uniscribeLineText = lineText + "xx" + # We can't rely on len(lineText) to calculate the length of the line. + offsetConverter = textUtils.WideStringOffsetConverter(lineText) + lineLength = offsetConverter.encodedStringLength + if self.encoding != textUtils.WCHAR_ENCODING: + # We need to convert the str based line offsets to wide string offsets. + relOffset = offsetConverter.strToEncodedOffsets(relOffset, relOffset)[0] + uniscribeLineLength = lineLength + 2 + if helperFunc( + uniscribeLineText, + uniscribeLineLength, + relOffset, + ctypes.byref(relStart), + ctypes.byref(relEnd), + ): + relStart = relStart.value + relEnd = min(lineLength, relEnd.value) + if self.encoding != textUtils.WCHAR_ENCODING: + # We need to convert the uniscribe based offsets to str offsets. 
+ relStart, relEnd = offsetConverter.encodedToStrOffsets(relStart, relEnd) + return (relStart, relEnd) + return None + + def getSegmentForOffset(self, offset: int) -> tuple[int, int] | None: + return self._calculateUniscribeOffsets(self.text, offset) + + def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) -> str: + return self.text + + +class ChineseWordSegmentationStrategy(WordSegmentationStrategy): + _lib = None + _LANGUAGE_PATTERN = re.compile(r"^zh", re.IGNORECASE) + + @classmethod + @initializerRegistry + def _initCppJieba(cls, forceInit: bool = False): # TODO: make cppjieba alternative + """ + Class-level initializer: attempts to load the versioned cppjieba library and + set up ctypes signatures. + """ + import config + + if not forceInit and ( + cls._lib + or ( + config.conf["documentNavigation"]["wordSegmentationStandard"].calculated() + != config.featureFlagEnums.WordNavigationUnitFlag.CHINESE + and not cls.isUsingRelatedLanguage() + ) + ): + return + try: + from NVDAState import ReadPaths + + lib_path = os.path.join(ReadPaths.coreArchLibPath, "cppjieba.dll") + cls._lib = ctypes.cdll.LoadLibrary(lib_path) + + # Setup function signatures + # bool initJieba(const char* dictDir) + cls._lib.initJieba.restype = c_bool + cls._lib.initJieba.argtypes = [c_char_p] + + # bool calculateWordOffsets(const char* text, int** wordEndOffsets, int* outLen) + cls._lib.calculateWordOffsets.restype = c_bool + cls._lib.calculateWordOffsets.argtypes = [c_char_p, POINTER(POINTER(c_int)), POINTER(c_int)] + + # bool insertUserWord(const char* word, int freq, const char* tag) + cls._lib.insertUserWord.restype = c_bool + cls._lib.insertUserWord.argtypes = [c_char_p, c_int, c_char_p] + + # bool deleteUserWord(const char* word, const char* tag) + cls._lib.deleteUserWord.restype = c_bool + cls._lib.deleteUserWord.argtypes = [c_char_p, c_char_p] + + # bool find(const char* word) + cls._lib.find.restype = c_bool + cls._lib.find.argtypes = [c_char_p] + + # void 
freeOffsets(int* offsets) + cls._lib.freeOffsets.restype = None + cls._lib.freeOffsets.argtypes = [POINTER(c_int)] + + # Initialize with dictionary path + import globalVars + + DICTS_DIR = os.path.join(globalVars.appDir, "cppjieba", "dicts") + DICTS_DIR_BYTES = DICTS_DIR.encode("utf-8") + dictDir = create_string_buffer(DICTS_DIR_BYTES) + cls._lib.initJieba(dictDir) + except Exception as e: + log.debugWarning("Failed to load cppjieba library: %s", e) + cls._lib = None + + @lru_cache(maxsize=256) + def _callCppjiebaCached(self, text_utf8: bytes) -> list[int] | None: + if self._lib is None: + return None + + charPtr = POINTER(c_int)() + outLen = c_int(0) + + try: + success: bool = self._lib.calculateWordOffsets(text_utf8, byref(charPtr), byref(outLen)) + if not success or not bool(charPtr) or outLen.value <= 0: + return None + + try: + n = outLen.value + offsets = [charPtr[i] for i in range(n)] + return offsets + finally: + self._lib.freeOffsets(charPtr) + except Exception as e: + log.debugWarning("Exception calling cppjieba: %s", e) + try: + if bool(charPtr): + self._lib.freeOffsets(charPtr) + except Exception: + pass + return None + + def _callCPPJieba(self) -> list[int] | None: + """ + Instance method: encode self.text and call cppjieba. + Returns list[int] on success, None on failure. + Uses LRU cache keyed by utf-8 bytes. 
+ """ + data = self.text.encode("utf-8") + + if getattr(self, "_lib", None) is ChineseWordSegmentationStrategy._lib: + return self._callCppjiebaCached(data) + else: + if self._lib is None: + return None + + charPtr = POINTER(c_int)() + outLen = c_int(0) + try: + success: bool = self._lib.calculateWordOffsets(data, byref(charPtr), byref(outLen)) + if not success or not bool(charPtr) or outLen.value <= 0: + return None + + try: + n = outLen.value + return [charPtr[i] for i in range(n)] + finally: + self._lib.freeOffsets(charPtr) + except Exception as e: + log.debugWarning("Exception calling cppjieba: %s", e) + try: + if bool(charPtr): + self._lib.freeOffsets(charPtr) + except Exception: + pass + return None + + def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) -> str: + """Segments the text using the word end indices.""" + + if len(self.wordEnds) <= 1: + return self.text + + from .wordSegUtils import NO_SEP_BEFORE, NO_SEP_AFTER + + result = "" + for sepIndex in range(len(self.wordEnds) - 1): + preIndex = 0 if sepIndex == 0 else self.wordEnds[sepIndex - 1] + curIndex = self.wordEnds[sepIndex] + postIndex = self.wordEnds[sepIndex + 1] + + # append the token before the potential separator position + result += self.text[preIndex:curIndex] + + # quick checks: avoid adding duplicate separator if already present + if result.endswith(sep) or self.text[curIndex:postIndex].startswith(sep): + # separator already present at either side -> skip adding + continue + + # slice to check the next token (text between curIndex and postIndex) + nextSlice = self.text[curIndex:postIndex] + + # Determine whether any punctuation forbids a separator BEFORE the next token + noSepBefore = any(nextSlice.startswith(s) for s in NO_SEP_BEFORE) + # Determine whether any punctuation forbids a separator AFTER the current result + noSepAfter = any(result.endswith(s) for s in NO_SEP_AFTER) + + if not (noSepBefore or noSepAfter): + # If neither side forbids the separator, add 
it + result += sep + if newSepIndex is not None: + newSepIndex.append(len(result) - len(sep)) + else: + # append the final trailing token after the loop + result += self.text[curIndex:postIndex] + + return result + + def getSegmentForOffset(self, offset: int) -> tuple[int, int] | None: + return self.getWordOffsetRange(offset) + + def __init__(self, text, encoding=None): + super().__init__(text, encoding) + self.wordEnds = self._callCPPJieba() or [] diff --git a/source/textUtils/wordSeg/wordSegUtils.py b/source/textUtils/wordSeg/wordSegUtils.py new file mode 100644 index 00000000000..8298bda1359 --- /dev/null +++ b/source/textUtils/wordSeg/wordSegUtils.py @@ -0,0 +1,77 @@ +# A part of NonVisual Desktop Access (NVDA) +# Copyright (C) 2025 NV Access Limited, Wang Chong +# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license. +# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt + +# Punctuation that should NOT have a separator BEFORE it (no space before these marks) +NO_SEP_BEFORE = { + # Common Chinese fullwidth punctuation + "。", + ",", + "、", + ";", + ":", + "?", + "!", + "…", + "...", + "—", + "–", + "——", + ")", + "】", + "》", + "〉", + "」", + "』", + "”", + "’", + "%", + "‰", + "¥", + # Common ASCII / halfwidth punctuation + ".", + ",", + ";", + ":", + "?", + "!", + "%", + ".", + ")", + "]", + "}", + ">", + '"', + "'", +} + +# Punctuation that should NOT have a separator AFTER it (no space after these marks) +NO_SEP_AFTER = { + # Common Chinese fullwidth opening/leading punctuation + "(", + "【", + "《", + "〈", + "「", + "『", + "“", + "‘", + # Common ASCII / halfwidth opening/leading punctuation + "(", + "[", + "{", + "<", + '"', + "'", + # Currency and prefix-like symbols that typically bind to the following token + "$", + "€", + "£", + "¥", + "₹", + # Social/identifier prefixes + "@", + "#", + "&", +} diff --git 
a/tests/unit/test_textUtils.py b/tests/unit/test_textUtils.py index 6993ac7d962..048c8580e78 100644 --- a/tests/unit/test_textUtils.py +++ b/tests/unit/test_textUtils.py @@ -1,14 +1,15 @@ # A part of NonVisual Desktop Access (NVDA) # This file is covered by the GNU General Public License. # See the file COPYING for more details. -# Copyright (C) 2019-2025 NV Access Limited, Babbage B.V., Leonard de Ruijter +# Copyright (C) 2019-2025 NV Access Limited, Babbage B.V., Leonard de Ruijter, Wang Chong """Unit tests for the textUtils module.""" import unittest -from textUtils import UnicodeNormalizationOffsetConverter, WideStringOffsetConverter +from textUtils import UnicodeNormalizationOffsetConverter, WideStringOffsetConverter, WordSegmenter from textUtils.uniscribe import splitAtCharacterBoundaries +from textUtils.segFlag import WordSegFlag FACE_PALM = "\U0001f926" # 🤦 SMILE = "\U0001f60a" # 😊 @@ -442,3 +443,28 @@ def test_sentenceWithComposites(self): def test_hebrew(self): self._testHelper("בְּרֵאשִׁית", ["בְּ", "רֵ", "א", "שִׁ", "י", "ת"]) + + +class TestWordSegmenter(unittest.TestCase): + """Tests for the WordSegmenter class.""" + + def test_basicLatin(self): + text = "hello world" + segmenter = WordSegmenter(text, wordSegFlag=WordSegFlag.UNISCRIBE) + self.assertEqual(segmenter.getSegmentForOffset(0), (0, 6)) + self.assertEqual(segmenter.getSegmentForOffset(5), (0, 6)) + self.assertEqual(segmenter.getSegmentForOffset(6), (6, 11)) + self.assertEqual(segmenter.getSegmentForOffset(11), (6, 11)) + + def test_chinese(self): + text = "你好世界" + + from textUtils.wordSeg.wordSegStrategy import ChineseWordSegmentationStrategy + + ChineseWordSegmentationStrategy._initCppJieba(forceInit=True) + segmenter = WordSegmenter(text, wordSegFlag=WordSegFlag.CHINESE) + self.assertEqual(segmenter.getSegmentForOffset(0), (0, 2)) + self.assertEqual(segmenter.getSegmentForOffset(1), (0, 2)) + self.assertEqual(segmenter.getSegmentForOffset(2), (2, 4)) + 
self.assertEqual(segmenter.getSegmentForOffset(3), (2, 4)) + self.assertEqual(segmenter.getSegmentForOffset(4), (2, 4)) diff --git a/user_docs/en/changes.md b/user_docs/en/changes.md index 308f3ee84a0..2808e3dbbcf 100644 --- a/user_docs/en/changes.md +++ b/user_docs/en/changes.md @@ -31,6 +31,8 @@ This can be enabled using the "Report when lists support multiple selection" set * VirusTotal scan results are now available in the details for an add-on in the Add-on Store. An action has been added to view the full scan results on the VirusTotal website. (#18974) * In the Add-on Store, a new action has been added to see the latest changes for the current version of add-ons. (#14041, @josephsl, @nvdaes) +* Chinese text can be navigated by word via build-in input gestures. + Several GUI elements are added for its configuration in `Document Navigation` panel. (#18735, @CrazySteve0605) * In browse mode, the number of items in a list is now reported in braille. (#7455, @nvdaes) ### Changes @@ -82,10 +84,6 @@ On ARM64 machines with Windows 11, these ARM64EC libraries are loaded instead of * NVDA is now licensed under "GPL-2 or later". * In `braille.py`, the `FormattingMarker` class has a new `shouldBeUsed` method, to determine if the formatting marker key should be reported (#7608, @nvdaes) - -#### API Breaking Changes - -These are breaking API changes. Please open a GitHub issue if your add-on has an issue with updating to the new API. * NVDA is now built with Python 3.13. (#18591) @@ -192,6 +190,8 @@ Use `INPUT_TYPE.MOUSE`, `INPUT_TYPE.KEYBOARD`, `KEYEVENTF.KEYUP` and `KEYEVENTF. Use `winBindings.magnification.MAGCOLOREFFECT` instead. (#18958) * `visionEnhancementProviders.screenCurtain.isScreenFullyBlack` is deprecated. Use `NVDAHelper.localLib.isScreenFullyBlack` instead. (#18958) +* `useUniscribe` from `textUtils.offset.OffsetsTextInfo` and its subclasses is deprecated. + Use `charSegFlag` and `wordSegFlag` instead. (#18735)