pythonpyqtscintillaqscintilla

QScintilla syntax highlighting with QsciLexerCustom - UTF-8 issue with german characters


Much like this related question, I found myself using QScintilla to create a syntax highlighter that has to deal with non-ASCII characters (é, ä, ß, etc...). I use the trick described in the comments of that question to solve the problem, styling the characters base on the length of the utf-8 bytes rather than the Latin-1 bytes. When styling the entire document, it works fine.

However, my issue arises when using the start/end parameters to only style part of the document as there seems to be a mismatch between the start/end parameters and the actual length of the text being styled. I need to use this as I am dealing with large files that cause a 1-2 second input delay if I continuously style the entire document.

I have the following very simple example:

é

; Comment

When I open the file, which runs the highlighter from start to finish, it looks like that: Right colours

However, if I remove and re-type the comment, the colouring will always be one letter off. Wrong colours

This effect stacks indefinitely, with every non-ASCII character, the colouring goes off by another letter until it is a mess.

I have provided a minimal reproducible example below. All you have to do to notice the problem is start a comment below the last line, you will notice that your styling is always one off.

more wrong colours

import sys
from PyQt6.QtWidgets import *
from PyQt6.QtCore import *
from PyQt6.QtGui import *
from PyQt6.Qsci import *
import re

class MyLexer(QsciLexerCustom):
    def __init__(self, parent):
        super(MyLexer, self).__init__(parent)
        self.setDefaultColor(QColor("#ff000000"))
        self.setDefaultPaper(QColor("#ffffffff"))
        self.setDefaultFont(QFont("Consolas", 14))

        self.setColor(QColor("#ff000000"), 0)   # Style 0: black
        self.setColor(QColor("#ff007f00"), 3)   # Style 3: green

        self.setPaper(QColor("#ffffffff"), 0)   # Style 0: white
        self.setPaper(QColor("#ffffffff"), 3)   # Style 3: white

        self.setFont(QFont("Consolas", 14, weight=QFont.Weight.Bold), 0)   # Style 0: Consolas 14pt
        self.setFont(QFont("Consolas", 14, weight=QFont.Weight.Bold), 3)   # Style 3: Consolas 14pt

    def language(self):
        return "SimpleLanguage"

    def description(self, style):
        return str(style)

    def styleText(self, start, end):
        self.startStyling(start)
        text = self.parent().text()[start:end]
        p = re.compile(r"[*]\/|\/[*]|\s+|\w+|\W")

        token_list = [ (token, len(bytearray(token, "utf-8"))) for token in p.findall(text)]

        editor = self.parent()
        apply_until_linebreak = None
        if start > 0:
            previous_style_nr = editor.SendScintilla(editor.SCI_GETSTYLEAT, start - 1)
            if previous_style_nr in [2, 3]:
                apply_until_linebreak = previous_style_nr

        for i, token in enumerate(token_list):
            if apply_until_linebreak is not None:
                if "\n" in token[0]:
                    apply_until_linebreak = None
                    self.setStyling(token[1], 0)
                else:
                    self.setStyling(token[1], apply_until_linebreak)
            else:
                if token[0] in ["/", ";"]:
                    apply_until_linebreak = 3
                    self.setStyling(token[1], 3)
                else:
                    self.setStyling(token[1], 0)

myCodeSample = r"""
This is white

// This is a green comment

This is white again

// This comment has a (ä) special character 
"""

class CustomMainWindow(QMainWindow):
    def __init__(self):
        super(CustomMainWindow, self).__init__()
        self.setGeometry(300, 300, 800, 400)
        self.setWindowTitle("QScintilla Test")

        self.__frm = QFrame(self)
        self.__frm.setStyleSheet("QWidget { background-color: #ffeaeaea }")
        self.__lyt = QVBoxLayout()
        self.__frm.setLayout(self.__lyt)
        self.setCentralWidget(self.__frm)
        self.__myFont = QFont()
        self.__myFont.setPointSize(14)

        self.__editor = QsciScintilla()
        self.__editor.setText(myCodeSample) 
        self.__editor.setLexer(None)            # We install lexer later
        self.__editor.setUtf8(True)             # Set encoding to UTF-8
        self.__editor.setFont(self.__myFont)    # Gets overridden by lexer later on

        self.__lexer = MyLexer(self.__editor)
        self.__editor.setLexer(self.__lexer)

        self.__lyt.addWidget(self.__editor)
        self.show()

if __name__ == '__main__':
    app = QApplication(sys.argv)
    QApplication.setStyle(QStyleFactory.create('Fusion'))
    myGUI = CustomMainWindow()
    sys.exit(app.exec())

Solution

  • Years later, AI would answer my question.

    Ah — yes, I see the problem.

    The key cause is here:

    token_list = [(token, len(bytearray(token, "utf-8"))) for token in p.findall(text)]

    QsciLexerCustom.setStyling() expects the length in bytes of the text in the editor’s encoding, which for QScintilla is usually UTF-8 if you’ve set it that way.

    When you have a character like ä (U+00E4),

    In UTF-8 it’s 2 bytes long,
    
    But re.findall gives you 1 Python character.
    

    So, if you pass len(bytearray(token, "utf-8")) for all tokens, you’re telling Scintilla "this token is N bytes" — which is fine only if the editor is actually storing the text in UTF-8. But QScintilla’s styleText passes you positions in bytes, not characters, so your regex slice and byte length must align with the underlying editor buffer.

    Right now you’re slicing like this:

    text = self.parent().text()[start:end]

    That’s slicing by characters, not bytes. So if start and end are in bytes (which they are), slicing them as Python characters causes the wrong offset whenever multi-byte characters appear.

    class MyLexer(QsciLexerCustom):
        def __init__(self, parent):
            super(MyLexer, self).__init__(parent)
            self.setDefaultColor(QColor("#ff000000"))
            self.setDefaultPaper(QColor("#ffffffff"))
            self.setDefaultFont(QFont("Consolas", 14))
    
            self.setColor(QColor("#ff000000"), 0)   # Style 0: black
            self.setColor(QColor("#ff007f00"), 3)   # Style 3: green
    
            self.setPaper(QColor("#ffffffff"), 0)   # Style 0: white
            self.setPaper(QColor("#ffffffff"), 3)   # Style 3: white
    
            self.setFont(QFont("Consolas", 14, weight=QFont.Weight.Bold), 0)   # Style 0: Consolas 14pt
            self.setFont(QFont("Consolas", 14, weight=QFont.Weight.Bold), 3)   # Style 3: Consolas 14pt
    
        def language(self):
            return "SimpleLanguage"
    
        def description(self, style):
            return str(style)
    
        def styleText(self, start, end):
            self.startStyling(start)
            editor = self.parent()
            text = editor.text()[start:end]
            p = re.compile(r"[*]\/|\/[*]|\s+|\w+|\W")
    
            token_list = [(token, len(bytearray(token, "utf-8"))) for token in p.findall(text)]
    
            apply_until_linebreak = None
    
            if start > 0:
                # Read the single byte before 'start' directly from the editor
                prev_byte = bytearray(1)
                editor.SendScintilla(editor.SCI_GETTEXTRANGE, start - 1, start, memoryview(prev_byte))
                prev_char = prev_byte.decode("utf-8", errors="ignore") or ""
    
                previous_style_nr = editor.SendScintilla(editor.SCI_GETSTYLEAT, start - 1)
                # Only carry comment style if previous char is NOT a newline
                if previous_style_nr in [2, 3] and prev_char not in ("\n", "\r"):
                    apply_until_linebreak = previous_style_nr
    
            for token, length in token_list:
                if apply_until_linebreak is not None:
                    if "\n" in token:
                        apply_until_linebreak = None
                        self.setStyling(length, 0)
                    else:
                        self.setStyling(length, apply_until_linebreak)
                else:
                    if token in ["/", ";"]:
                        apply_until_linebreak = 3
                        self.setStyling(length, 3)
                    else:
                        self.setStyling(length, 0)