From b404143e6a013b4808151ab6db0ee9e2d2f815b9 Mon Sep 17 00:00:00 2001 From: Tim Hsiung Date: Sat, 9 May 2026 18:55:04 +0800 Subject: [PATCH 1/2] fix(config): preserve unicode characters when writing yaml config yaml.dump() defaults to ASCII-only output, which causes `cz bump` (and `cz init`) to rewrite emoji and other non-ASCII characters in `.cz.yaml` as `\Uxxxx` escape sequences. Pass `allow_unicode=True` so the original characters round-trip. Closes #1164 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- commitizen/config/yaml_config.py | 6 ++++-- tests/test_conf.py | 36 ++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 2 deletions(-) diff --git a/commitizen/config/yaml_config.py b/commitizen/config/yaml_config.py index 1e9610e17..018eed390 100644 --- a/commitizen/config/yaml_config.py +++ b/commitizen/config/yaml_config.py @@ -30,7 +30,9 @@ def init_empty_config_content(self) -> None: with smart_open( self.path, "a", encoding=self._settings["encoding"] ) as json_file: - yaml.dump({"commitizen": {}}, json_file, explicit_start=True) + yaml.dump( + {"commitizen": {}}, json_file, explicit_start=True, allow_unicode=True + ) def contains_commitizen_section(self) -> bool: with self.path.open("rb") as yaml_file: @@ -63,6 +65,6 @@ def set_key(self, key: str, value: object) -> Self: with smart_open( self.path, "w", encoding=self._settings["encoding"] ) as yaml_file: - yaml.dump(config_doc, yaml_file, explicit_start=True) + yaml.dump(config_doc, yaml_file, explicit_start=True, allow_unicode=True) return self diff --git a/tests/test_conf.py b/tests/test_conf.py index c004e96e1..23b27ff08 100644 --- a/tests/test_conf.py +++ b/tests/test_conf.py @@ -497,3 +497,39 @@ def test_init_with_invalid_content(self, tmp_path, config_file): with pytest.raises(InvalidConfigurationError) as excinfo: YAMLConfig(data=existing_content, path=path) assert config_file in str(excinfo.value) + + def test_set_key_preserves_unicode(self, tmp_path, config_file): + """Regression test for #1164: emoji and other non-ASCII characters + must be preserved verbatim, not escaped to ``\\Uxxxx`` sequences.""" + path = tmp_path / "commitizen" / config_file + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text( + "commitizen:\n" + ' bump_message: "🚀 chore: bump $current_version to $new_version"\n', + encoding="utf-8", + ) + + yaml_config = YAMLConfig(data=path.read_text(encoding="utf-8"), path=path) + yaml_config.set_key("version", "0.1.1") + + rewritten = path.read_text(encoding="utf-8") + assert "🚀" in rewritten + assert "\\U0001F680" not in rewritten + + def test_init_empty_config_content_passes_allow_unicode( + self, tmp_path, config_file, mocker + ): + """``init_empty_config_content`` must call ``yaml.dump`` with + ``allow_unicode=True`` so that any non-ASCII default content (for + future maintainers) is written verbatim. The current default + (``{"commitizen": {}}``) is ASCII-only, so this asserts the + keyword is passed rather than its observable behaviour.""" + path = tmp_path / "commitizen" / config_file + path.parent.mkdir(parents=True, exist_ok=True) + dump_spy = mocker.spy(yaml, "dump") + + yaml_config = YAMLConfig(data="{}", path=path) + yaml_config.init_empty_config_content() + + dump_spy.assert_called_once() + assert dump_spy.call_args.kwargs.get("allow_unicode") is True From 56bca188eb04d2d6b0cd6dad1e995d299ca256dc Mon Sep 17 00:00:00 2001 From: Tim Hsiung <26526132+bearomorphism@users.noreply.github.com> Date: Sat, 9 May 2026 22:26:12 +0800 Subject: [PATCH 2/2] fix(yaml-config): address PR #1966 reviewer feedback * rename json_file -> yaml_file context-manager var * force UTF-8 for YAML writes (YAML 1.2 spec mandates UTF-8/16/32) to avoid UnicodeEncodeError when self._settings.encoding is non-UTF-8 * tighten regression test to reject any \Uxxxxxxxx escape (case-insensitive) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- commitizen/config/yaml_config.py | 12 +++++------- tests/test_conf.py | 3 ++- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/commitizen/config/yaml_config.py b/commitizen/config/yaml_config.py index 018eed390..9093c919f 100644 --- a/commitizen/config/yaml_config.py +++ b/commitizen/config/yaml_config.py @@ -27,11 +27,10 @@ def __init__(self, *, data: bytes | str, path: Path) -> None: self._parse_setting(data) def init_empty_config_content(self) -> None: - with smart_open( - self.path, "a", encoding=self._settings["encoding"] - ) as json_file: + # Write YAML as UTF-8; YAML 1.2 requires UTF-8/16/32. + with smart_open(self.path, "a", encoding="utf-8") as yaml_file: yaml.dump( - {"commitizen": {}}, json_file, explicit_start=True, allow_unicode=True + {"commitizen": {}}, yaml_file, explicit_start=True, allow_unicode=True ) def contains_commitizen_section(self) -> bool: @@ -62,9 +61,8 @@ def set_key(self, key: str, value: object) -> Self: config_doc = yaml.load(yaml_file, Loader=yaml.FullLoader) config_doc["commitizen"][key] = value - with smart_open( - self.path, "w", encoding=self._settings["encoding"] - ) as yaml_file: + # Write YAML as UTF-8; YAML 1.2 requires UTF-8/16/32. + with smart_open(self.path, "w", encoding="utf-8") as yaml_file: yaml.dump(config_doc, yaml_file, explicit_start=True, allow_unicode=True) return self diff --git a/tests/test_conf.py b/tests/test_conf.py index 23b27ff08..a348e511a 100644 --- a/tests/test_conf.py +++ b/tests/test_conf.py @@ -2,6 +2,7 @@ import json import os +import re from pathlib import Path from typing import Any @@ -514,7 +515,7 @@ def test_set_key_preserves_unicode(self, tmp_path, config_file): rewritten = path.read_text(encoding="utf-8") assert "🚀" in rewritten - assert "\\U0001F680" not in rewritten + assert not re.search(r"\\U[0-9a-fA-F]{8}", rewritten) def test_init_empty_config_content_passes_allow_unicode( self, tmp_path, config_file, mocker