Skip to content

pyyaml does not support literals in unicode over codepoint 0xffff #25

@kitterma

Description

@kitterma

See https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=806826

The YAML spec says that

“The allowed character range explicitly excludes the surrogate
block #xD800-#xDFFF, DEL #x7F, the C0 control block #x0-#x1F
(except for #x9, #xA, and #xD), the C1 control block #x80-#x9F,
#xFFFE, and #xFFFF.”

However, PyYAML has chosen to negate that check and apply it only to
plane 0. This means that any YAML document that contains Unicode
literals in higher planes will fail to parse (and, on output, such
characters are written in the rather unfriendly \Uxxxxxxxx escape format).

The attached patch fixes this in a minimally intrusive way, by
extending the checks to cover the additional codepoints where
appropriate. A better fix would be to use the check as the spec
specifies it, but that would be a bigger change.

Index: pyyaml-3.11/lib/yaml/emitter.py

--- pyyaml-3.11.orig/lib/yaml/emitter.py
+++ pyyaml-3.11/lib/yaml/emitter.py
@@ -8,9 +8,13 @@

__all__ = ['Emitter', 'EmitterError']

+import sys
+
from error import YAMLError
from events import *

+has_ucs4 = sys.maxunicode > 0xffff
+
class EmitterError(YAMLError):
pass

@@ -701,7 +705,8 @@ class Emitter(object):
line_breaks = True
if not (ch == u'\n' or u'\x20' <= ch <= u'\x7E'):
if (ch == u'\x85' or u'\xA0' <= ch <= u'\uD7FF'

-                        or u'\uE000' <= ch <= u'\uFFFD') and ch != u'\uFEFF':
+                        or u'\uE000' <= ch <= u'\uFFFD'
+                        or ((not has_ucs4) or (u'\U00010000' <= ch < u'\U0010ffff'))) and ch != u'\uFEFF':
                 unicode_characters = True
                 if not self.allow_unicode:
                     special_characters = True
    

    Index: pyyaml-3.11/lib/yaml/reader.py

    --- pyyaml-3.11.orig/lib/yaml/reader.py
    +++ pyyaml-3.11/lib/yaml/reader.py
    @@ -19,7 +19,9 @@ __all__ = ['Reader', 'ReaderError']

    from error import YAMLError, Mark

-import codecs, re
+import codecs, re, sys
+
+has_ucs4 = sys.maxunicode > 0xffff

class ReaderError(YAMLError):

@@ -134,7 +136,10 @@ class Reader(object):
self.encoding = 'utf-8'
self.update(1)

-    NON_PRINTABLE = re.compile(u'[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD]')
+    if has_ucs4:
+        NON_PRINTABLE = re.compile(u'[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD\U00010000-\U0010ffff]')
+    else:
+        NON_PRINTABLE = re.compile(u'[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD]')
    

    def check_printable(self, data):
    match = self.NON_PRINTABLE.search(data)
    if match:
    Index: pyyaml-3.11/lib3/yaml/emitter.py

    --- pyyaml-3.11.orig/lib3/yaml/emitter.py
    +++ pyyaml-3.11/lib3/yaml/emitter.py
    @@ -698,7 +698,8 @@ class Emitter:
    line_breaks = True
    if not (ch == '\n' or '\x20' <= ch <= '\x7E'):
    if (ch == '\x85' or '\xA0' <= ch <= '\uD7FF'

-                        or '\uE000' <= ch <= '\uFFFD') and ch != '\uFEFF':
+                        or '\uE000' <= ch <= '\uFFFD'
+                        or '\U00010000' <= ch < '\U0010ffff') and ch != '\uFEFF':
                 unicode_characters = True
                 if not self.allow_unicode:
                     special_characters = True
    

    Index: pyyaml-3.11/lib3/yaml/reader.py

    --- pyyaml-3.11.orig/lib3/yaml/reader.py
    +++ pyyaml-3.11/lib3/yaml/reader.py
    @@ -134,7 +134,7 @@ class Reader(object):
    self.encoding = 'utf-8'
    self.update(1)

-    NON_PRINTABLE = re.compile('[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD]')
+    NON_PRINTABLE = re.compile('[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD\U00010000-\U0010ffff]')
    def check_printable(self, data):
    match = self.NON_PRINTABLE.search(data)
    if match:

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions