Skip to content

Commit e8d7113

Browse files
committed
Support UTF-8 surrogates for UTF-16 and 32.
1 parent a6845eb commit e8d7113

File tree

3 files changed

+73
-17
lines changed

3 files changed

+73
-17
lines changed

YamlDotNet.Benchmark/Program.cs

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,4 @@
1-
using System.Globalization;
2-
using BenchmarkDotNet.Running;
1+
using BenchmarkDotNet.Running;
32
using YamlDotNet.Benchmark;
4-
using YamlDotNet.Core;
5-
using YamlDotNet.Core.Events;
6-
using YamlDotNet.Serialization;
7-
using YamlDotNet.Serialization.NamingConventions;
83

9-
var dateTimeOffset = new DateTimeOffset(new DateTime(2017, 1, 2, 3, 4, 5), new TimeSpan(-6, 0, 0));
10-
Console.WriteLine(dateTimeOffset.ToString("MM/dd/yyyy HH:mm:ss zzz", CultureInfo.InvariantCulture));
11-
Console.WriteLine(dateTimeOffset.ToString("O", CultureInfo.InvariantCulture));
4+
BenchmarkSwitcher.FromAssembly(typeof(YamlStreamBenchmark).Assembly).Run(args);

YamlDotNet.Test/Core/ScannerTests.cs

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// This file is part of YamlDotNet - A .NET library for YAML.
1+
// This file is part of YamlDotNet - A .NET library for YAML.
22
// Copyright (c) Antoine Aubry and contributors
33
//
44
// Permission is hereby granted, free of charge, to any person obtaining a copy of
@@ -530,6 +530,20 @@ public void Keys_can_start_with_colons_after_double_quoted_values_in_nested_bloc
530530
StreamEnd);
531531
}
532532

533+
// Verifies that a surrogate pair written as two consecutive \u escapes inside a
// double-quoted scalar is scanned into a single DoubleQuotedScalar token.
[Fact]
534+
public void Utf16StringsAsUtf8SurrogatesWorkCorrectly()
535+
{
536+
AssertSequenceOfTokensFrom(Yaml.ScannerForText("Test: \"\\uD83D\\uDC4D\""),
537+
StreamStart,
538+
BlockMappingStart,
539+
Key,
540+
PlainScalar("Test"),
541+
Value,
542+
DoubleQuotedScalar("\uD83D\uDC4D"), // Thumbs-up emoji (U+1F44D), written with \u escapes instead of a literal character because, per the original author, Windows Terminal does not display it correctly.
543+
BlockEnd,
544+
StreamEnd);
545+
}
546+
533547
private void AssertPartialSequenceOfTokensFrom(Scanner scanner, params Token[] tokens)
534548
{
535549
var tokenNumber = 1;

YamlDotNet/Core/Scanner.cs

Lines changed: 56 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1937,19 +1937,68 @@ private Scalar ScanFlowScalar(bool isSingleQuoted)
19371937

19381938
// Check the value and write the character.
19391939

1940-
if ((character >= 0xD800 && character <= 0xDFFF) || character > 0x10FFFF)
1940+
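// Check for a surrogate code unit (U+D800-U+DFFF); it must be followed by a second \u or \U escape so the two can be combined into one code point (a UTF-16 surrogate pair).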
1941+
if (character >= 0xD800 && character <= 0xDFFF)
1942+
{
1943+
for (var k = 0; k < codeLength; ++k)
1944+
{
1945+
Skip();
1946+
}
1947+
1948+
if (analyzer.Peek(0) == '\\' &&
1949+
(analyzer.Peek(1) == 'u' || analyzer.Peek(1) == 'U'))
1950+
{
1951+
Skip(); //escape character
1952+
if (analyzer.Peek(0) == 'u')
1953+
{
1954+
codeLength = 4;
1955+
}
1956+
else
1957+
{
1958+
codeLength = 8;
1959+
}
1960+
Skip(); //escape code
1961+
1962+
var lowSurrogate = 0;
1963+
1964+
// Scan the character value.
1965+
for (var k = 0; k < codeLength; ++k)
1966+
{
1967+
if (!analyzer.IsHex(0))
1968+
{
1969+
throw new SyntaxErrorException(start, cursor.Mark(), "While scanning a quoted scalar, did not find expected hexadecimal number.");
1970+
}
1971+
lowSurrogate = ((lowSurrogate << 4) + analyzer.AsHex(k));
1972+
}
1973+
1974+
for (var k = 0; k < codeLength; ++k)
1975+
{
1976+
Skip();
1977+
}
1978+
1979+
character = char.ConvertToUtf32((char)character, (char)lowSurrogate);
1980+
}
1981+
else
1982+
{
1983+
throw new SyntaxErrorException(start, cursor.Mark(), "While scanning a quoted scalar, found invalid Unicode surrogates.");
1984+
}
1985+
}
1986+
else if (character > 0x10FFFF)
19411987
{
19421988
throw new SyntaxErrorException(start, cursor.Mark(), "While scanning a quoted scalar, found invalid Unicode character escape code.");
19431989
}
1990+
else
1991+
{
1992+
// Advance the pointer.
19441993

1945-
value.Append(char.ConvertFromUtf32(character));
1946-
1947-
// Advance the pointer.
1994+
for (var k = 0; k < codeLength; ++k)
1995+
{
1996+
Skip();
1997+
}
19481998

1949-
for (var k = 0; k < codeLength; ++k)
1950-
{
1951-
Skip();
19521999
}
2000+
2001+
value.Append(char.ConvertFromUtf32(character));
19532002
}
19542003
}
19552004
else

0 commit comments

Comments
 (0)