Skip to content

Commit 13309a1

Browse files
committed
Java: Stop breaking surrogate pairs in toDelta()
Resolves google#69 for Java Sometimes we can find a common prefix that runs into the middle of a surrogate pair and we split that pair when building our diff groups. This is fine as long as we are operating on UTF-16 code units. It becomes problematic when we start trying to treat those substrings as valid Unicode (or UTF-8) sequences. When we pass these split groups into `toDelta()` we do just that and the library crashes. In this patch we're post-processing the diff groups before encoding them to make sure that we un-split the surrogate pairs. The post-processed diffs should produce the same output when applying the diffs. The diff string itself will be different but should change that much - only by a single character at surrogate boundaries.
1 parent 143d61d commit 13309a1

File tree

2 files changed

+177
-8
lines changed

2 files changed

+177
-8
lines changed

java/src/name/fraser/neil/plaintext/diff_match_patch.java

Lines changed: 141 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
package name.fraser.neil.plaintext;
2020

2121
import java.io.UnsupportedEncodingException;
22+
import java.lang.Character;
2223
import java.net.URLDecoder;
2324
import java.net.URLEncoder;
2425
import java.util.*;
@@ -1293,6 +1294,46 @@ public void diff_cleanupMerge(LinkedList<Diff> diffs) {
12931294
}
12941295
}
12951296

1297+
/**
1298+
* Rearrange diff boudnaries that split Unicode surrogate pairs.
1299+
* @param diffs Linked list of diff objects
1300+
*/
1301+
public void diff_cleanupSplitSurrogates(List<Diff> diffs) {
1302+
char lastEnd = 0;
1303+
boolean isFirst = true;
1304+
HashSet<Diff> toRemove = new HashSet<Diff>();
1305+
1306+
for (Diff aDiff : diffs) {
1307+
if (aDiff.text.isEmpty()) {
1308+
toRemove.add(aDiff);
1309+
continue;
1310+
}
1311+
1312+
char thisTop = aDiff.text.charAt(0);
1313+
char thisEnd = aDiff.text.charAt(aDiff.text.length() - 1);
1314+
1315+
if (Character.isHighSurrogate(thisEnd)) {
1316+
lastEnd = thisEnd;
1317+
aDiff.text = aDiff.text.substring(0, aDiff.text.length() - 1);
1318+
}
1319+
1320+
if (!isFirst && Character.isHighSurrogate(lastEnd) && Character.isLowSurrogate(thisTop)) {
1321+
aDiff.text = lastEnd + aDiff.text;
1322+
}
1323+
1324+
isFirst = false;
1325+
1326+
if ( aDiff.text.isEmpty() ) {
1327+
toRemove.add(aDiff);
1328+
continue;
1329+
}
1330+
}
1331+
1332+
for (Diff aDiff : toRemove) {
1333+
diffs.remove(aDiff);
1334+
}
1335+
}
1336+
12961337
/**
12971338
* loc is a location in text1, compute and return the equivalent location in
12981339
* text2.
@@ -1429,6 +1470,7 @@ public int diff_levenshtein(List<Diff> diffs) {
14291470
*/
14301471
public String diff_toDelta(List<Diff> diffs) {
14311472
StringBuilder text = new StringBuilder();
1473+
this.diff_cleanupSplitSurrogates(diffs);
14321474
for (Diff aDiff : diffs) {
14331475
switch (aDiff.operation) {
14341476
case INSERT:
@@ -1457,6 +1499,103 @@ public String diff_toDelta(List<Diff> diffs) {
14571499
return delta;
14581500
}
14591501

1502+
private int digit16(char b) throws IllegalArgumentException {
1503+
switch (b) {
1504+
case '0': return 0;
1505+
case '1': return 1;
1506+
case '2': return 2;
1507+
case '3': return 3;
1508+
case '4': return 4;
1509+
case '5': return 5;
1510+
case '6': return 6;
1511+
case '7': return 7;
1512+
case '8': return 8;
1513+
case '9': return 9;
1514+
case 'A': case 'a': return 10;
1515+
case 'B': case 'b': return 11;
1516+
case 'C': case 'c': return 12;
1517+
case 'D': case 'd': return 13;
1518+
case 'E': case 'e': return 14;
1519+
case 'F': case 'f': return 15;
1520+
default:
1521+
throw new IllegalArgumentException();
1522+
}
1523+
}
1524+
1525+
private String decodeURI(String text) throws IllegalArgumentException {
1526+
int i = 0;
1527+
StringBuilder decoded = new StringBuilder(text.length());
1528+
1529+
while (i < text.length()) {
1530+
if (text.charAt(i) != '%') {
1531+
decoded.append(text.charAt(i++));
1532+
continue;
1533+
}
1534+
1535+
// start a percent-sequence
1536+
int byte1 = (digit16(text.charAt(i + 1)) << 4) + digit16(text.charAt(i + 2));
1537+
if ((byte1 & 0x80) == 0) {
1538+
decoded.append(Character.toChars(byte1));
1539+
i += 3;
1540+
continue;
1541+
}
1542+
1543+
if ( text.charAt(i + 3) != '%') {
1544+
throw new IllegalArgumentException();
1545+
}
1546+
1547+
int byte2 = (digit16(text.charAt(i + 4)) << 4) + digit16(text.charAt(i + 5));
1548+
if ((byte2 & 0xC0) != 0x80) {
1549+
throw new IllegalArgumentException();
1550+
}
1551+
byte2 = byte2 & 0x3F;
1552+
if ((byte1 & 0xE0) == 0xC0) {
1553+
decoded.append(Character.toChars(((byte1 & 0x1F) << 6) | byte2));
1554+
i += 6;
1555+
continue;
1556+
}
1557+
1558+
if (text.charAt(i + 6) != '%') {
1559+
throw new IllegalArgumentException();
1560+
}
1561+
1562+
int byte3 = (digit16(text.charAt(i + 7)) << 4) + digit16(text.charAt(i + 8));
1563+
if ((byte3 & 0xC0) != 0x80) {
1564+
throw new IllegalArgumentException();
1565+
}
1566+
byte3 = byte3 & 0x3F;
1567+
if ((byte1 & 0xF0) == 0xE0) {
1568+
// unpaired surrogate are fine here
1569+
decoded.append(Character.toChars(((byte1 & 0x0F) << 12) | (byte2 << 6) | byte3));
1570+
i += 9;
1571+
continue;
1572+
}
1573+
1574+
if (text.charAt(i + 9) != '%') {
1575+
throw new IllegalArgumentException();
1576+
}
1577+
1578+
int byte4 = (digit16(text.charAt(i + 10)) << 4) + digit16(text.charAt(i + 11));
1579+
if ((byte4 & 0xC0) != 0x80) {
1580+
throw new IllegalArgumentException();
1581+
}
1582+
byte4 = byte4 & 0x3F;
1583+
if ((byte1 & 0xF8) == 0xF0) {
1584+
int codePoint = ((byte1 & 0x07) << 0x12) | (byte2 << 0x0C) | (byte3 << 0x06) | byte4;
1585+
if (codePoint >= 0x010000 && codePoint <= 0x10FFFF) {
1586+
decoded.append(Character.toChars((codePoint & 0xFFFF) >>> 10 & 0x3FF | 0xD800));
1587+
decoded.append(Character.toChars(0xDC00 | (codePoint & 0xFFFF) & 0x3FF));
1588+
i += 12;
1589+
continue;
1590+
}
1591+
}
1592+
1593+
throw new IllegalArgumentException();
1594+
}
1595+
1596+
return decoded.toString();
1597+
}
1598+
14601599
/**
14611600
* Given the original text1, and an encoded string which describes the
14621601
* operations required to transform text1 into text2, compute the full diff.
@@ -1483,10 +1622,7 @@ public LinkedList<Diff> diff_fromDelta(String text1, String delta)
14831622
// decode would change all "+" to " "
14841623
param = param.replace("+", "%2B");
14851624
try {
1486-
param = URLDecoder.decode(param, "UTF-8");
1487-
} catch (UnsupportedEncodingException e) {
1488-
// Not likely on modern system.
1489-
throw new Error("This system does not support UTF-8.", e);
1625+
param = this.decodeURI(param);
14901626
} catch (IllegalArgumentException e) {
14911627
// Malformed URI sequence.
14921628
throw new IllegalArgumentException(
@@ -2269,10 +2405,7 @@ public List<Patch> patch_fromText(String textline)
22692405
line = text.getFirst().substring(1);
22702406
line = line.replace("+", "%2B"); // decode would change all "+" to " "
22712407
try {
2272-
line = URLDecoder.decode(line, "UTF-8");
2273-
} catch (UnsupportedEncodingException e) {
2274-
// Not likely on modern system.
2275-
throw new Error("This system does not support UTF-8.", e);
2408+
line = this.decodeURI(line);
22762409
} catch (IllegalArgumentException e) {
22772410
// Malformed URI sequence.
22782411
throw new IllegalArgumentException(

java/tests/name/fraser/neil/plaintext/diff_match_patch_test.java

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -424,6 +424,42 @@ public static void testDiffDelta() {
424424

425425
assertEquals("diff_fromDelta: Unicode.", diffs, dmp.diff_fromDelta(text1, delta));
426426

427+
diffs = diffList(new Diff(EQUAL, "\ud83d\ude4b\ud83d"), new Diff(INSERT, "\ude4c\ud83d"), new Diff(EQUAL, "\ude4b"));
428+
delta = dmp.diff_toDelta(diffs);
429+
assertEquals("diff_toDelta: Surrogate Pairs.", "=2\t+%F0%9F%99%8C\t=2", delta);
430+
431+
assertEquals(
432+
"diff_toDelta: insert surrogate pair between similar high surrogates",
433+
dmp.diff_toDelta(diffList(new Diff(EQUAL, "\ud83c\udd70"), new Diff(INSERT, "\ud83c\udd70"), new Diff(EQUAL, "\ud83c\udd71"))),
434+
dmp.diff_toDelta(diffList(new Diff(EQUAL, "\ud83c\udd70\ud83c"), new Diff(INSERT, "\udd70\ud83c"), new Diff(EQUAL, "\udd71")))
435+
);
436+
437+
assertEquals(
438+
"diff_toDelta: swap surrogate pairs delete/insert",
439+
dmp.diff_toDelta(diffList(new Diff(DELETE, "\ud83c\udd70"), new Diff(INSERT, "\ud83c\udd71"))),
440+
dmp.diff_toDelta(diffList(new Diff(EQUAL, "\ud83c"), new Diff(DELETE, "\udd70"), new Diff(INSERT, "\udd71")))
441+
);
442+
443+
assertEquals(
444+
"diff_toDelta: swap surrogate pairs insert/delete",
445+
dmp.diff_toDelta(diffList(new Diff(INSERT, "\ud83c\udd70"), new Diff(DELETE, "\ud83c\udd71"))),
446+
dmp.diff_toDelta(diffList(new Diff(EQUAL, "\ud83c"), new Diff(INSERT, "\udd70"), new Diff(DELETE, "\udd71")))
447+
);
448+
449+
assertEquals(
450+
"diff_toDelta: empty diff groups",
451+
dmp.diff_toDelta(diffList(new Diff(EQUAL, "abcdef"), new Diff(DELETE, ""), new Diff(INSERT, "ghijk"))),
452+
dmp.diff_toDelta(diffList(new Diff(EQUAL, "abcdef"), new Diff(INSERT, "ghijk")))
453+
);
454+
455+
// Different versions of the library may have created deltas with
456+
// half of a surrogate pair encoded as if it were valid UTF-8
457+
assertEquals(
458+
"diff_toDelta: surrogate half encoded as UTF8",
459+
dmp.diff_toDelta(dmp.diff_fromDelta("\ud83c\udd70", "-2\t+%F0%9F%85%B1")),
460+
dmp.diff_toDelta(dmp.diff_fromDelta("\ud83c\udd70", "=1\t-1\t+%ED%B5%B1"))
461+
);
462+
427463
// Verify pool of unchanged characters.
428464
diffs = diffList(new Diff(INSERT, "A-Z a-z 0-9 - _ . ! ~ * ' ( ) ; / ? : @ & = + $ , # "));
429465
String text2 = dmp.diff_text2(diffs);

0 commit comments

Comments
 (0)