19
19
package name .fraser .neil .plaintext ;
20
20
21
21
import java .io .UnsupportedEncodingException ;
22
+ import java .lang .Character ;
22
23
import java .net .URLDecoder ;
23
24
import java .net .URLEncoder ;
24
25
import java .util .*;
@@ -1293,6 +1294,46 @@ public void diff_cleanupMerge(LinkedList<Diff> diffs) {
1293
1294
}
1294
1295
}
1295
1296
1297
+ /**
1298
+ * Rearrange diff boudnaries that split Unicode surrogate pairs.
1299
+ * @param diffs Linked list of diff objects
1300
+ */
1301
+ public void diff_cleanupSplitSurrogates (List <Diff > diffs ) {
1302
+ char lastEnd = 0 ;
1303
+ boolean isFirst = true ;
1304
+ HashSet <Diff > toRemove = new HashSet <Diff >();
1305
+
1306
+ for (Diff aDiff : diffs ) {
1307
+ if (aDiff .text .isEmpty ()) {
1308
+ toRemove .add (aDiff );
1309
+ continue ;
1310
+ }
1311
+
1312
+ char thisTop = aDiff .text .charAt (0 );
1313
+ char thisEnd = aDiff .text .charAt (aDiff .text .length () - 1 );
1314
+
1315
+ if (Character .isHighSurrogate (thisEnd )) {
1316
+ lastEnd = thisEnd ;
1317
+ aDiff .text = aDiff .text .substring (0 , aDiff .text .length () - 1 );
1318
+ }
1319
+
1320
+ if (!isFirst && Character .isHighSurrogate (lastEnd ) && Character .isLowSurrogate (thisTop )) {
1321
+ aDiff .text = lastEnd + aDiff .text ;
1322
+ }
1323
+
1324
+ isFirst = false ;
1325
+
1326
+ if ( aDiff .text .isEmpty () ) {
1327
+ toRemove .add (aDiff );
1328
+ continue ;
1329
+ }
1330
+ }
1331
+
1332
+ for (Diff aDiff : toRemove ) {
1333
+ diffs .remove (aDiff );
1334
+ }
1335
+ }
1336
+
1296
1337
/**
1297
1338
* loc is a location in text1, compute and return the equivalent location in
1298
1339
* text2.
@@ -1429,6 +1470,7 @@ public int diff_levenshtein(List<Diff> diffs) {
1429
1470
*/
1430
1471
public String diff_toDelta (List <Diff > diffs ) {
1431
1472
StringBuilder text = new StringBuilder ();
1473
+ this .diff_cleanupSplitSurrogates (diffs );
1432
1474
for (Diff aDiff : diffs ) {
1433
1475
switch (aDiff .operation ) {
1434
1476
case INSERT :
@@ -1457,6 +1499,103 @@ public String diff_toDelta(List<Diff> diffs) {
1457
1499
return delta ;
1458
1500
}
1459
1501
1502
+ private int digit16 (char b ) throws IllegalArgumentException {
1503
+ switch (b ) {
1504
+ case '0' : return 0 ;
1505
+ case '1' : return 1 ;
1506
+ case '2' : return 2 ;
1507
+ case '3' : return 3 ;
1508
+ case '4' : return 4 ;
1509
+ case '5' : return 5 ;
1510
+ case '6' : return 6 ;
1511
+ case '7' : return 7 ;
1512
+ case '8' : return 8 ;
1513
+ case '9' : return 9 ;
1514
+ case 'A' : case 'a' : return 10 ;
1515
+ case 'B' : case 'b' : return 11 ;
1516
+ case 'C' : case 'c' : return 12 ;
1517
+ case 'D' : case 'd' : return 13 ;
1518
+ case 'E' : case 'e' : return 14 ;
1519
+ case 'F' : case 'f' : return 15 ;
1520
+ default :
1521
+ throw new IllegalArgumentException ();
1522
+ }
1523
+ }
1524
+
1525
+ private String decodeURI (String text ) throws IllegalArgumentException {
1526
+ int i = 0 ;
1527
+ StringBuilder decoded = new StringBuilder (text .length ());
1528
+
1529
+ while (i < text .length ()) {
1530
+ if (text .charAt (i ) != '%' ) {
1531
+ decoded .append (text .charAt (i ++));
1532
+ continue ;
1533
+ }
1534
+
1535
+ // start a percent-sequence
1536
+ int byte1 = (digit16 (text .charAt (i + 1 )) << 4 ) + digit16 (text .charAt (i + 2 ));
1537
+ if ((byte1 & 0x80 ) == 0 ) {
1538
+ decoded .append (Character .toChars (byte1 ));
1539
+ i += 3 ;
1540
+ continue ;
1541
+ }
1542
+
1543
+ if ( text .charAt (i + 3 ) != '%' ) {
1544
+ throw new IllegalArgumentException ();
1545
+ }
1546
+
1547
+ int byte2 = (digit16 (text .charAt (i + 4 )) << 4 ) + digit16 (text .charAt (i + 5 ));
1548
+ if ((byte2 & 0xC0 ) != 0x80 ) {
1549
+ throw new IllegalArgumentException ();
1550
+ }
1551
+ byte2 = byte2 & 0x3F ;
1552
+ if ((byte1 & 0xE0 ) == 0xC0 ) {
1553
+ decoded .append (Character .toChars (((byte1 & 0x1F ) << 6 ) | byte2 ));
1554
+ i += 6 ;
1555
+ continue ;
1556
+ }
1557
+
1558
+ if (text .charAt (i + 6 ) != '%' ) {
1559
+ throw new IllegalArgumentException ();
1560
+ }
1561
+
1562
+ int byte3 = (digit16 (text .charAt (i + 7 )) << 4 ) + digit16 (text .charAt (i + 8 ));
1563
+ if ((byte3 & 0xC0 ) != 0x80 ) {
1564
+ throw new IllegalArgumentException ();
1565
+ }
1566
+ byte3 = byte3 & 0x3F ;
1567
+ if ((byte1 & 0xF0 ) == 0xE0 ) {
1568
+ // unpaired surrogate are fine here
1569
+ decoded .append (Character .toChars (((byte1 & 0x0F ) << 12 ) | (byte2 << 6 ) | byte3 ));
1570
+ i += 9 ;
1571
+ continue ;
1572
+ }
1573
+
1574
+ if (text .charAt (i + 9 ) != '%' ) {
1575
+ throw new IllegalArgumentException ();
1576
+ }
1577
+
1578
+ int byte4 = (digit16 (text .charAt (i + 10 )) << 4 ) + digit16 (text .charAt (i + 11 ));
1579
+ if ((byte4 & 0xC0 ) != 0x80 ) {
1580
+ throw new IllegalArgumentException ();
1581
+ }
1582
+ byte4 = byte4 & 0x3F ;
1583
+ if ((byte1 & 0xF8 ) == 0xF0 ) {
1584
+ int codePoint = ((byte1 & 0x07 ) << 0x12 ) | (byte2 << 0x0C ) | (byte3 << 0x06 ) | byte4 ;
1585
+ if (codePoint >= 0x010000 && codePoint <= 0x10FFFF ) {
1586
+ decoded .append (Character .toChars ((codePoint & 0xFFFF ) >>> 10 & 0x3FF | 0xD800 ));
1587
+ decoded .append (Character .toChars (0xDC00 | (codePoint & 0xFFFF ) & 0x3FF ));
1588
+ i += 12 ;
1589
+ continue ;
1590
+ }
1591
+ }
1592
+
1593
+ throw new IllegalArgumentException ();
1594
+ }
1595
+
1596
+ return decoded .toString ();
1597
+ }
1598
+
1460
1599
/**
1461
1600
* Given the original text1, and an encoded string which describes the
1462
1601
* operations required to transform text1 into text2, compute the full diff.
@@ -1483,10 +1622,7 @@ public LinkedList<Diff> diff_fromDelta(String text1, String delta)
1483
1622
// decode would change all "+" to " "
1484
1623
param = param .replace ("+" , "%2B" );
1485
1624
try {
1486
- param = URLDecoder .decode (param , "UTF-8" );
1487
- } catch (UnsupportedEncodingException e ) {
1488
- // Not likely on modern system.
1489
- throw new Error ("This system does not support UTF-8." , e );
1625
+ param = this .decodeURI (param );
1490
1626
} catch (IllegalArgumentException e ) {
1491
1627
// Malformed URI sequence.
1492
1628
throw new IllegalArgumentException (
@@ -2269,10 +2405,7 @@ public List<Patch> patch_fromText(String textline)
2269
2405
line = text .getFirst ().substring (1 );
2270
2406
line = line .replace ("+" , "%2B" ); // decode would change all "+" to " "
2271
2407
try {
2272
- line = URLDecoder .decode (line , "UTF-8" );
2273
- } catch (UnsupportedEncodingException e ) {
2274
- // Not likely on modern system.
2275
- throw new Error ("This system does not support UTF-8." , e );
2408
+ line = this .decodeURI (line );
2276
2409
} catch (IllegalArgumentException e ) {
2277
2410
// Malformed URI sequence.
2278
2411
throw new IllegalArgumentException (
0 commit comments