# HG changeset patch # User Simon Montagu # Date 1334833171 -10800 # Node ID 3b8a84e1a1035391f062607d9754866e632a6101 # Parent 2f0d8ab5b3cb0b75dbb91f18939a1fcc37e95762 [mq]: 746900 diff --git a/intl/uconv/src/nsUTF8ToUnicode.cpp b/intl/uconv/src/nsUTF8ToUnicode.cpp --- a/intl/uconv/src/nsUTF8ToUnicode.cpp +++ b/intl/uconv/src/nsUTF8ToUnicode.cpp @@ -249,53 +249,34 @@ NS_IMETHODIMP nsUTF8ToUnicode::Convert(c if (0 == mState) { // When mState is zero we expect either a US-ASCII character or a // multi-octet sequence. if (0 == (0x80 & (*in))) { PRInt32 max_loops = NS_MIN(inend - in, outend - out); Convert_ascii_run(in, out, max_loops); --in; // match the rest of the cases mBytes = 1; - } else if (0xC0 == (0xE0 & (*in))) { - // First octet of 2 octet sequence + } else if (0xC0 == (0xE0 & (*in)) && (unsigned char)*in > 0xC1) { + // First octet of 2 octet sequence (excluding illegal values 0xC0/0xC1) mUcs4 = (PRUint32)(*in); mUcs4 = (mUcs4 & 0x1F) << 6; mState = 1; mBytes = 2; } else if (0xE0 == (0xF0 & (*in))) { // First octet of 3 octet sequence mUcs4 = (PRUint32)(*in); mUcs4 = (mUcs4 & 0x0F) << 12; mState = 2; mBytes = 3; - } else if (0xF0 == (0xF8 & (*in))) { - // First octet of 4 octet sequence + } else if (0xF0 == (0xF8 & (*in)) && (unsigned char)*in < 0xF5) { + // First octet of 4 octet sequence in the legal range 0xF0-0xF4 mUcs4 = (PRUint32)(*in); mUcs4 = (mUcs4 & 0x07) << 18; mState = 3; mBytes = 4; - } else if (0xF8 == (0xFC & (*in))) { - /* First octet of 5 octet sequence. - * - * This is illegal because the encoded codepoint must be either - * (a) not the shortest form or - * (b) outside the Unicode range of 0-0x10FFFF. - * Rather than trying to resynchronize, we will carry on until the end - * of the sequence and let the later error handling code catch it. - */ - mUcs4 = (PRUint32)(*in); - mUcs4 = (mUcs4 & 0x03) << 24; - mState = 4; - mBytes = 5; - } else if (0xFC == (0xFE & (*in))) { - // First octet of 6 octet sequence, see comments for 5 octet sequence. - mUcs4 = (PRUint32)(*in); - mUcs4 = (mUcs4 & 1) << 30; - mState = 5; - mBytes = 6; } else { /* Current octet is neither in the US-ASCII range nor a legal first * octet of a multi-octet sequence. * * Return an error condition. Caller is responsible for flushing and * refilling the buffer and resetting state. */ res = NS_ERROR_ILLEGAL_INPUT; diff --git a/intl/uconv/tests/unit/test_utf8_illegals.js b/intl/uconv/tests/unit/test_utf8_illegals.js --- a/intl/uconv/tests/unit/test_utf8_illegals.js +++ b/intl/uconv/tests/unit/test_utf8_illegals.js @@ -1,18 +1,15 @@ // Tests illegal UTF-8 sequences const Cc = Components.Constructor; const Ci = Components.interfaces; -const inStrings1 = new Array("%c0%af", // long forms of 0x2F - "%e0%80%af", +const inStrings1 = new Array("%e0%80%af", // long forms of 0x2F "%f0%80%80%af", - "%f8%80%80%80%af", - "%fc%80%80%80%80%af", // lone surrogates "%ed%a0%80", // D800 "%ed%ad%bf", // DB7F "%ed%ae%80", // DB80 "%ed%af%bf", // DBFF "%ed%b0%80", // DC00 "%ed%be%80", // DF80 "%ed%bf%bf"); // DFFF @@ -23,16 +20,36 @@ const inStrings2 = new Array("%ed%a0%80% "%ed%ad%bf%ed%b0%80", // DB7F DC00 "%ed%ad%bf%ed%bf%bf", // DB7F DFFF "%ed%ae%80%ed%b0%80", // DB80 DC00 "%ed%ae%80%ed%bf%bf", // DB80 DFFF "%ed%af%bf%ed%b0%80", // DBFF DC00 "%ed%ad%bf%ed%bf%bf"); // DBFF DFFF const expected2 = "ABC\ufffd\ufffdXYZ"; +const inStrings3 = new Array("%c0%af", // Illegal bytes in 2-octet + "%c1%af"); // sequences +const expected3 = "ABC\ufffd\ufffdXYZ"; + +const inStrings4 = new Array("%f5%80%80%80", // Illegal bytes in 4-octet + "%f7%bf%bf%bf"); // sequences +const expected4 = "ABC\ufffd\ufffd\ufffd\ufffdXYZ"; + +const inStrings5 = new Array("%f8%80%80%80%80", // Illegal bytes in 5-octet + "%f8%80%80%80%af", // sequences + "%fb%bf%bf%bf%bf"); +const expected5 = "ABC\ufffd\ufffd\ufffd\ufffd\ufffdXYZ"; + +const inStrings6 = new Array("%fc%80%80%80%80%80", // Illegal bytes in 6-octet + "%fc%80%80%80%80%af", // sequences + "%fd%bf%bf%bf%bf%bf"); +const expected6 = "ABC\ufffd\ufffd\ufffd\ufffd\ufffd\ufffdXYZ"; + + + function testCaseInputStream(inStr, expected) { var dataURI = "data:text/plain; charset=UTF-8,ABC" + inStr + "XYZ" dump(inStr + "==>"); var IOService = Cc("@mozilla.org/network/io-service;1", "nsIIOService"); var ConverterInputStream = @@ -69,9 +86,25 @@ function run_test() { for (var i = 0; i < inStrings1.length; ++i) { var inStr = inStrings1[i]; testCaseInputStream(inStr, expected1); } for (var i = 0; i < inStrings2.length; ++i) { var inStr = inStrings2[i]; testCaseInputStream(inStr, expected2); } + for (var i = 0; i < inStrings3.length; ++i) { + var inStr = inStrings3[i]; + testCaseInputStream(inStr, expected3); + } + for (var i = 0; i < inStrings4.length; ++i) { + var inStr = inStrings4[i]; + testCaseInputStream(inStr, expected4); + } + for (var i = 0; i < inStrings5.length; ++i) { + var inStr = inStrings5[i]; + testCaseInputStream(inStr, expected5); + } + for (var i = 0; i < inStrings6.length; ++i) { + var inStr = inStrings6[i]; + testCaseInputStream(inStr, expected6); + } }